;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;; This file provides implementations of various stdlib builtins that
;; require only SSE version 1 and 2 functionality; it is, in turn,
;; included by stdlib-sse2.ll and stdlib-sse4.ll to provide those
;; definitions for them.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

int8_16(4)
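; (int8_16() is an m4 macro defined in the shared stdlib support code, not
; in this file; from its name, it presumably emits the 8- and 16-bit
; integer operations for the 4-wide target.)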

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
  ; do one N-R iteration to improve precision
  ;   float iv = __rcp_v(v);
  ;   return iv * (2. - v * iv);
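  ; (Newton-Raphson for f(x) = 1/x - v gives the update x' = x * (2 - v*x);
  ; starting from the roughly 12-bit-accurate rcpps estimate, one step
  ; approximately doubles the number of correct bits.)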
  %v_iv = fmul <4 x float> %0, %call
  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
  %iv_mul = fmul <4 x float> %call, %two_minus
  ret <4 x float> %iv_mul
}

define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; do the rcpss call
  %vecval = insertelement <4 x float> undef, float %0, i32 0
  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
  %scall = extractelement <4 x float> %call, i32 0

  ; do one N-R iteration to improve precision, as above
  %v_iv = fmul float %0, %scall
  %two_minus = fsub float 2., %v_iv
  %iv_mul = fmul float %scall, %two_minus
  ret float %iv_mul
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
  ;  float is = __rsqrt_v(v);
  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
  ; Newton-Raphson iteration to improve precision
  ;  return 0.5 * is * (3. - (v * is) * is);
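  ; (Newton-Raphson for f(x) = 1/x^2 - v gives x' = 0.5 * x * (3 - v*x*x),
  ; which refines the roughly 12-bit rsqrtps estimate in the same way as the
  ; reciprocal above.)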
  %v_is = fmul <4 x float> %v, %is
  %v_is_is = fmul <4 x float> %v_is, %is
  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <4 x float> %is, %three_sub
  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <4 x float> %half_scale
}

define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ;  uniform float is = extract(__rsqrt_u(v), 0);
  %v = insertelement <4 x float> undef, float %0, i32 0
  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
  %is = extractelement <4 x float> %vis, i32 0

  ; Newton-Raphson iteration to improve precision
  ;  return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul float %0, %is
  %v_is_is = fmul float %v_is, %is
  %three_sub = fsub float 3., %v_is_is
  %is_mul = fmul float %is, %three_sub
  %half_scale = fmul float 0.5, %is_mul
  ret float %half_scale
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
  ret <4 x float> %call
}
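
; The sse_unary_scalar() and sse_binary_scalar() macros used below come from
; the shared m4 support code.  The expectation (mirroring what
; __rcp_uniform_float() spells out by hand above) is that they expand to an
; insert-into-lane-0 / call-the-*.ss-or-*.sd-intrinsic / extract-lane-0
; sequence; an illustrative sketch for @__sqrt_uniform_float, not the actual
; macro output, would be:
;   %v    = insertelement <4 x float> undef, float %0, i32 0
;   %call = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %v)
;   %ret  = extractelement <4 x float> %call, i32 0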
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
  ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode

declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind

define internal void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
  call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
  %oldval = load i32 *%ptr

  ; turn on DAZ (64)/FTZ (32768) -> 32832
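  ; (In the MXCSR control register, DAZ ("denormals are zero") is bit 6,
  ; i.e. 1 << 6 = 64, and FTZ ("flush to zero") is bit 15, i.e.
  ; 1 << 15 = 32768; OR'ing both into the saved value gives
  ; 64 + 32768 = 32832 = 0x8040.)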
  %update = or i32 %oldval, 32832
  store i32 %update, i32 *%ptr
  call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
  ret void
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
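
; The __svml_*f4 functions above are entry points into Intel's SVML (Short
; Vector Math Library); the wrappers below just adapt them to the names the
; stdlib expects.  For __svml_sincosf4, the wrapper below suggests that the
; pointer argument receives the cosine while the return value is the sine,
; though that calling convention is SVML's, not something defined here.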

define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
  ret <4 x float> %ret
}

define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
  ret <4 x float> %ret
}

define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
  store <4 x float> %s, <4 x float> * %1
  ret void
}

define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
  ret <4 x float> %ret
}

define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
  ret <4 x float> %ret
}

define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %ret
}

define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
  ret <4 x float> %ret
}

define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
  ret <4 x float> %ret
}

define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %call
}

define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
  ret float %ret
}

define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %call
}

define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
  ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
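
; (movmskps packs the sign bit of each of the four float lanes into the low
; four bits of an i32.  The i32 mask vector holds all-ones or all-zeros per
; lane, so bitcasting it to <4 x float> and taking movmskps yields one bit
; per active lane.)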

define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <4 x i32> %0 to <4 x float>
  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
  ret i32 %v
}

define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
  reduce4(float, @__min_varying_float, @__min_uniform_float)
}

define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
  reduce4(float, @__max_varying_float, @__max_uniform_float)
}

define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
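  ; Horizontal add: shuffle lanes {2,3} down on top of lanes {0,1} and add,
  ; giving partial sums {v0+v2, v1+v3, ...}; then extract the two low lanes
  ; and add them to get (v0+v2) + (v1+v3).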
  %v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
                      <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %m1 = add <4 x i32> %v1, %v
  %m1a = extractelement <4 x i32> %m1, i32 0
  %m1b = extractelement <4 x i32> %m1, i32 1
  %sum = add i32 %m1a, %m1b
  ret i32 %sum
}

define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}

define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}

define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
  %r = call i32 @__reduce_add_int32(<4 x i32> %v)
  ret i32 %r
}

define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
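
; The per_lane() macro comes from the shared m4 support code: it emits the
; quoted snippet once per lane, substituting the LANE token (and the ID
; suffixes on the value names) with the lane number, and is expected to
; guard each copy with a test of that lane's mask bit so that inactive
; lanes are never stored to.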

define void @__masked_store_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32>) nounwind alwaysinline {
  per_lane(4, <4 x i32> %2, `
    ; compute address for this one
    %ptr_ID = getelementptr <4 x i32> * %0, i32 0, i32 LANE
    %storeval_ID = extractelement <4 x i32> %1, i32 LANE
    store i32 %storeval_ID, i32 * %ptr_ID')
  ret void
}

define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounwind alwaysinline {
  per_lane(4, <4 x i32> %2, `
    %ptr_ID = getelementptr <4 x i64> * %0, i32 0, i32 LANE
    %storeval_ID = extractelement <4 x i64> %1, i32 LANE
    store i64 %storeval_ID, i64 * %ptr_ID')
  ret void
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

define <4 x i32> @__load_and_broadcast_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
  ; must not load if the mask is all off; the address may be invalid
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip

load:
  %ptr = bitcast i8 * %0 to i32 *
  %val = load i32 * %ptr

  %ret0 = insertelement <4 x i32> undef, i32 %val, i32 0
  %ret1 = insertelement <4 x i32> %ret0, i32 %val, i32 1
  %ret2 = insertelement <4 x i32> %ret1, i32 %val, i32 2
  %ret3 = insertelement <4 x i32> %ret2, i32 %val, i32 3
  ret <4 x i32> %ret3

skip:
  ret <4 x i32> undef
}

define <4 x i64> @__load_and_broadcast_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
  ; must not load if the mask is all off; the address may be invalid
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip

load:
  %ptr = bitcast i8 * %0 to i64 *
  %val = load i64 * %ptr

  %ret0 = insertelement <4 x i64> undef, i64 %val, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %val, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %val, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %val, i32 3
  ret <4 x i64> %ret3

skip:
  ret <4 x i64> undef
}

define <4 x i32> @__load_masked_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip

load:
  ; if any mask lane is on, just load all of the values
  ; FIXME: there is a lurking bug here: if the vector straddles a page
  ; boundary, the next page may be invalid to read, even though the mask
  ; bits are set so that we aren't supposed to be reading those elements...
  %ptr = bitcast i8 * %0 to <4 x i32> *
  %val = load <4 x i32> * %ptr, align 4
  ret <4 x i32> %val

skip:
  ret <4 x i32> undef
}

define <4 x i64> @__load_masked_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip

load:
  ; if any mask lane is on, just load all of the values
  ; FIXME: there is a lurking bug here: if the vector straddles a page
  ; boundary, the next page may be invalid to read, even though the mask
  ; bits are set so that we aren't supposed to be reading those elements...
  %ptr = bitcast i8 * %0 to <4 x i64> *
  %val = load <4 x i64> * %ptr, align 8
  ret <4 x i64> %val

skip:
  ret <4 x i64> undef
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

; define these with the macros from stdlib.m4
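; (SSE has no gather/scatter instructions, so gen_gather()/gen_scatter() are
; expected to expand to code that walks the active lanes and performs a
; scalar load or store for each one.)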

gen_gather(4, i32)
gen_gather(4, i64)
gen_scatter(4, i32)
gen_scatter(4, i64)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
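
; unary2to4() and binary2to4() below are m4 helpers that presumably apply a
; 2-wide SSE2 intrinsic to a 4-wide vector by splitting it into two
; <2 x double> halves, invoking the intrinsic on each half, and reassembling
; the <4 x double> result.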

define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret <4 x double> %ret
}

define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
  ret double %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret <4 x double> %ret
}

define internal double @__min_uniform_double(double, double) nounwind readnone {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
  ret double %ret
}

define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <4 x double> %ret
}

define internal double @__max_uniform_double(double, double) nounwind readnone {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
  ret double %ret
}