diff --git a/Makefile b/Makefile index 4cb9a43b..4a948983 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \ util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \ +BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \ builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll BISON_SRC=parse.yy FLEX_SRC=lex.ll @@ -111,7 +111,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll +objs/builtins-%.cpp: builtins-%.ll @echo Creating C++ source from builtin definitions file $< @m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@ @@ -142,3 +142,10 @@ objs/stdlib_ispc.cpp: stdlib.ispc objs/stdlib_ispc.o: objs/stdlib_ispc.cpp @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< + +objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll +objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll +objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll +objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll +objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll +objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll diff --git a/builtins-avx-common.ll b/builtins-avx-common.ll index 49aa8664..92a077fc 100644 --- a/builtins-avx-common.ll +++ b/builtins-avx-common.ll @@ -30,11 +30,7 @@ ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; *** Untested *** AVX target implementation. -;; -;; The LLVM AVX code generator is incomplete, so the ispc AVX target -;; hasn't yet been tested. There is therefore a higher-than-normal -;; chance that there are bugs in the code in this file. +;; AVX target implementation. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse.ll b/builtins-sse.ll deleted file mode 100644 index a29e8437..00000000 --- a/builtins-sse.ll +++ /dev/null @@ -1,417 +0,0 @@ -;; Copyright (c) 2010-2011, Intel Corporation -;; All rights reserved. -;; -;; Redistribution and use in source and binary forms, with or without -;; modification, are permitted provided that the following conditions are -;; met: -;; -;; * Redistributions of source code must retain the above copyright -;; notice, this list of conditions and the following disclaimer. -;; -;; * Redistributions in binary form must reproduce the above copyright -;; notice, this list of conditions and the following disclaimer in the -;; documentation and/or other materials provided with the distribution. -;; -;; * Neither the name of Intel Corporation nor the names of its -;; contributors may be used to endorse or promote products derived from -;; this software without specific prior written permission. -;; -;; -;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER -;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -;; This file declares implementations of various stdlib builtins that -;; only require SSE version 1 and 2 functionality; this file, in turn -;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide -;; those definitions for them. - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -int64minmax(4) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; rcp - -declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone - -define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline { - %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) - ; do one N-R iteration to improve precision - ; float iv = __rcp_v(v); - ; return iv * (2. - v * iv); - %v_iv = fmul <4 x float> %0, %call - %two_minus = fsub <4 x float> , %v_iv - %iv_mul = fmul <4 x float> %call, %two_minus - ret <4 x float> %iv_mul -} - -define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline { - ; do the rcpss call - %vecval = insertelement <4 x float> undef, float %0, i32 0 - %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) - %scall = extractelement <4 x float> %call, i32 0 - - ; do one N-R iteration to improve precision, as above - %v_iv = fmul float %0, %scall - %two_minus = fsub float 2., %v_iv - %iv_mul = fmul float %scall, %two_minus - ret float %iv_mul -} - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; rsqrt - -declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone - -define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline { - ; float is = __rsqrt_v(v); - %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v) - ; Newton-Raphson iteration to improve precision - ; return 0.5 * is * (3. - (v * is) * is); - %v_is = fmul <4 x float> %v, %is - %v_is_is = fmul <4 x float> %v_is, %is - %three_sub = fsub <4 x float> , %v_is_is - %is_mul = fmul <4 x float> %is, %three_sub - %half_scale = fmul <4 x float> , %is_mul - ret <4 x float> %half_scale -} - -define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { - ; uniform float is = extract(__rsqrt_u(v), 0); - %v = insertelement <4 x float> undef, float %0, i32 0 - %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v) - %is = extractelement <4 x float> %vis, i32 0 - - ; Newton-Raphson iteration to improve precision - ; return 0.5 * is * (3. 
- (v * is) * is); - %v_is = fmul float %0, %is - %v_is_is = fmul float %v_is, %is - %three_sub = fsub float 3., %v_is_is - %is_mul = fmul float %is, %three_sub - %half_scale = fmul float 0.5, %is_mul - ret float %half_scale -} - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; sqrt - -declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone - -define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline { - %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0) - ret <4 x float> %call -} - -define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { - sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0) - ret float %ret -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; fast math mode - -declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind -declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind - -define internal void @__fastmath() nounwind alwaysinline { - %ptr = alloca i32 - %ptr8 = bitcast i32 * %ptr to i8 * - call void @llvm.x86.sse.stmxcsr(i8 * %ptr8) - %oldval = load i32 *%ptr - - ; turn on DAZ (64)/FTZ (32768) -> 32832 - %update = or i32 %oldval, 32832 - store i32 %update, i32 *%ptr - call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8) - ret void -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> 
@__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; float min/max - -declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone - -define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { - %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1) - ret <4 x float> %call -} - -define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1) - ret float %ret -} - -define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { - %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1) - ret <4 x float> %call -} - -define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1) - ret float %ret -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision sqrt - -declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone - -define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { - unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0) - ret <4 x double> %ret -} - - -define internal double @__sqrt_uniform_double(double) nounwind alwaysinline { - sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) - ret double %ret -} - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision min/max - -declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone - -define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone { - binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1) - ret <4 x double> %ret -} - - -define internal double @__min_uniform_double(double, double) nounwind readnone { - sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) - ret double %ret -} - - -define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone { - binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1) - ret <4 x double> %ret -} - - -define internal double @__max_uniform_double(double, double) nounwind readnone { - sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) - ret double %ret -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; horizontal ops / reductions - -declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone - -define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { - 
%floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone - ret i32 %v -} - -define internal float @__reduce_min_float(<4 x float>) nounwind readnone { - reduce4(float, @__min_varying_float, @__min_uniform_float) -} - -define internal float @__reduce_max_float(<4 x float>) nounwind readnone { - reduce4(float, @__max_varying_float, @__max_uniform_float) -} - -define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone { - %v1 = shufflevector <4 x i32> %v, <4 x i32> undef, - <4 x i32> - %m1 = add <4 x i32> %v1, %v - %m1a = extractelement <4 x i32> %m1, i32 0 - %m1b = extractelement <4 x i32> %m1, i32 1 - %sum = add i32 %m1a, %m1b - ret i32 %sum -} - -define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone { - reduce4(i32, @__min_varying_int32, @__min_uniform_int32) -} - -define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone { - reduce4(i32, @__max_varying_int32, @__max_uniform_int32) -} - -define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone { - %r = call i32 @__reduce_add_int32(<4 x i32> %v) - ret i32 %r -} - -define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone { - reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32) -} - -define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone { - reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32) - } - - -define internal double @__reduce_add_double(<4 x double>) nounwind readnone { - %v0 = shufflevector <4 x double> %0, <4 x double> undef, - <2 x i32> - %v1 = shufflevector <4 x double> %0, <4 x double> undef, - <2 x i32> - %sum = fadd <2 x double> %v0, %v1 - %e0 = extractelement <2 x double> %sum, i32 0 - %e1 = extractelement <2 x double> %sum, i32 1 - %m = fadd double %e0, %e1 - ret double %m -} - -define internal double @__reduce_min_double(<4 x double>) nounwind readnone { - reduce4(double, @__min_varying_double, @__min_uniform_double) -} - -define internal double @__reduce_max_double(<4 x double>) nounwind readnone { - reduce4(double, @__max_varying_double, @__max_uniform_double) -} - -define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone { - %v0 = shufflevector <4 x i64> %0, <4 x i64> undef, - <2 x i32> - %v1 = shufflevector <4 x i64> %0, <4 x i64> undef, - <2 x i32> - %sum = add <2 x i64> %v0, %v1 - %e0 = extractelement <2 x i64> %sum, i32 0 - %e1 = extractelement <2 x i64> %sum, i32 1 - %m = add i64 %e0, %e1 - ret i64 %m -} - -define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone { - reduce4(i64, @__min_varying_int64, @__min_uniform_int64) -} - -define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone { - reduce4(i64, @__max_varying_int64, @__max_uniform_int64) -} - -define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { - reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64) -} - -define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { - reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) -} - -reduce_equal(4) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; masked store - -masked_store_blend_8_16_by_4() - -gen_masked_store(4, i8, 8) -gen_masked_store(4, i16, 16) -gen_masked_store(4, i32, 32) -gen_masked_store(4, i64, 64) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts - -load_and_broadcast(4, i8, 8) -load_and_broadcast(4, i16, 16) -load_and_broadcast(4, i32, 32) -load_and_broadcast(4, 
i64, 64) - -load_masked(4, i8, 8, 1) -load_masked(4, i16, 16, 2) -load_masked(4, i32, 32, 4) -load_masked(4, i64, 64, 8) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather/scatter - -; define these with the macros from stdlib.m4 - -gen_gather(4, i8) -gen_gather(4, i16) -gen_gather(4, i32) -gen_gather(4, i64) - -gen_scatter(4, i8) -gen_scatter(4, i16) -gen_scatter(4, i32) -gen_scatter(4, i64) diff --git a/builtins-sse2-common.ll b/builtins-sse2-common.ll new file mode 100644 index 00000000..7e94f5a8 --- /dev/null +++ b/builtins-sse2-common.ll @@ -0,0 +1,266 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + +define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline { + ; do the rcpss call + %vecval = insertelement <4 x float> undef, float %0, i32 0 + %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) + %scall = extractelement <4 x float> %call, i32 0 + + ; do one N-R iteration to improve precision, as above + %v_iv = fmul float %0, %scall + %two_minus = fsub float 2., %v_iv + %iv_mul = fmul float %scall, %two_minus + ret float %iv_mul +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone + +define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { + ; uniform float is = extract(__rsqrt_u(v), 0); + %v = insertelement <4 x float> undef, float %0, i32 0 + %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v) + %is = extractelement <4 x float> %vis, i32 0 + + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul float %0, %is + %v_is_is = fmul float %v_is, %is + %three_sub = fsub float 3., %v_is_is + %is_mul = fmul float %is, %three_sub + %half_scale = fmul float 0.5, %is_mul + ret float %half_scale +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone + + +define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { + sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0) + ret float %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; fast math mode + +declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind +declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind + +define internal void @__fastmath() nounwind alwaysinline { + %ptr = alloca i32 + %ptr8 = bitcast i32 * %ptr to i8 * + call void @llvm.x86.sse.stmxcsr(i8 * %ptr8) + %oldval = load i32 *%ptr + + ; turn on DAZ (64)/FTZ (32768) -> 32832 + %update = or i32 %oldval, 32832 + store i32 %update, i32 *%ptr + call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8) + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone + +define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline { + sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1) + ret float %ret +} + + +define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline { + sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1) + ret float %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +define internal double @__sqrt_uniform_double(double) nounwind alwaysinline { + sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) + ret double %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone + +define internal double @__min_uniform_double(double, double) nounwind readnone { + sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) + ret double %ret +} + +define internal double @__max_uniform_double(double, double) nounwind readnone { + sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) + ret double %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding +;; +;; There are not any rounding instructions in SSE2, so we have to emulate +;; the functionality with multiple instructions... + +; The code for __round_* is the result of compiling the following source +; code. 
+; +; export float Round(float x) { +; unsigned int sign = signbits(x); +; unsigned int ix = intbits(x); +; ix ^= sign; +; x = floatbits(ix); +; x += 0x1.0p23f; +; x -= 0x1.0p23f; +; ix = intbits(x); +; ix ^= sign; +; x = floatbits(ix); +; return x; +;} + +define internal float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +;; Similarly, for implementations of the __floor* functions below, we have the +;; bitcode from compiling the following source code... + +;export float Floor(float x) { +; float y = Round(x); +; unsigned int cmp = y > x ? 0xffffffff : 0; +; float delta = -1.f; +; unsigned int idelta = intbits(delta); +; idelta &= cmp; +; delta = floatbits(idelta); +; return y + delta; +;} + +define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; And here is the code we compiled to get the __ceil* functions below +; +;export uniform float Ceil(uniform float x) { +; uniform float y = Round(x); +; uniform int yltx = y < x ? 
0xffffffff : 0; +; uniform float delta = 1.f; +; uniform int idelta = intbits(delta); +; idelta &= yltx; +; delta = floatbits(idelta); +; return y + delta; +;} + +define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare double @round(double) +declare double @floor(double) +declare double @ceil(double) + +define internal double @__round_uniform_double(double) nounwind readonly alwaysinline { + %r = call double @round(double %0) + ret double %r +} + +define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline { + %r = call double @floor(double %0) + ret double %r +} + +define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline { + %r = call double @ceil(double %0) + ret double %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.ctpop.i32(i32) +declare i64 @llvm.ctpop.i64(i64) + +define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { + %val = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %val +} + +define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { + %val = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %val +} + + diff --git a/builtins-sse2-x2.ll b/builtins-sse2-x2.ll new file mode 100644 index 00000000..ef5053d2 --- /dev/null +++ b/builtins-sse2-x2.ll @@ -0,0 +1,631 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;; This file defines the target for "double-pumped" SSE2, i.e. 
running +;; with 8-wide vectors + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; standard 8-wide definitions from m4 macros + +stdlib_core(8) +packed_load_and_store(8) +scans(8) +int64minmax(8) + +include(`builtins-sse2-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define internal <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_sinf4, %0) + ret <8 x float> %ret +} + +define internal <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_cosf4, %0) + ret <8 x float> %ret +} + +define internal void @__svml_sincos(<8 x float>, <8 x float> *, + <8 x float> *) nounwind readnone alwaysinline { + ; call svml_sincosf4 two times with the two 4-wide sub-vectors + %a = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + %b = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + + %cospa = alloca <4 x float> + %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) + + %cospb = alloca <4 x float> + %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) + + %sin = shufflevector <4 x float> %sa, <4 x float> %sb, + <8 x i32> + store <8 x float> %sin, <8 x float> * %1 + + %cosa = load <4 x float> * %cospa + %cosb = load 
<4 x float> * %cospb + %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, + <8 x i32> + store <8 x float> %cos, <8 x float> * %2 + + ret void +} + +define internal <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_tanf4, %0) + ret <8 x float> %ret +} + +define internal <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_atanf4, %0) + ret <8 x float> %ret +} + +define internal <8 x float> @__svml_atan2(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_atan2f4, %0, %1) + ret <8 x float> %ret +} + +define internal <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_expf4, %0) + ret <8 x float> %ret +} + +define internal <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_logf4, %0) + ret <8 x float> %ret +} + +define internal <8 x float> @__svml_pow(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_powf4, %0, %1) + ret <8 x float> %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +; There is no blend instruction with SSE2, so we simulate it with bit +; operations on i32s. For these two vselect functions, for each +; vector element, if the mask is on, we return the corresponding value +; from %1, and otherwise return the value from %0. + +define internal <8 x i32> @__vselect_i32(<8 x i32>, <8 x i32> , + <8 x i32> %mask) nounwind readnone alwaysinline { + %notmask = xor <8 x i32> %mask, + %cleared_old = and <8 x i32> %0, %notmask + %masked_new = and <8 x i32> %1, %mask + %new = or <8 x i32> %cleared_old, %masked_new + ret <8 x i32> %new +} + +define internal <8 x float> @__vselect_float(<8 x float>, <8 x float>, + <8 x i32> %mask) nounwind readnone alwaysinline { + %v0 = bitcast <8 x float> %0 to <8 x i32> + %v1 = bitcast <8 x float> %1 to <8 x i32> + %r = call <8 x i32> @__vselect_i32(<8 x i32> %v0, <8 x i32> %v1, <8 x i32> %mask) + %rf = bitcast <8 x i32> %r to <8 x float> + ret <8 x float> %rf +} + + +; To do vector integer min and max, we do the vector compare and then sign +; extend the i1 vector result to an i32 mask. The __vselect does the +; rest... 
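The bitwise select above can be read one lane at a time: with no blend instruction in SSE2, a lane is chosen by AND/OR against an all-ones or all-zeros 32-bit mask, and integer min/max is a compare whose result is sign-extended into such a mask before selecting. A minimal scalar C sketch of that per-lane behavior (the helper names here are illustrative only, not part of the patch):

    #include <stdint.h>

    /* (old & ~mask) | (new & mask): an all-ones mask picks 'newv', all-zeros keeps 'old' */
    static inline uint32_t vselect_lane(uint32_t old, uint32_t newv, uint32_t mask) {
        return (old & ~mask) | (newv & mask);
    }

    /* signed min per lane: compare, widen the boolean to a full 32-bit mask, then select */
    static inline int32_t min_lane_int32(int32_t a, int32_t b) {
        uint32_t mask = (uint32_t)-(int32_t)(a < b);   /* 0xFFFFFFFF when a < b, else 0 */
        return (int32_t)vselect_lane((uint32_t)b, (uint32_t)a, mask);
    }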
+ +define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + %c = icmp slt <8 x i32> %0, %1 + %mask = sext <8 x i1> %c to <8 x i32> + %v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask) + ret <8 x i32> %v +} + +define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp slt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + %c = icmp sgt <8 x i32> %0, %1 + %mask = sext <8 x i1> %c to <8 x i32> + %v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask) + ret <8 x i32> %v +} + +define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp sgt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +; The functions for unsigned ints are similar, just with unsigned +; comparison functions... + +define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + %c = icmp ult <8 x i32> %0, %1 + %mask = sext <8 x i1> %c to <8 x i32> + %v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask) + ret <8 x i32> %v +} + +define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + %c = icmp ult i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + %c = icmp ugt <8 x i32> %0, %1 + %mask = sext <8 x i1> %c to <8 x i32> + %v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask) + ret <8 x i32> %v +} + +define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + %c = icmp ugt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone + +define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { + ; first do two 4-wide movmsk calls + %floatmask = bitcast <8 x i32> %0 to <8 x float> + %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef, + <4 x i32> + %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone + %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef, + <4 x i32> + %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone + + ; and shift the first one over by 4 before ORing it with the value + ; of the second one + %v1s = shl i32 %v1, 4 + %v = or i32 %v0, %v1s + ret i32 %v +} + +define internal <4 x float> @__vec4_add_float(<4 x float> %v0, + <4 x float> %v1) nounwind readnone alwaysinline { + %v = fadd <4 x float> %v0, %v1 + ret <4 x float> %v +} + +define internal float @__add_float(float, float) nounwind readnone alwaysinline { + %v = fadd float %0, %1 + ret float %v +} + +define internal float @__reduce_add_float(<8 x float>) nounwind readnone alwaysinline { + reduce8by4(float, @__vec4_add_float, @__add_float) +} + +define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +; helper function for reduce_add_int32 +define internal <4 x i32> @__vec4_add_int32(<4 x 
i32> %v0, + <4 x i32> %v1) nounwind readnone alwaysinline { + %v = add <4 x i32> %v0, %v1 + ret <4 x i32> %v +} + +; helper function for reduce_add_int32 +define internal i32 @__add_int32(i32, i32) nounwind readnone alwaysinline { + %v = add i32 %0, %1 + ret i32 %v +} + +define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline { + reduce8by4(i32, @__vec4_add_int32, @__add_int32) +} + +define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline { + %r = call i32 @__reduce_add_int32(<8 x i32> %v) + ret i32 %r +} + +define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <4 x double> @__add_varying_double(<4 x double>, + <4 x double>) nounwind readnone alwaysinline { + %r = fadd <4 x double> %0, %1 + ret <4 x double> %r +} + +define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline { + %r = fadd double %0, %1 + ret double %r +} + +define internal double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8by4(double, @__add_varying_double, @__add_uniform_double) +} + +define internal double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define internal double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <4 x i64> @__add_varying_int64(<4 x i64>, + <4 x i64>) nounwind readnone alwaysinline { + %r = add <4 x i64> %0, %1 + ret <4 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +load_and_broadcast(8, i8, 8) +load_and_broadcast(8, i16, 16) +load_and_broadcast(8, i32, 32) +load_and_broadcast(8, i64, 64) + +load_masked(8, i8, 8, 1) +load_masked(8, i16, 16, 2) +load_masked(8, i32, 32, 4) +load_masked(8, i64, 64, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +gen_gather(8, i8) +gen_gather(8, i16) +gen_gather(8, i32) +gen_gather(8, i64) + +gen_scatter(8, i8) +gen_scatter(8, i16) +gen_scatter(8, i32) +gen_scatter(8, i64) + 
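The definitions in this 8-wide target are built the same way as the __movmsk above: split the 8-wide vector into two 4-wide halves, run the 4-wide SSE operation (or macro) on each half, and recombine the partial results. A rough C sketch of just the mask-combining step, assuming a movmsk4()-style helper that returns the four sign bits of one half the way the SSE movmskps instruction does (names invented for illustration):

    #include <stdint.h>

    /* collect the sign bit of each of four lanes into bits 0..3 (movmskps-like behavior) */
    static uint32_t movmsk4(const int32_t *lanes) {
        uint32_t m = 0;
        for (int i = 0; i < 4; ++i)
            m |= ((uint32_t)lanes[i] >> 31) << i;
        return m;
    }

    /* 8-wide mask -> 8-bit result: low half in bits 0..3, high half shifted into bits 4..7 */
    uint32_t movmsk8(const int32_t lanes[8]) {
        uint32_t lo = movmsk4(&lanes[0]);
        uint32_t hi = movmsk4(&lanes[4]);
        return lo | (hi << 4);
    }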
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float rounding + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding +;; +;; There are not any rounding instructions in SSE2, so we have to emulate +;; the functionality with multiple instructions... + +; The code for __round_* is the result of compiling the following source +; code. +; +; export float Round(float x) { +; unsigned int sign = signbits(x); +; unsigned int ix = intbits(x); +; ix ^= sign; +; x = floatbits(ix); +; x += 0x1.0p23f; +; x -= 0x1.0p23f; +; ix = intbits(x); +; ix ^= sign; +; x = floatbits(ix); +; return x; +;} + +define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + %binop21.i = fadd <8 x float> %binop.i, + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +;; Similarly, for implementations of the __floor* functions below, we have the +;; bitcode from compiling the following source code... + +;export float Floor(float x) { +; float y = Round(x); +; unsigned int cmp = y > x ? 0xffffffff : 0; +; float delta = -1.f; +; unsigned int idelta = intbits(delta); +; idelta &= cmp; +; delta = floatbits(idelta); +; return y + delta; +;} + +define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; And here is the code we compiled to get the __ceil* functions below +; +;export uniform float Ceil(uniform float x) { +; uniform float y = Round(x); +; uniform int yltx = y < x ? 
0xffffffff : 0; +; uniform float delta = 1.f; +; uniform int idelta = intbits(delta); +; idelta &= yltx; +; delta = floatbits(idelta); +; return y + delta; +;} + +define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + unary1to8(double, @round) +} + +define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + unary1to8(double, @floor) +} + +define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + unary1to8(double, @ceil) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +gen_masked_store(8, i8, 8) +gen_masked_store(8, i16, 16) +gen_masked_store(8, i32, 32) +gen_masked_store(8, i64, 64) + +masked_store_blend_8_16_by_8() + +define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, + <8 x i32> %mask) nounwind alwaysinline { + %val = load <8 x i32> * %0, align 4 + %newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask) + store <8 x i32> %newval, <8 x i32> * %0, align 4 + ret void +} + +define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, + <8 x i32> %mask) nounwind alwaysinline { + %oldValue = load <8 x i64>* %ptr, align 8 + + ; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values + ; are actually bitcast <2 x i64> values + ; + ; set up the first two 64-bit values + %old0123 = shufflevector <8 x i64> %oldValue, <8 x i64> undef, + <4 x i32> + %old0123f = bitcast <4 x i64> %old0123 to <8 x float> + %new0123 = shufflevector <8 x i64> %new, <8 x i64> undef, + <4 x i32> + %new0123f = bitcast <4 x i64> %new0123 to <8 x float> + ; compute mask--note that the indices are doubled-up + %mask0123 = shufflevector <8 x i32> %mask, <8 x i32> undef, + <8 x i32> + ; and blend the first 4 values + %result0123f = call <8 x float> @__vselect_float(<8 x float> %old0123f, <8 x float> %new0123f, + <8 x i32> %mask0123) + %result0123 = bitcast <8 x float> %result0123f to <4 x i64> + + ; and again + %old4567 = shufflevector <8 x i64> %oldValue, <8 x i64> undef, + <4 x i32> + %old4567f = bitcast <4 x i64> %old4567 to <8 x float> + %new4567 = shufflevector <8 x i64> %new, <8 x i64> undef, + <4 x i32> + %new4567f = bitcast <4 x i64> %new4567 to <8 x float> + ; compute mask--note that the values are doubled-up + %mask4567 = shufflevector <8 x i32> %mask, <8 x i32> undef, + <8 x i32> + ; and blend the two of the values + %result4567f = call <8 x float> @__vselect_float(<8 x float> %old4567f, <8 x float> %new4567f, + <8 x i32> %mask4567) + %result4567 = bitcast <8 x float> %result4567f to <4 x i64> + + ; reconstruct the final <8 x i64> vector + %final = shufflevector <4 x i64> %result0123, <4 x i64> %result4567, + <8 x i32> + store <8 x i64> %final, <8 x i64> * %ptr, align 8 + ret void +} + 
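The masked-store blends above all follow one read-modify-write pattern: load the whole destination vector, select per lane between the old and the new value using the execution mask, and store the whole vector back (the 64-bit case just performs it as two 32-bit float blends with a doubled-up mask). A small scalar C sketch of the 32-bit case, with invented names, to show the idea:

    #include <stdint.h>
    #include <stddef.h>

    /* ptr: destination lanes; newval: values to store; mask: all-ones where a lane is active */
    void masked_store_blend_32(int32_t *ptr, const int32_t *newval,
                               const uint32_t *mask, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            uint32_t old = (uint32_t)ptr[i];
            /* same (old & ~mask) | (new & mask) select used by __vselect_i32 */
            ptr[i] = (int32_t)((old & ~mask[i]) | ((uint32_t)newval[i] & mask[i]));
        }
    }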
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision float min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} diff --git a/builtins-sse2.ll b/builtins-sse2.ll index 561fb59f..4a75479f 100644 --- a/builtins-sse2.ll +++ b/builtins-sse2.ll @@ -36,9 +36,9 @@ stdlib_core(4) packed_load_and_store(4) scans(4) +int64minmax(4) -; Include the various definitions of things that only require SSE1 and SSE2 -include(`builtins-sse.ll') +include(`builtins-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding @@ -75,19 +75,6 @@ define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonl ret <4 x float> %int_to_float_bitcast.i.i.i } -define internal float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - ;; Similarly, for implementations of the __floor* functions below, we have the ;; bitcode from compiling the following source code... 
@@ -111,16 +98,6 @@ define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonl ret <4 x float> %binop.i } -define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - ;; And here is the code we compiled to get the __ceil* functions below ; ;export uniform float Ceil(uniform float x) { @@ -143,50 +120,21 @@ define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly ret <4 x float> %binop.i } -define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding doubles -declare double @round(double) -declare double @floor(double) -declare double @ceil(double) - define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline { unary1to4(double, @round) } -define internal double @__round_uniform_double(double) nounwind readonly alwaysinline { - %r = call double @round(double %0) - ret double %r -} - define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline { unary1to4(double, @floor) } -define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline { - %r = call double @floor(double %0) - ret double %r -} - define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline { unary1to4(double, @ceil) } -define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline { - %r = call double @ceil(double %0) - ret double %r -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max @@ -277,20 +225,14 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions -declare i32 @llvm.ctpop.i32(i32) -declare i64 @llvm.ctpop.i64(i64) +declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { - %val = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %val +define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i32> %0 to <4 x float> + %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone + ret i32 %v } -define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { - %val = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %val -} - - define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> @@ -301,6 +243,96 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa ret float %sum } +define internal float @__reduce_min_float(<4 x float>) nounwind readnone { 
+ reduce4(float, @__min_varying_float, @__min_uniform_float) +} + +define internal float @__reduce_max_float(<4 x float>) nounwind readnone { + reduce4(float, @__max_varying_float, @__max_uniform_float) +} + +define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone { + %v1 = shufflevector <4 x i32> %v, <4 x i32> undef, + <4 x i32> + %m1 = add <4 x i32> %v1, %v + %m1a = extractelement <4 x i32> %m1, i32 0 + %m1b = extractelement <4 x i32> %m1, i32 1 + %sum = add i32 %m1a, %m1b + ret i32 %sum +} + +define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone { + reduce4(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone { + reduce4(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone { + %r = call i32 @__reduce_add_int32(<4 x i32> %v) + ret i32 %r +} + +define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone { + reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone { + reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32) + } + + +define internal double @__reduce_add_double(<4 x double>) nounwind readnone { + %v0 = shufflevector <4 x double> %0, <4 x double> undef, + <2 x i32> + %v1 = shufflevector <4 x double> %0, <4 x double> undef, + <2 x i32> + %sum = fadd <2 x double> %v0, %v1 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define internal double @__reduce_min_double(<4 x double>) nounwind readnone { + reduce4(double, @__min_varying_double, @__min_uniform_double) +} + +define internal double @__reduce_max_double(<4 x double>) nounwind readnone { + reduce4(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone { + %v0 = shufflevector <4 x i64> %0, <4 x i64> undef, + <2 x i32> + %v1 = shufflevector <4 x i64> %0, <4 x i64> undef, + <2 x i32> + %sum = add <2 x i64> %v0, %v1 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone { + reduce4(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone { + reduce4(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { + reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { + reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(4) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store @@ -355,3 +387,187 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, ret void } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. 
- v * iv); + %v_iv = fmul <4 x float> %0, %call + %two_minus = fsub <4 x float> , %v_iv + %iv_mul = fmul <4 x float> %call, %two_minus + ret <4 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <4 x float> %v, %is + %v_is_is = fmul <4 x float> %v_is, %is + %three_sub = fsub <4 x float> , %v_is_is + %is_mul = fmul <4 x float> %is, %three_sub + %half_scale = fmul <4 x float> , %is_mul + ret <4 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0) + ret <4 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { + %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) + store <4 x float> %s, <4 x float> * %1 + ret void +} + +define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_expf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_logf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x 
float> @__svml_powf4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call +} + +define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { + unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <4 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone { + binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <4 x double> %ret +} + +define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone { + binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <4 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +masked_store_blend_8_16_by_4() + +gen_masked_store(4, i8, 8) +gen_masked_store(4, i16, 16) +gen_masked_store(4, i32, 32) +gen_masked_store(4, i64, 64) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +load_and_broadcast(4, i8, 8) +load_and_broadcast(4, i16, 16) +load_and_broadcast(4, i32, 32) +load_and_broadcast(4, i64, 64) + +load_masked(4, i8, 8, 1) +load_masked(4, i16, 16, 2) +load_masked(4, i32, 32, 4) +load_masked(4, i64, 64, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather(4, i8) +gen_gather(4, i16) +gen_gather(4, i32) +gen_gather(4, i64) + +gen_scatter(4, i8) +gen_scatter(4, i16) +gen_scatter(4, i32) +gen_scatter(4, i64) diff --git a/builtins-sse4-x2.ll b/builtins-sse4-x2.ll index 536ec3c5..bc7d2a4a 100644 --- a/builtins-sse4-x2.ll +++ b/builtins-sse4-x2.ll @@ -41,11 +41,12 @@ packed_load_and_store(8) scans(8) int64minmax(8) +include(`builtins-sse4-common.ll') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline { ; float iv = __rcp_v(v); @@ -60,25 +61,10 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly ret <8 x float> %iv_mul } -define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline { -; uniform float iv = 
extract(__rcp_u(v), 0); -; return iv * (2. - v * iv); - %vecval = insertelement <4 x float> undef, float %0, i32 0 - %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) - %scall = extractelement <4 x float> %call, i32 0 - - ; do one N-R iteration - %v_iv = fmul float %0, %scall - %two_minus = fsub float 2., %v_iv - %iv_mul = fmul float %scall, %two_minus - ret float %iv_mul -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rsqrt declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline { ; float is = __rsqrt_v(v); @@ -94,56 +80,16 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read ret <8 x float> %half_scale } -define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { - ; uniform float is = extract(__rsqrt_u(v), 0); - %v = insertelement <4 x float> undef, float %0, i32 0 - %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v) - %is = extractelement <4 x float> %vis, i32 0 - - ; return 0.5 * is * (3. - (v * is) * is); - %v_is = fmul float %0, %is - %v_is_is = fmul float %v_is, %is - %three_sub = fsub float 3., %v_is_is - %is_mul = fmul float %is, %three_sub - %half_scale = fmul float 0.5, %is_mul - ret float %half_scale -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; sqrt declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) ret <8 x float> %call } -define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { - sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0) - ret float %ret -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; fast math - -declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind -declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind - -define internal void @__fastmath() nounwind alwaysinline { - %ptr = alloca i32 - %ptr8 = bitcast i32 * %ptr to i8 * - call void @llvm.x86.sse.stmxcsr(i8 * %ptr8) - %oldval = load i32 *%ptr - - ; turn on DAZ (64)/FTZ (32768) -> 32832 - %update = or i32 %oldval, 32832 - store i32 %update, i32 *%ptr - call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8) - ret void -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff @@ -234,85 +180,46 @@ define internal <8 x float> @__svml_pow(<8 x float>, ;; float min/max declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) ret <8 x float> %call } -define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1) - ret float %ret -} - define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind 
readonly alwaysinline { binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) ret <8 x float> %call } -define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1) - ret float %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int32 min/max -declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone -declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone - define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) ret <8 x i32> %call } -define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1) - ret i32 %ret -} - define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) ret <8 x i32> %call } -define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1) - ret i32 %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; unsigned int min/max -declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone -declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone - define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) ret <8 x i32> %call } -define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1) - ret i32 %ret -} - define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) ret <8 x i32> %call } -define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1) - ret i32 %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions @@ -467,126 +374,44 @@ gen_scatter(8, i64) ;; float rounding declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone -declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 round4to8(%0, 8) } -define internal float @__round_uniform_float(float) nounwind readonly alwaysinline { - ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 - ; the roundss intrinsic is a total mess--docs say: - ; - ; __m128 _mm_round_ss (__m128 a, __m128 b, const int c) - ; - ; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function - ; on b0. The higher order 96 bits are copied directly from input parameter a. The - ; return value is described by the following equations: - ; - ; r0 = RND(b0) - ; r1 = a1 - ; r2 = a2 - ; r3 = a3 - ; - ; It doesn't matter what we pass as a, since we only need the r0 value - ; here. So we pass the same register for both. 
- %xi = insertelement <4 x float> undef, float %0, i32 0 - %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) - %rs = extractelement <4 x float> %xr, i32 0 - ret float %rs -} - define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 round4to8(%0, 9) } -define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline { - ; see above for round_ss instrinsic discussion... - %xi = insertelement <4 x float> undef, float %0, i32 0 - ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 - %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) - %rs = extractelement <4 x float> %xr, i32 0 - ret float %rs -} - define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 round4to8(%0, 10) } -define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - ; see above for round_ss instrinsic discussion... - %xi = insertelement <4 x float> undef, float %0, i32 0 - ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 - %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10) - %rs = extractelement <4 x float> %xr, i32 0 - ret float %rs -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding doubles declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone -declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { round2to8double(%0, 8) } -define internal double @__round_uniform_double(double) nounwind readonly alwaysinline { - %xi = insertelement <2 x double> undef, double %0, i32 0 - %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8) - %rs = extractelement <2 x double> %xr, i32 0 - ret double %rs -} - define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 round2to8double(%0, 9) } -define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline { - ; see above for round_ss instrinsic discussion... - %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 - %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) - %rs = extractelement <2 x double> %xr, i32 0 - ret double %rs -} - define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 round2to8double(%0, 10) } -define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline { - ; see above for round_ss instrinsic discussion... 
- %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 - %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) - %rs = extractelement <2 x double> %xr, i32 0 - ret double %rs -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions -declare i32 @llvm.ctpop.i32(i32) nounwind readnone - -define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { - %call = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %call -} - -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { - %call = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %call -} - declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { @@ -718,44 +543,24 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, ;; double precision sqrt declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline { unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) ret <8 x double> %ret } - -define internal double @__sqrt_uniform_double(double) nounwind alwaysinline { - sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0) - ret double %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; double precision float min/max declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) ret <8 x double> %ret } -define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline { - sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1) - ret double %ret -} - define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } - -define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline { - sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1) - ret double %ret - -} diff --git a/builtins-sse4.ll b/builtins-sse4.ll index 710b48d4..145e9c07 100644 --- a/builtins-sse4.ll +++ b/builtins-sse4.ll @@ -36,15 +36,68 @@ stdlib_core(4) packed_load_and_store(4) scans(4) +int64minmax(4) -; Define the stuff that can be done with base SSE1/SSE2 instructions -include(`builtins-sse.ll') +include(`builtins-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return 
iv * (2. - v * iv); + %v_iv = fmul <4 x float> %0, %call + %two_minus = fsub <4 x float> , %v_iv + %iv_mul = fmul <4 x float> %call, %two_minus + ret <4 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <4 x float> %v, %is + %v_is_is = fmul <4 x float> %v_is, %is + %three_sub = fsub <4 x float> , %v_is_is + %is_mul = fmul <4 x float> %is, %three_sub + %half_scale = fmul <4 x float> , %is_mul + ret <4 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0) + ret <4 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { + unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <4 x double> %ret +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone -declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 @@ -52,173 +105,164 @@ define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonl ret <4 x float> %call } -define internal float @__round_uniform_float(float) nounwind readonly alwaysinline { - ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 - ; the roundss intrinsic is a total mess--docs say: - ; - ; __m128 _mm_round_ss (__m128 a, __m128 b, const int c) - ; - ; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function - ; on b0. The higher order 96 bits are copied directly from input parameter a. The - ; return value is described by the following equations: - ; - ; r0 = RND(b0) - ; r1 = a1 - ; r2 = a2 - ; r3 = a3 - ; - ; It doesn't matter what we pass as a, since we only need the r0 value - ; here. So we pass the same register for both. Further, only the 0th - ; element of the b parameter matters - %xi = insertelement <4 x float> undef, float %0, i32 0 - %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) - %rs = extractelement <4 x float> %xr, i32 0 - ret float %rs -} - define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline { ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9) ret <4 x float> %call } -define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline { - ; see above for round_ss instrinsic discussion... 
- %xi = insertelement <4 x float> undef, float %0, i32 0 - ; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9 - %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) - %rs = extractelement <4 x float> %xr, i32 0 - ret float %rs -} - define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline { ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10) ret <4 x float> %call } -define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - ; see above for round_ss instrinsic discussion... - %xi = insertelement <4 x float> undef, float %0, i32 0 - ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 - %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10) - %rs = extractelement <4 x float> %xr, i32 0 - ret float %rs -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding doubles declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone -declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline { round2to4double(%0, 8) } -define internal double @__round_uniform_double(double) nounwind readonly alwaysinline { - %xi = insertelement <2 x double> undef, double %0, i32 0 - %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8) - %rs = extractelement <2 x double> %xr, i32 0 - ret double %rs -} - define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline { ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 round2to4double(%0, 9) } -define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline { - ; see above for round_ss instrinsic discussion... - %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 - %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) - %rs = extractelement <2 x double> %xr, i32 0 - ret double %rs -} - define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline { ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 round2to4double(%0, 10) } -define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline { - ; see above for round_ss instrinsic discussion... 
- %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 - %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) - %rs = extractelement <2 x double> %xr, i32 0 - ret double %rs +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call +} + +define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int32 min/max -declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone -declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone - define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) ret <4 x i32> %call } -define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1) - ret i32 %ret -} - define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) ret <4 x i32> %call } -define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1) - ret i32 %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; unsigned int min/max -declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone -declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone - define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) ret <4 x i32> %call } -define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1) - ret i32 %ret -} - define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) ret <4 x i32> %call } -define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { - sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1) - ret i32 %ret +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone { + binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <4 x double> %ret } +define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind 
readnone { + binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <4 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { + %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) + store <4 x float> %s, <4 x float> * %1 + ret void +} + +define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_expf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_logf4(<4 x float> %0) + ret <4 x float> %ret +} + +define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions -declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { - %call = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %call -} - -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { - %call = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %call +define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i32> %0 to <4 x float> + %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone + ret i32 %v } declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone @@ -230,6 +274,96 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi ret float %scalar } +define internal float @__reduce_min_float(<4 x float>) 
nounwind readnone { + reduce4(float, @__min_varying_float, @__min_uniform_float) +} + +define internal float @__reduce_max_float(<4 x float>) nounwind readnone { + reduce4(float, @__max_varying_float, @__max_uniform_float) +} + +define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone { + %v1 = shufflevector <4 x i32> %v, <4 x i32> undef, + <4 x i32> + %m1 = add <4 x i32> %v1, %v + %m1a = extractelement <4 x i32> %m1, i32 0 + %m1b = extractelement <4 x i32> %m1, i32 1 + %sum = add i32 %m1a, %m1b + ret i32 %sum +} + +define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone { + reduce4(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone { + reduce4(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone { + %r = call i32 @__reduce_add_int32(<4 x i32> %v) + ret i32 %r +} + +define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone { + reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone { + reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32) + } + + +define internal double @__reduce_add_double(<4 x double>) nounwind readnone { + %v0 = shufflevector <4 x double> %0, <4 x double> undef, + <2 x i32> + %v1 = shufflevector <4 x double> %0, <4 x double> undef, + <2 x i32> + %sum = fadd <2 x double> %v0, %v1 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define internal double @__reduce_min_double(<4 x double>) nounwind readnone { + reduce4(double, @__min_varying_double, @__min_uniform_double) +} + +define internal double @__reduce_max_double(<4 x double>) nounwind readnone { + reduce4(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone { + %v0 = shufflevector <4 x i64> %0, <4 x i64> undef, + <2 x i32> + %v1 = shufflevector <4 x i64> %0, <4 x i64> undef, + <2 x i32> + %sum = add <2 x i64> %v0, %v1 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone { + reduce4(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone { + reduce4(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { + reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { + reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(4) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store @@ -298,3 +432,41 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, store <4 x i64> %final, <4 x i64> * %ptr, align 8 ret void } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +masked_store_blend_8_16_by_4() + +gen_masked_store(4, i8, 8) +gen_masked_store(4, i16, 16) +gen_masked_store(4, i32, 32) +gen_masked_store(4, i64, 64) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +load_and_broadcast(4, i8, 8) +load_and_broadcast(4, i16, 16) 
+load_and_broadcast(4, i32, 32) +load_and_broadcast(4, i64, 64) + +load_masked(4, i8, 8, 1) +load_masked(4, i16, 16, 2) +load_masked(4, i32, 32, 4) +load_masked(4, i64, 64, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather(4, i8) +gen_gather(4, i16) +gen_gather(4, i32) +gen_gather(4, i64) + +gen_scatter(4, i8) +gen_scatter(4, i16) +gen_scatter(4, i32) +gen_scatter(4, i64) diff --git a/builtins.cpp b/builtins.cpp index 1b2c4f48..33551632 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -458,8 +458,20 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod case Target::SSE2: extern unsigned char builtins_bitcode_sse2[]; extern int builtins_bitcode_sse2_length; - AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length, - module, symbolTable); + extern unsigned char builtins_bitcode_sse2_x2[]; + extern int builtins_bitcode_sse2_x2_length; + switch (g->target.vectorWidth) { + case 4: + AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length, + module, symbolTable); + break; + case 8: + AddBitcodeToModule(builtins_bitcode_sse2_x2, builtins_bitcode_sse2_x2_length, + module, symbolTable); + break; + default: + FATAL("logic error in DefineStdlib"); + } break; case Target::SSE4: extern unsigned char builtins_bitcode_sse4[]; diff --git a/builtins.m4 b/builtins.m4 index 723dd800..0b39b414 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -182,6 +182,34 @@ define(`unary1to4', ` ret <4 x $1> %ret_3 ') +define(`unary1to8', ` + %v_0 = extractelement <8 x $1> %0, i32 0 + %r_0 = call $1 $2($1 %v_0) + %ret_0 = insertelement <8 x $1> undef, $1 %r_0, i32 0 + %v_1 = extractelement <8 x $1> %0, i32 1 + %r_1 = call $1 $2($1 %v_1) + %ret_1 = insertelement <8 x $1> %ret_0, $1 %r_1, i32 1 + %v_2 = extractelement <8 x $1> %0, i32 2 + %r_2 = call $1 $2($1 %v_2) + %ret_2 = insertelement <8 x $1> %ret_1, $1 %r_2, i32 2 + %v_3 = extractelement <8 x $1> %0, i32 3 + %r_3 = call $1 $2($1 %v_3) + %ret_3 = insertelement <8 x $1> %ret_2, $1 %r_3, i32 3 + %v_4 = extractelement <8 x $1> %0, i32 4 + %r_4 = call $1 $2($1 %v_4) + %ret_4 = insertelement <8 x $1> %ret_3, $1 %r_4, i32 4 + %v_5 = extractelement <8 x $1> %0, i32 5 + %r_5 = call $1 $2($1 %v_5) + %ret_5 = insertelement <8 x $1> %ret_4, $1 %r_5, i32 5 + %v_6 = extractelement <8 x $1> %0, i32 6 + %r_6 = call $1 $2($1 %v_6) + %ret_6 = insertelement <8 x $1> %ret_5, $1 %r_6, i32 6 + %v_7 = extractelement <8 x $1> %0, i32 7 + %r_7 = call $1 $2($1 %v_7) + %ret_7 = insertelement <8 x $1> %ret_6, $1 %r_7, i32 7 + ret <8 x $1> %ret_7 +') + ;; Given a unary function that takes a 2-wide vector and a 4-wide vector ;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide ;; vector, apply it, and return the corresponding 4-wide vector result diff --git a/docs/ispc.txt b/docs/ispc.txt index 43f30cd8..4cfefdd1 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -3213,9 +3213,10 @@ instances. For other workloads, it may lead to a slowdown due to higher register pressure; trying both approaches for key kernels may be worthwhile. -This option is currently only available for the SSE4 and AVX targets, and -is selected with the ``--target=sse4-x2`` and ``--target=avx-x2`` options, -respectively. +This option is only available for each of the SSE2, SSE4 and AVX targets. +It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and +``--target=avx-x2`` options, respectively. 
+ Compiling With Support For Multiple Instruction Sets ---------------------------------------------------- diff --git a/ispc.cpp b/ispc.cpp index 1f928bbc..ae6585c5 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -129,6 +129,12 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->vectorWidth = 4; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; } + else if (!strcasecmp(isa, "sse2-x2")) { + t->isa = Target::SSE2; + t->nativeVectorWidth = 4; + t->vectorWidth = 8; + t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + } else if (!strcasecmp(isa, "sse4")) { t->isa = Target::SSE4; t->nativeVectorWidth = 4; @@ -193,7 +199,7 @@ Target::SupportedTargetArchs() { const char * Target::SupportedTargetISAs() { - return "sse2, sse4, sse4-x2" + return "sse2, sse2-x2, sse4, sse4-x2" #if defined(LLVM_3_0) || defined(LLVM_3_0svn) ", avx, avx-x2" #endif diff --git a/ispc.vcxproj b/ispc.vcxproj index 5d012961..fb56b96c 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -1,4 +1,4 @@ - + @@ -23,6 +23,7 @@ + @@ -87,10 +88,10 @@ Document m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp gen-bitcode-sse4.cpp - builtins.m4;builtins-sse.ll + builtins.m4;builtins-sse4-common.ll m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp gen-bitcode-sse4.cpp - builtins.m4;builtins-sse.ll + builtins.m4;builtins-sse4-common.ll Building gen-bitcode-sse4.cpp Building gen-bitcode-sse4.cpp @@ -113,10 +114,10 @@ Document m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse.ll + builtins.m4;builtins-sse4-common.ll m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse.ll + builtins.m4;builtins-sse4-common.ll Building gen-bitcode-sse4-x2.cpp Building gen-bitcode-sse4-x2.cpp @@ -126,23 +127,36 @@ Document m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse.ll + builtins.m4;builtins-sse2-common.ll m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse.ll + builtins.m4;builtins-sse2-common.ll Building gen-bitcode-sse2.cpp Building gen-bitcode-sse2.cpp + + + Document + m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + gen-bitcode-sse2-x2.cpp + builtins.m4;builtins-sse2-common.ll + m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + gen-bitcode-sse2-x2.cpp + builtins.m4;builtins-sse2-common.ll + Building gen-bitcode-sse2-x2.cpp + Building gen-bitcode-sse2-x2.cpp + + Document m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-sse.ll + builtins.m4;builtins-avx-common.ll m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-sse.ll + builtins.m4;builtins-avx-common.ll Building gen-bitcode-avx.cpp Building gen-bitcode-avx.cpp diff --git a/run_tests.py b/run_tests.py index 87fe3d36..8a1c43bb 100755 --- a/run_tests.py +++ b/run_tests.py @@ -26,7 +26,7 @@ parser.add_option("-s", "--static-exe", dest="static_exe", help="Create and run a regular 
executable for each test (rather than using the LLVM JIT).",
                  default=False, action="store_true")
 parser.add_option('-t', '--target', dest='target',
-                  help='Set compilation target (sse2, sse4, sse4-x2, avx, avx-x2)',
+                  help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2)',
                   default="sse4")
 parser.add_option('-a', '--arch', dest='arch',
                   help='Set architecture (x86, x86-64)',
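
For reference, the __rcp_varying_float and __rsqrt_varying_float builtins defined in this patch refine the SSE estimate instructions with one Newton-Raphson step, following the formulas given in their IR comments (iv * (2. - v * iv) and 0.5 * is * (3. - (v * is) * is)). A minimal scalar C++ sketch of the same refinement is shown below; the *_approx helpers are hypothetical stand-ins for the rcpps/rsqrtps hardware estimates and are not part of the patch itself.

    // Scalar sketch of the one-step Newton-Raphson refinement used by the
    // __rcp_varying_float / __rsqrt_varying_float builtins above.
    #include <cmath>

    float rcp_approx(float v)   { return 1.0f / v; }            // stands in for rcpps
    float rsqrt_approx(float v) { return 1.0f / std::sqrt(v); } // stands in for rsqrtps

    float rcp_refined(float v) {
        float iv = rcp_approx(v);
        // one Newton-Raphson step: iv * (2 - v * iv)
        return iv * (2.0f - v * iv);
    }

    float rsqrt_refined(float v) {
        float is = rsqrt_approx(v);
        // one Newton-Raphson step: 0.5 * is * (3 - (v * is) * is)
        return 0.5f * is * (3.0f - (v * is) * is);
    }

A single step roughly doubles the number of correct bits delivered by the estimate instructions, which is why both the scalar and vector builtins apply exactly one iteration.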