Add "double-wide" sse2-x2 target.

i.e. run 8 program instances together, along the lines of the double-pumped
sse4-x2 target.
This commit is contained in:
Matt Pharr
2011-10-11 15:17:31 -07:00
parent 1198520029
commit 286c23426e
14 changed files with 1543 additions and 806 deletions

View File

@@ -49,7 +49,7 @@ CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
util.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h opt.h stmt.h sym.h type.h util.h
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \ BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
BISON_SRC=parse.yy BISON_SRC=parse.yy
FLEX_SRC=lex.ll FLEX_SRC=lex.ll
@@ -111,7 +111,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
@echo Compiling $< @echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $< @$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll objs/builtins-%.cpp: builtins-%.ll
@echo Creating C++ source from builtin definitions file $< @echo Creating C++ source from builtin definitions file $<
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@ @m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
@@ -142,3 +142,10 @@ objs/stdlib_ispc.cpp: stdlib.ispc
objs/stdlib_ispc.o: objs/stdlib_ispc.cpp objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
@echo Compiling $< @echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $< @$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll
objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll
objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll
objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll
objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll
objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll

View File

@@ -30,11 +30,7 @@
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; *** Untested *** AVX target implementation. ;; AVX target implementation.
;;
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
;; hasn't yet been tested. There is therefore a higher-than-normal
;; chance that there are bugs in the code in this file.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp

View File

@@ -1,417 +0,0 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;; This file declares implementations of various stdlib builtins that
;; only require SSE version 1 and 2 functionality; this file, in turn
;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
;; those definitions for them.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
int64minmax(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration to improve precision, as above
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is
%three_sub = fsub float 3., %v_is_is
%is_mul = fmul float %is, %three_sub
%half_scale = fmul float 0.5, %is_mul
ret float %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
define internal void @__fastmath() nounwind alwaysinline {
%ptr = alloca i32
%ptr8 = bitcast i32 * %ptr to i8 *
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
%oldval = load i32 *%ptr
; turn on DAZ (64)/FTZ (32768) -> 32832
%update = or i32 %oldval, 32832
store i32 %update, i32 *%ptr
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
ret float %ret
}
define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <4 x double> %ret
}
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
}
define internal double @__min_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
define internal double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
}
define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%m1 = add <4 x i32> %v1, %v
%m1a = extractelement <4 x i32> %m1, i32 0
%m1b = extractelement <4 x i32> %m1, i32 1
%sum = add i32 %m1a, %m1b
ret i32 %sum
}
define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}
define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%sum = add <2 x i64> %v0, %v1
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i8, 8, 1)
load_masked(4, i16, 16, 2)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)

266
builtins-sse2-common.ll Normal file
View File

@@ -0,0 +1,266 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration to improve precision, as above
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is
%three_sub = fsub float 3., %v_is_is
%is_mul = fmul float %is, %three_sub
%half_scale = fmul float 0.5, %is_mul
ret float %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
define internal void @__fastmath() nounwind alwaysinline {
%ptr = alloca i32
%ptr8 = bitcast i32 * %ptr to i8 *
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
%oldval = load i32 *%ptr
; turn on DAZ (64)/FTZ (32768) -> 32832
%update = or i32 %oldval, 32832
store i32 %update, i32 *%ptr
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
ret float %ret
}
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define internal double @__min_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define internal double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are not any rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...
; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
; unsigned int sign = signbits(x);
; unsigned int ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; x += 0x1.0p23f;
; x -= 0x1.0p23f;
; ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; return x;
;}
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}
;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...
;export float Floor(float x) {
; float y = Round(x);
; unsigned int cmp = y > x ? 0xffffffff : 0;
; float delta = -1.f;
; unsigned int idelta = intbits(delta);
; idelta &= cmp;
; delta = floatbits(idelta);
; return y + delta;
;}
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
; uniform float y = Round(x);
; uniform int yltx = y < x ? 0xffffffff : 0;
; uniform float delta = 1.f;
; uniform int idelta = intbits(delta);
; idelta &= yltx;
; delta = floatbits(idelta);
; return y + delta;
;}
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare double @round(double)
declare double @floor(double)
declare double @ceil(double)
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @round(double %0)
ret double %r
}
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @floor(double %0)
ret double %r
}
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @ceil(double %0)
ret double %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%val = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %val
}
define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
%val = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %val
}

631
builtins-sse2-x2.ll Normal file
View File

@@ -0,0 +1,631 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;; This file defines the target for "double-pumped" SSE2, i.e. running
;; with 8-wide vectors
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; standard 8-wide definitions from m4 macros
stdlib_core(8)
packed_load_and_store(8)
scans(8)
int64minmax(8)
include(`builtins-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
; do one N-R iteration
%v_iv = fmul <8 x float> %0, %call
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <8 x float> %call, %two_minus
ret <8 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <8 x float> %v, %is
%v_is_is = fmul <8 x float> %v_is, %is
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <8 x float> %is, %three_sub
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <8 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
define internal <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_sinf4, %0)
ret <8 x float> %ret
}
define internal <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_cosf4, %0)
ret <8 x float> %ret
}
define internal void @__svml_sincos(<8 x float>, <8 x float> *,
<8 x float> *) nounwind readnone alwaysinline {
; call svml_sincosf4 two times with the two 4-wide sub-vectors
%a = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%b = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%cospa = alloca <4 x float>
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
%cospb = alloca <4 x float>
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %sin, <8 x float> * %1
%cosa = load <4 x float> * %cospa
%cosb = load <4 x float> * %cospb
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %cos, <8 x float> * %2
ret void
}
define internal <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_tanf4, %0)
ret <8 x float> %ret
}
define internal <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_atanf4, %0)
ret <8 x float> %ret
}
define internal <8 x float> @__svml_atan2(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
ret <8 x float> %ret
}
define internal <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_expf4, %0)
ret <8 x float> %ret
}
define internal <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_logf4, %0)
ret <8 x float> %ret
}
define internal <8 x float> @__svml_pow(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_powf4, %0, %1)
ret <8 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <8 x float> %call
}
define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
; There is no blend instruction with SSE2, so we simulate it with bit
; operations on i32s. For these two vselect functions, for each
; vector element, if the mask is on, we return the corresponding value
; from %1, and otherwise return the value from %0.
define internal <8 x i32> @__vselect_i32(<8 x i32>, <8 x i32> ,
<8 x i32> %mask) nounwind readnone alwaysinline {
%notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%cleared_old = and <8 x i32> %0, %notmask
%masked_new = and <8 x i32> %1, %mask
%new = or <8 x i32> %cleared_old, %masked_new
ret <8 x i32> %new
}
define internal <8 x float> @__vselect_float(<8 x float>, <8 x float>,
<8 x i32> %mask) nounwind readnone alwaysinline {
%v0 = bitcast <8 x float> %0 to <8 x i32>
%v1 = bitcast <8 x float> %1 to <8 x i32>
%r = call <8 x i32> @__vselect_i32(<8 x i32> %v0, <8 x i32> %v1, <8 x i32> %mask)
%rf = bitcast <8 x i32> %r to <8 x float>
ret <8 x float> %rf
}
; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask. The __vselect does the
; rest...
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%c = icmp slt <8 x i32> %0, %1
%mask = sext <8 x i1> %c to <8 x i32>
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
ret <8 x i32> %v
}
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp slt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%c = icmp sgt <8 x i32> %0, %1
%mask = sext <8 x i1> %c to <8 x i32>
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
ret <8 x i32> %v
}
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp sgt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
; The functions for unsigned ints are similar, just with unsigned
; comparison functions...
define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%c = icmp ult <8 x i32> %0, %1
%mask = sext <8 x i1> %c to <8 x i32>
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
ret <8 x i32> %v
}
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ult i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%c = icmp ugt <8 x i32> %0, %1
%mask = sext <8 x i1> %c to <8 x i32>
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
ret <8 x i32> %v
}
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ugt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
ret i32 %v
}
define internal <4 x float> @__vec4_add_float(<4 x float> %v0,
<4 x float> %v1) nounwind readnone alwaysinline {
%v = fadd <4 x float> %v0, %v1
ret <4 x float> %v
}
define internal float @__add_float(float, float) nounwind readnone alwaysinline {
%v = fadd float %0, %1
ret float %v
}
define internal float @__reduce_add_float(<8 x float>) nounwind readnone alwaysinline {
reduce8by4(float, @__vec4_add_float, @__add_float)
}
define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}
define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__max_varying_float, @__max_uniform_float)
}
; helper function for reduce_add_int32
define internal <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
<4 x i32> %v1) nounwind readnone alwaysinline {
%v = add <4 x i32> %v0, %v1
ret <4 x i32> %v
}
; helper function for reduce_add_int32
define internal i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
%v = add i32 %0, %1
ret i32 %v
}
define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
}
define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}
define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal <4 x double> @__add_varying_double(<4 x double>,
<4 x double>) nounwind readnone alwaysinline {
%r = fadd <4 x double> %0, %1
ret <4 x double> %r
}
define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
%r = fadd double %0, %1
ret double %r
}
define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
}
define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
define internal <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
%r = add <4 x i64> %0, %1
ret <4 x i64> %r
}
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
%r = add i64 %0, %1
ret i64 %r
}
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
}
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
load_masked(8, i8, 8, 1)
load_masked(8, i16, 16, 2)
load_masked(8, i32, 32, 4)
load_masked(8, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are not any rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...
; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
; unsigned int sign = signbits(x);
; unsigned int ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; x += 0x1.0p23f;
; x -= 0x1.0p23f;
; ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; return x;
;}
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
%bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <8 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
%bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
ret <8 x float> %int_to_float_bitcast.i.i.i
}
;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...
;export float Floor(float x) {
; float y = Round(x);
; unsigned int cmp = y > x ? 0xffffffff : 0;
; float delta = -1.f;
; unsigned int idelta = intbits(delta);
; idelta &= cmp;
; delta = floatbits(idelta);
; return y + delta;
;}
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}
;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
; uniform float y = Round(x);
; uniform int yltx = y < x ? 0xffffffff : 0;
; uniform float delta = 1.f;
; uniform int idelta = intbits(delta);
; idelta &= yltx;
; delta = floatbits(idelta);
; return y + delta;
;}
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
unary1to8(double, @round)
}
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
unary1to8(double, @floor)
}
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
unary1to8(double, @ceil)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
masked_store_blend_8_16_by_8()
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
%val = load <8 x i32> * %0, align 4
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
store <8 x i32> %newval, <8 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
; are actually bitcast <2 x i64> values
;
; set up the first two 64-bit values
%old0123 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old0123f = bitcast <4 x i64> %old0123 to <8 x float>
%new0123 = shufflevector <8 x i64> %new, <8 x i64> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new0123f = bitcast <4 x i64> %new0123 to <8 x float>
; compute mask--note that the indices are doubled-up
%mask0123 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
; and blend the first 4 values
%result0123f = call <8 x float> @__vselect_float(<8 x float> %old0123f, <8 x float> %new0123f,
<8 x i32> %mask0123)
%result0123 = bitcast <8 x float> %result0123f to <4 x i64>
; and again
%old4567 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old4567f = bitcast <4 x i64> %old4567 to <8 x float>
%new4567 = shufflevector <8 x i64> %new, <8 x i64> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new4567f = bitcast <4 x i64> %new4567 to <8 x float>
; compute mask--note that the values are doubled-up
%mask4567 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; and blend the two of the values
%result4567f = call <8 x float> @__vselect_float(<8 x float> %old4567f, <8 x float> %new4567f,
<8 x i32> %mask4567)
%result4567 = bitcast <8 x float> %result4567f to <4 x i64>
; reconstruct the final <8 x i64> vector
%final = shufflevector <4 x i64> %result0123, <4 x i64> %result4567,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i64> %final, <8 x i64> * %ptr, align 8
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision float min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <8 x double> %ret
}
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}

View File

@@ -36,9 +36,9 @@
stdlib_core(4) stdlib_core(4)
packed_load_and_store(4) packed_load_and_store(4)
scans(4) scans(4)
int64minmax(4)
; Include the various definitions of things that only require SSE1 and SSE2 include(`builtins-sse2-common.ll')
include(`builtins-sse.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding ;; rounding
@@ -75,19 +75,6 @@ define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonl
ret <4 x float> %int_to_float_bitcast.i.i.i ret <4 x float> %int_to_float_bitcast.i.i.i
} }
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}
;; Similarly, for implementations of the __floor* functions below, we have the ;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code... ;; bitcode from compiling the following source code...
@@ -111,16 +98,6 @@ define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonl
ret <4 x float> %binop.i ret <4 x float> %binop.i
} }
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;; And here is the code we compiled to get the __ceil* functions below ;; And here is the code we compiled to get the __ceil* functions below
; ;
;export uniform float Ceil(uniform float x) { ;export uniform float Ceil(uniform float x) {
@@ -143,50 +120,21 @@ define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly
ret <4 x float> %binop.i ret <4 x float> %binop.i
} }
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles ;; rounding doubles
declare double @round(double)
declare double @floor(double)
declare double @ceil(double)
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline { define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @round) unary1to4(double, @round)
} }
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @round(double %0)
ret double %r
}
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline { define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @floor) unary1to4(double, @floor)
} }
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @floor(double %0)
ret double %r
}
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline { define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @ceil) unary1to4(double, @ceil)
} }
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @ceil(double %0)
ret double %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max ;; min/max
@@ -277,20 +225,14 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions ; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
declare i64 @llvm.ctpop.i64(i64)
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%val = call i32 @llvm.ctpop.i32(i32 %0) %floatmask = bitcast <4 x i32> %0 to <4 x float>
ret i32 %val %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
} }
define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
%val = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %val
}
define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef, %v1 = shufflevector <4 x float> %v, <4 x float> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef> <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -301,6 +243,96 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa
ret float %sum ret float %sum
} }
define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%m1 = add <4 x i32> %v1, %v
%m1a = extractelement <4 x i32> %m1, i32 0
%m1b = extractelement <4 x i32> %m1, i32 1
%sum = add i32 %m1a, %m1b
ret i32 %sum
}
define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}
define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%sum = add <2 x i64> %v0, %v1
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store ;; masked store
@@ -355,3 +387,187 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
ret void ret void
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
}
define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i8, 8, 1)
load_masked(4, i16, 16, 2)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)

View File

@@ -41,11 +41,12 @@ packed_load_and_store(8)
scans(8) scans(8)
int64minmax(8) int64minmax(8)
include(`builtins-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline { define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
; float iv = __rcp_v(v); ; float iv = __rcp_v(v);
@@ -60,25 +61,10 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
ret <8 x float> %iv_mul ret <8 x float> %iv_mul
} }
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt ;; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline { define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v); ; float is = __rsqrt_v(v);
@@ -94,56 +80,16 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
ret <8 x float> %half_scale ret <8 x float> %half_scale
} }
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is
%three_sub = fsub float 3., %v_is_is
%is_mul = fmul float %is, %three_sub
%half_scale = fmul float 0.5, %is_mul
ret float %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt ;; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <8 x float> %call ret <8 x float> %call
} }
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
define internal void @__fastmath() nounwind alwaysinline {
%ptr = alloca i32
%ptr8 = bitcast i32 * %ptr to i8 *
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
%oldval = load i32 *%ptr
; turn on DAZ (64)/FTZ (32768) -> 32832
%update = or i32 %oldval, 32832
store i32 %update, i32 *%ptr
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff ; svml stuff
@@ -234,85 +180,46 @@ define internal <8 x float> @__svml_pow(<8 x float>,
;; float min/max ;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <8 x float> %call ret <8 x float> %call
} }
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
ret float %ret
}
define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <8 x float> %call ret <8 x float> %call
} }
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max ;; int32 min/max
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %call ret <8 x i32> %call
} }
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret i32 %ret
}
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %call ret <8 x i32> %call
} }
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret i32 %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max ; unsigned int min/max
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define internal <8 x i32> @__min_varying_uint32(<8 x i32>, define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
<8 x i32>) nounwind readonly alwaysinline { <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %call ret <8 x i32> %call
} }
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
ret i32 %ret
}
define internal <8 x i32> @__max_varying_uint32(<8 x i32>, define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
<8 x i32>) nounwind readonly alwaysinline { <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %call ret <8 x i32> %call
} }
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret i32 %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions ; horizontal ops / reductions
@@ -467,126 +374,44 @@ gen_scatter(8, i64)
;; float rounding ;; float rounding
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
round4to8(%0, 8) round4to8(%0, 8)
} }
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
; the roundss intrinsic is a total mess--docs say:
;
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
;
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
; on b0. The higher order 96 bits are copied directly from input parameter a. The
; return value is described by the following equations:
;
; r0 = RND(b0)
; r1 = a1
; r2 = a2
; r3 = a3
;
; It doesn't matter what we pass as a, since we only need the r0 value
; here. So we pass the same register for both.
%xi = insertelement <4 x float> undef, float %0, i32 0
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to8(%0, 9) round4to8(%0, 9)
} }
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to8(%0, 10) round4to8(%0, 10)
} }
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles ;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 8) round2to8double(%0, 8)
} }
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
%xi = insertelement <2 x double> undef, double %0, i32 0
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round2to8double(%0, 9) round2to8double(%0, 9)
} }
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round2to8double(%0, 10) round2to8double(%0, 10)
} }
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions ; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
@@ -718,44 +543,24 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
;; double precision sqrt ;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline { define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <8 x double> %ret ret <8 x double> %ret
} }
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision float min/max ;; double precision float min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <8 x double> %ret ret <8 x double> %ret
} }
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1)
ret double %ret
}
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret ret <8 x double> %ret
} }
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1)
ret double %ret
}

View File

@@ -36,15 +36,68 @@
stdlib_core(4) stdlib_core(4)
packed_load_and_store(4) packed_load_and_store(4)
scans(4) scans(4)
int64minmax(4)
; Define the stuff that can be done with base SSE1/SSE2 instructions include(`builtins-sse4-common.ll')
include(`builtins-sse.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats ;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
@@ -52,173 +105,164 @@ define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonl
ret <4 x float> %call ret <4 x float> %call
} }
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
; the roundss intrinsic is a total mess--docs say:
;
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
;
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
; on b0. The higher order 96 bits are copied directly from input parameter a. The
; return value is described by the following equations:
;
; r0 = RND(b0)
; r1 = a1
; r2 = a2
; r3 = a3
;
; It doesn't matter what we pass as a, since we only need the r0 value
; here. So we pass the same register for both. Further, only the 0th
; element of the b parameter matters
%xi = insertelement <4 x float> undef, float %0, i32 0
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline { define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9) %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
ret <4 x float> %call ret <4 x float> %call
} }
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline { define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10) %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
ret <4 x float> %call ret <4 x float> %call
} }
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles ;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline { define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
round2to4double(%0, 8) round2to4double(%0, 8)
} }
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
%xi = insertelement <2 x double> undef, double %0, i32 0
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline { define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round2to4double(%0, 9) round2to4double(%0, 9)
} }
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline { define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round2to4double(%0, 10) round2to4double(%0, 10)
} }
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; see above for round_ss instrinsic discussion... ;; float min/max
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max ;; int32 min/max
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call ret <4 x i32> %call
} }
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret i32 %ret
}
define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call ret <4 x i32> %call
} }
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret i32 %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max ; unsigned int min/max
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call ret <4 x i32> %call
} }
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
ret i32 %ret
}
define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call ret <4 x i32> %call
} }
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1) ;; double precision min/max
ret i32 %ret
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
} }
define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions ; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0) %floatmask = bitcast <4 x i32> %0 to <4 x float>
ret i32 %call %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
} ret i32 %v
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
} }
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
@@ -230,6 +274,96 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi
ret float %scalar ret float %scalar
} }
define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%m1 = add <4 x i32> %v1, %v
%m1a = extractelement <4 x i32> %m1, i32 0
%m1b = extractelement <4 x i32> %m1, i32 1
%sum = add i32 %m1a, %m1b
ret i32 %sum
}
define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}
define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%sum = add <2 x i64> %v0, %v1
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store ;; masked store
@@ -298,3 +432,41 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
store <4 x i64> %final, <4 x i64> * %ptr, align 8 store <4 x i64> %final, <4 x i64> * %ptr, align 8
ret void ret void
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i8, 8, 1)
load_masked(4, i16, 16, 2)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)

View File

@@ -458,9 +458,21 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
case Target::SSE2: case Target::SSE2:
extern unsigned char builtins_bitcode_sse2[]; extern unsigned char builtins_bitcode_sse2[];
extern int builtins_bitcode_sse2_length; extern int builtins_bitcode_sse2_length;
extern unsigned char builtins_bitcode_sse2_x2[];
extern int builtins_bitcode_sse2_x2_length;
switch (g->target.vectorWidth) {
case 4:
AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length, AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length,
module, symbolTable); module, symbolTable);
break; break;
case 8:
AddBitcodeToModule(builtins_bitcode_sse2_x2, builtins_bitcode_sse2_x2_length,
module, symbolTable);
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
case Target::SSE4: case Target::SSE4:
extern unsigned char builtins_bitcode_sse4[]; extern unsigned char builtins_bitcode_sse4[];
extern int builtins_bitcode_sse4_length; extern int builtins_bitcode_sse4_length;

View File

@@ -182,6 +182,34 @@ define(`unary1to4', `
ret <4 x $1> %ret_3 ret <4 x $1> %ret_3
') ')
define(`unary1to8', `
%v_0 = extractelement <8 x $1> %0, i32 0
%r_0 = call $1 $2($1 %v_0)
%ret_0 = insertelement <8 x $1> undef, $1 %r_0, i32 0
%v_1 = extractelement <8 x $1> %0, i32 1
%r_1 = call $1 $2($1 %v_1)
%ret_1 = insertelement <8 x $1> %ret_0, $1 %r_1, i32 1
%v_2 = extractelement <8 x $1> %0, i32 2
%r_2 = call $1 $2($1 %v_2)
%ret_2 = insertelement <8 x $1> %ret_1, $1 %r_2, i32 2
%v_3 = extractelement <8 x $1> %0, i32 3
%r_3 = call $1 $2($1 %v_3)
%ret_3 = insertelement <8 x $1> %ret_2, $1 %r_3, i32 3
%v_4 = extractelement <8 x $1> %0, i32 4
%r_4 = call $1 $2($1 %v_4)
%ret_4 = insertelement <8 x $1> %ret_3, $1 %r_4, i32 4
%v_5 = extractelement <8 x $1> %0, i32 5
%r_5 = call $1 $2($1 %v_5)
%ret_5 = insertelement <8 x $1> %ret_4, $1 %r_5, i32 5
%v_6 = extractelement <8 x $1> %0, i32 6
%r_6 = call $1 $2($1 %v_6)
%ret_6 = insertelement <8 x $1> %ret_5, $1 %r_6, i32 6
%v_7 = extractelement <8 x $1> %0, i32 7
%r_7 = call $1 $2($1 %v_7)
%ret_7 = insertelement <8 x $1> %ret_6, $1 %r_7, i32 7
ret <8 x $1> %ret_7
')
;; Given a unary function that takes a 2-wide vector and a 4-wide vector ;; Given a unary function that takes a 2-wide vector and a 4-wide vector
;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide ;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide
;; vector, apply it, and return the corresponding 4-wide vector result ;; vector, apply it, and return the corresponding 4-wide vector result

View File

@@ -3213,9 +3213,10 @@ instances. For other workloads, it may lead to a slowdown due to higher
register pressure; trying both approaches for key kernels may be register pressure; trying both approaches for key kernels may be
worthwhile. worthwhile.
This option is currently only available for the SSE4 and AVX targets, and This option is only available for each of the SSE2, SSE4 and AVX targets.
is selected with the ``--target=sse4-x2`` and ``--target=avx-x2`` options, It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and
respectively. ``--target=avx-x2`` options, respectively.
Compiling With Support For Multiple Instruction Sets Compiling With Support For Multiple Instruction Sets
---------------------------------------------------- ----------------------------------------------------

View File

@@ -129,6 +129,12 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->vectorWidth = 4; t->vectorWidth = 4;
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
} }
else if (!strcasecmp(isa, "sse2-x2")) {
t->isa = Target::SSE2;
t->nativeVectorWidth = 4;
t->vectorWidth = 8;
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
}
else if (!strcasecmp(isa, "sse4")) { else if (!strcasecmp(isa, "sse4")) {
t->isa = Target::SSE4; t->isa = Target::SSE4;
t->nativeVectorWidth = 4; t->nativeVectorWidth = 4;
@@ -193,7 +199,7 @@ Target::SupportedTargetArchs() {
const char * const char *
Target::SupportedTargetISAs() { Target::SupportedTargetISAs() {
return "sse2, sse4, sse4-x2" return "sse2, sse2-x2, sse4, sse4-x2"
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
", avx, avx-x2" ", avx, avx-x2"
#endif #endif

View File

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations"> <ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32"> <ProjectConfiguration Include="Debug|Win32">
@@ -23,6 +23,7 @@
<ClCompile Include="gen-bitcode-c-64.cpp" /> <ClCompile Include="gen-bitcode-c-64.cpp" />
<ClCompile Include="gen-bitcode-dispatch.cpp" /> <ClCompile Include="gen-bitcode-dispatch.cpp" />
<ClCompile Include="gen-bitcode-sse2.cpp" /> <ClCompile Include="gen-bitcode-sse2.cpp" />
<ClCompile Include="gen-bitcode-sse2-x2.cpp" />
<ClCompile Include="gen-bitcode-sse4.cpp" /> <ClCompile Include="gen-bitcode-sse4.cpp" />
<ClCompile Include="gen-bitcode-sse4-x2.cpp" /> <ClCompile Include="gen-bitcode-sse4-x2.cpp" />
<ClCompile Include="gen-stdlib.cpp" /> <ClCompile Include="gen-stdlib.cpp" />
@@ -87,10 +88,10 @@
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command> <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs> <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command> <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs> <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message> <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message> <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
</CustomBuild> </CustomBuild>
@@ -113,10 +114,10 @@
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command> <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4-x2.cpp</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs> <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command> <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4-x2.cpp</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs> <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4-x2.cpp</Message> <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4-x2.cpp</Message> <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
</CustomBuild> </CustomBuild>
@@ -126,23 +127,36 @@
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command> <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs> <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command> <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs> <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message> <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message> <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
</CustomBuild> </CustomBuild>
</ItemGroup> </ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins-sse2-x2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup> <ItemGroup>
<CustomBuild Include="builtins-avx.ll"> <CustomBuild Include="builtins-avx.ll">
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command> <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs> <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command> <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs> <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message> <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message> <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
</CustomBuild> </CustomBuild>

View File

@@ -26,7 +26,7 @@ parser.add_option("-s", "--static-exe", dest="static_exe",
help="Create and run a regular executable for each test (rather than using the LLVM JIT).", help="Create and run a regular executable for each test (rather than using the LLVM JIT).",
default=False, action="store_true") default=False, action="store_true")
parser.add_option('-t', '--target', dest='target', parser.add_option('-t', '--target', dest='target',
help='Set compilation target (sse2, sse4, sse4-x2, avx, avx-x2)', help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2)',
default="sse4") default="sse4")
parser.add_option('-a', '--arch', dest='arch', parser.add_option('-a', '--arch', dest='arch',
help='Set architecture (x86, x86-64)', help='Set architecture (x86, x86-64)',