diff --git a/Makefile b/Makefile
index d5741435..69e24d41 100644
--- a/Makefile
+++ b/Makefile
@@ -40,8 +40,8 @@ LLVM_CONFIG=$(shell which llvm-config)
 CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
 
 # Enable ARM by request
-# To enable: make ARM_ENABLED=ON
-ARM_ENABLED=OFF
+# To enable: make ARM_ENABLED=1
+ARM_ENABLED=0
 
 # Add llvm bin to the path so any scripts run will go to the right llvm-config
 LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
@@ -65,7 +65,7 @@ LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
 ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
   LLVM_COMPONENTS+=option
 endif
-ifeq ($(ARM_ENABLED), ON)
+ifneq ($(ARM_ENABLED), 0)
   LLVM_COMPONENTS+=arm
 endif
 LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))
@@ -79,6 +79,10 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
 ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
 	-lpthread
 
+ifeq ($(LLVM_VERSION),LLVM_3_4)
+  ISPC_LIBS += -lcurses
+endif
+
 ifeq ($(ARCH_OS),Linux)
   ISPC_LIBS += -ldl
 endif
@@ -111,7 +115,7 @@ OPT=-O2
 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
 	-Wall $(LLVM_VERSION_DEF) \
 	-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
-ifeq ($(ARM_ENABLED), ON)
+ifneq ($(ARM_ENABLED), 0)
   CXXFLAGS+=-DISPC_ARM_ENABLED
 endif
 
@@ -132,10 +136,11 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
+TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
 	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
-ifeq ($(ARM_ENABLED), ON)
-  TARGETS+=neon
+ifneq ($(ARM_ENABLED), 0)
+  TARGETS+=neon-32 neon-16 neon-8
 endif
 
 # These files need to be compiled in two versions - 32 and 64 bits.
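The `ARM_ENABLED != 0` branch above both adds the `arm` LLVM component and passes `-DISPC_ARM_ENABLED` to the C++ build, which is what gates the NEON code paths in builtins.cpp later in this patch. As a rough illustration of that pattern (the `lIsNeonTarget()` helper below is hypothetical and not part of the patch; `Target`, `getISA()`, and the `Target::NEON8/16/32` enumerators are the names used elsewhere in this diff):

```cpp
// Hypothetical sketch only -- not code from this patch.
// Assumes ispc's own Target declaration (class Target in ispc.h).
#include "ispc.h"

#ifdef ISPC_ARM_ENABLED  // defined by the Makefile when built with `make ARM_ENABLED=1`
static bool lIsNeonTarget(const Target *target) {
    // This patch splits the single NEON ISA into three mask-width variants
    // (neon-8, neon-16, neon-32), so NEON-specific checks now test all three.
    return target->getISA() == Target::NEON32 ||
           target->getISA() == Target::NEON16 ||
           target->getISA() == Target::NEON8;
}
#endif // ISPC_ARM_ENABLED
```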
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) @@ -145,12 +150,12 @@ BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bi BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o))) BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \ $(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \ - builtins-c-32.cpp builtins-c-64.cpp + builtins-c-32.cpp builtins-c-64.cpp BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_generic_ispc.o stdlib_x86_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -256,12 +261,22 @@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ -objs/stdlib_generic_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for generic - @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py generic > $@ +objs/stdlib_mask1_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask1 + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask1 > $@ -objs/stdlib_x86_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for x86 - @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py x86 > $@ +objs/stdlib_mask8_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask8 + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask8 > $@ + +objs/stdlib_mask16_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask16 + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask16 > $@ + +objs/stdlib_mask32_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask32 + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask32 > $@ diff --git a/builtins.cpp b/builtins.cpp index 7d99abf9..f3a0cf59 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying - if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && - t == LLVMTypes::MaskType) - return AtomicType::VaryingBool; - else if (t == LLVMTypes::Int8VectorType) + if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16; @@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return AtomicType::VaryingDouble; else if (t == LLVMTypes::Int64VectorType) return intAsUnsigned ? 
AtomicType::VaryingUInt64 : AtomicType::VaryingInt64; + else if (t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; // pointers to uniform else if (t == LLVMTypes::Int8PointerType) @@ -488,7 +487,6 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__pause", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", @@ -502,6 +500,8 @@ lSetInternalFunctions(llvm::Module *module) { "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", + "__reduce_add_int8", + "__reduce_add_int16", "__reduce_add_int32", "__reduce_add_int64", "__reduce_equal_double", @@ -581,15 +581,6 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_cos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - "__svml_pow", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -656,7 +647,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, // the values for an ARM target. This maybe won't cause problems // in the generated code, since bulitins.c doesn't do anything too // complex w.r.t. struct layouts, etc. - if (g->target->getISA() != Target::NEON) + if (g->target->getISA() != Target::NEON32 && + g->target->getISA() != Target::NEON16 && + g->target->getISA() != Target::NEON8) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -819,13 +812,32 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { + #ifdef ISPC_ARM_ENABLED - case Target::NEON: { + case Target::NEON8: { if (runtime32) { - EXPORT_MODULE(builtins_bitcode_neon_32bit); + EXPORT_MODULE(builtins_bitcode_neon_8_32bit); } else { - EXPORT_MODULE(builtins_bitcode_neon_64bit); + EXPORT_MODULE(builtins_bitcode_neon_8_64bit); + } + break; + } + case Target::NEON16: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_16_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_16_64bit); + } + break; + } + case Target::NEON32: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_32_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_32_64bit); } break; } @@ -865,10 +877,31 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod break; case 8: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit); + if (g->target->getMaskBitCount() == 16) { + EXPORT_MODULE(builtins_bitcode_sse4_16_32bit); + } + else { + Assert(g->target->getMaskBitCount() == 32); + EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit); + } } else { - EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); + if (g->target->getMaskBitCount() == 16) { + EXPORT_MODULE(builtins_bitcode_sse4_16_64bit); + } + else { + Assert(g->target->getMaskBitCount() == 32); + EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); + } + } + break; + case 16: + Assert(g->target->getMaskBitCount() == 8); + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_8_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_8_64bit); } break; default: @@ -1017,8 +1050,6 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, module, symbolTable); - lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, - symbolTable); 
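The SSE4 dispatch above now keys on both the vector width and the mask width: an 8-wide program with a 16-bit mask gets the new `sse4_16` builtins, an 8-wide program with a 32-bit mask keeps the existing `sse4_x2` builtins, and a 16-wide program (8-bit mask) gets the new `sse4_8` builtins. A condensed sketch of that mapping for the 32-bit runtime case follows; the helper function and its string return are hypothetical, while the module names are the ones passed to `EXPORT_MODULE()` above.

```cpp
// Hypothetical summary of the SSE4 builtins selection added above
// (32-bit runtime case); this helper itself is not part of the patch.
static const char *lSSE4BuiltinsModule32(int vectorWidth, int maskBitCount) {
    if (vectorWidth == 8)
        return (maskBitCount == 16) ? "builtins_bitcode_sse4_16_32bit"   // sse4-16 target
                                    : "builtins_bitcode_sse4_x2_32bit";  // 32-bit mask (sse4-x2)
    if (vectorWidth == 16)
        return "builtins_bitcode_sse4_8_32bit";                          // sse4-8 target, 8-bit mask
    return nullptr;                                                      // other widths unchanged by this patch
}
```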
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, @@ -1040,16 +1071,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its // definitions added. + extern char stdlib_mask1_code[], stdlib_mask8_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[]; if (g->target->getISA() == Target::GENERIC && - g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib - extern char stdlib_generic_code[]; - yy_scan_string(stdlib_generic_code); - yyparse(); + g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib + yy_scan_string(stdlib_mask32_code); } else { - extern char stdlib_x86_code[]; - yy_scan_string(stdlib_x86_code); - yyparse(); + switch (g->target->getMaskBitCount()) { + case 1: + yy_scan_string(stdlib_mask1_code); + break; + case 8: + yy_scan_string(stdlib_mask8_code); + break; + case 16: + yy_scan_string(stdlib_mask16_code); + break; + case 32: + yy_scan_string(stdlib_mask32_code); + break; + default: + FATAL("Unhandled mask bit size for stdlib.ispc"); + } } + yyparse(); } } diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcbe0a66..1d317713 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8c6b7753..8fb2e427 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -134,23 +134,6 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ret <16 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... 
- -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -271,6 +254,33 @@ reduce_equal(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline { %s = add <16 x i32> %0, %1 diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e6ab3a4b..adaed9ba 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -134,23 +134,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
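The `__reduce_add_int8` implementations added to the AVX targets above lean on `llvm.x86.sse2.psad.bw`: a PSADBW against an all-zero vector sums the byte lanes into two 64-bit halves, which are then added together and truncated to `i16`. A scalar reference sketch of what that computes (an assumption-labeled sketch, not part of the patch; it treats the lanes as unsigned bytes, which is how PSADBW operates):

```cpp
#include <cstdint>

// Scalar reference for the psad.bw-based __reduce_add_int8 above:
// sum 16 byte lanes into a 16-bit result.
static uint16_t reduce_add_int8_reference(const uint8_t lane[16]) {
    uint64_t lo = 0, hi = 0;
    for (int i = 0; i < 8; ++i)  lo += lane[i];   // psadbw: low 8 bytes -> low qword
    for (int i = 8; i < 16; ++i) hi += lane[i];   // psadbw: high 8 bytes -> high qword
    return (uint16_t)(lo + hi);                   // add the halves, truncate to i16
}
```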
- -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -217,7 +200,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ret float %sum } - define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } @@ -229,6 +211,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops @@ -257,20 +275,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint32 ops - define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) } - define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal double ops @@ -329,9 +341,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline { } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint64 ops - define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) } diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 9b747e2e..3472c207 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -471,6 +471,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i8 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i8> %v, i32 0 + ret i8 %r +} + +define 
i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i16> %v, i32 0 + ret i16 %r +} define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { %r = extractelement <1 x float> %v, i32 0 @@ -638,104 +647,6 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.sin.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float,@llvm.sin.f32) - -} - -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.cos.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float, @llvm.cos.f32) - -} - -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { -; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) -; store <1 x float> %s, <1 x float> * %1 -; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) - store <1 x float> %sin, <1 x float> * %1 - store <1 x float> %cos, <1 x float> * %2 - ret void -} - -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_tan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unasry1to1(float, @llvm.tan.f32) - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { -; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) -; ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_atan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unsary1to1(float,@llvm.atan.f32) - ;UNSUPPORTED! - ret <1 x float > %0 - -} - -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - ;%y = extractelement <1 x float> %0, i32 0 - ;%x = extractelement <1 x float> %1, i32 0 - ;%q = fdiv float %y, %x - ;%a = call float @llvm.atan.f32 (float %q) - ;%rv = insertelement <1 x float> undef, float %a, i32 0 - ;ret <1 x float> %rv - ; UNSUPPORTED! 
- ret <1 x float > %0 -} - -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.exp.f32) -} - -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.log.f32) -} - -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - %r = extractelement <1 x float> %0, i32 0 - %e = extractelement <1 x float> %1, i32 0 - %s = call float @llvm.pow.f32(float %r,float %e) - %rv = insertelement <1 x float> undef, float %s, i32 0 - ret <1 x float> %rv - -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -953,3 +864,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bbf1b842..c683ff45 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,22 +202,6 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -226,14 +210,16 @@ declare i1 @__any() nounwind readnone declare i1 @__all() nounwind readnone declare i1 @__none() nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone declare float @__reduce_max_float() nounwind readnone -declare i32 @__reduce_add_int32() nounwind readnone +declare i64 @__reduce_add_int32() nounwind readnone declare i32 @__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone - declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -244,7 +230,6 @@ declare double @__reduce_max_double() nounwind readnone declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone - declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone @@ -379,3 +364,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..a0575927 --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,517 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
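The target-generic-common.ll declarations above also widen the horizontal-add result types (`__reduce_add_int8` returns `i16`, `__reduce_add_int16` returns `i32`, `__reduce_add_int32` now returns `i64`). A minimal sketch of that convention, under the assumption that the wider return type is there so the lane sum cannot overflow the element type (the `lane`/`width` parameters are illustrative, not part of the patch):

```cpp
#include <cstdint>

// Sketch of the widened-reduction convention for the int32 case:
// i32 lanes accumulate into an i64 result.
static int64_t reduce_add_int32_reference(const int32_t *lane, int width) {
    int64_t sum = 0;
    for (int i = 0; i < width; ++i)
        sum += lane[i];
    return sum;
}
```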
+ +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 
x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind 
readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 
@__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 60% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index e70b774b..30b062c9 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
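The `__avg_up_*` / `__avg_down_*` builtins added above map directly onto NEON's rounding-halving-add (`vrhadd`) and halving-add (`vhadd`) instructions. A reference sketch of their scalar semantics, assuming the usual definitions `avg_up = (a + b + 1) >> 1` and `avg_down = (a + b) >> 1` computed in a wider intermediate so the sum cannot overflow:

```cpp
#include <cstdint>

// Reference semantics for the vrhadd/vhadd-based averages above (sketch only).
static uint8_t avg_up_uint8(uint8_t a, uint8_t b)   { return (uint8_t)(((uint16_t)a + b + 1) >> 1); } // vrhaddu
static uint8_t avg_down_uint8(uint8_t a, uint8_t b) { return (uint8_t)(((uint16_t)a + b) >> 1); }     // vhaddu
static int16_t avg_up_int16(int16_t a, int16_t b)   { return (int16_t)(((int32_t)a + b + 1) >> 1); }  // vrhadds
static int16_t avg_down_int16(int16_t a, int16_t b) { return (int16_t)(((int32_t)a + b) >> 1); }      // vhadds
```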
-target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. Is there a better approach for NEON? 
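The `__round_varying_float` bodies carried over from the SSE2 target (here and in the new NEON files) use the classic bit trick: strip the sign, add and subtract 2^23 (8.388608e+06) so the mantissa snaps to an integer, then restore the sign; `__floor_varying_float` and `__ceil_varying_float` then nudge the result by -1.0 or +1.0 when the rounding overshot. A scalar reference of the rounding step, as a sketch that assumes round-to-nearest-even FP mode and |x| < 2^23 (it is not part of the patch):

```cpp
#include <cstdint>
#include <cstring>

// Scalar reference for the bit-trick __round_varying_float above.
static float round_reference(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof bits);
    uint32_t sign = bits & 0x80000000u;          // isolate the sign bit
    uint32_t abits = bits ^ sign;                // bits of |x|
    float ax;
    std::memcpy(&ax, &abits, sizeof ax);
    float r = (ax + 8388608.0f) - 8388608.0f;    // +/- 2^23 snaps |x| to an integer
    uint32_t rbits;
    std::memcpy(&rbits, &r, sizeof rbits);
    rbits ^= sign;                               // re-apply the original sign
    std::memcpy(&r, &rbits, sizeof r);
    return r;
}
```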
-define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_int64(i64, 
i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -509,15 +297,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone { neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) } -define internal i32 @add_i32(i32, i32) { - %r = add i32 %0, %1 +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <8 x i32> + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> 
%v8) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i32 @__reduce_add_int16() nounwind readnone { + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 ret i32 %r } -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone -define i32 @__reduce_add_int32() nounwind readnone { - neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32) +define i64 @__reduce_add_int32() nounwind readnone { + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r } declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone @@ -617,90 +428,60 @@ define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts +;; int8/int16 -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) +declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone -gen_masked_store(i8) -gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void +define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) 
nounwind readnone + +define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) +define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) +define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -packed_load_and_store(4) +define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -define_prefetches() +define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..2accfe53 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,583 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int8() nounwind readnone { + %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int16() nounwind readnone { + v16tov8(i16, %0, %va, %vb) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va) + %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32) + %sum = add <2 x i64> %a64, %b64 + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int32() nounwind readnone { + v16tov4(i32, %0, %va, %vb, %vc, %vd) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc) + %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd) + %ab = add <2 x i64> %a64, %b64 + %cd = add <2 x i64> %c64, %d64 + %sum = add <2 x i64> %ab, %cd + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> 
@llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..f892a0a1 --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,338 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
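;; For reference: the v16tov8()/v8tov16() helpers used by the 16-wide
;; __avg_up_*/__avg_down_* definitions above (target-neon-8.ll) are util.m4
;; shufflevector wrappers. A hand-written sketch of the split/call/join
;; pattern they produce -- hypothetical function name, not an exact macro
;; expansion:

declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @avg_up_uint16_sketch(<16 x i16>, <16 x i16>) nounwind readnone {
  ;; split each 16-wide input into two 8-wide halves (what v16tov8 does)
  %a0 = shufflevector <16 x i16> %0, <16 x i16> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b0 = shufflevector <16 x i16> %0, <16 x i16> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a1 = shufflevector <16 x i16> %1, <16 x i16> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b1 = shufflevector <16 x i16> %1, <16 x i16> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ;; one native 128-bit NEON op per half
  %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
  ;; concatenate the halves back to the 16-wide program width (what v8tov16 does)
  %r = shufflevector <8 x i16> %r0, <8 x i16> %r1,
         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %r
}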
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index c6a3afe2..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { ret i64 %val } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 73361720..057ea98f 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> 
%sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -367,6 +281,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <4 x float> @__vec4_add_float(<4 x float> %v0, <4 x float> %v1) nounwind readnone alwaysinline { %v = fadd <4 x float> %v0, %v1 diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 2bb06391..e0a5c3d5 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + 
reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> @@ -463,66 +493,6 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ret <4 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll new file mode 100644 index 00000000..b4772552 --- /dev/null +++ b/builtins/target-sse4-16.ll @@ -0,0 +1,482 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`8') +define(`MASK',`i16') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind +alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to8(%0, 8) +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to8(%0, 9) +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to8(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 8) +} + +define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 9) +} + +define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + 
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline { + %m8 = trunc <8 x MASK> %0 to <8 x i8> + %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer, + <16 x i32> + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %mne = icmp ne i64 %m, 0 + ret i1 %mne +} + +define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { + %r = fadd <8 x float> %0, %1 + ret <8 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { + reduce8(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<8 x float>) nounwind readnone { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<8 x float>) nounwind readnone { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) { + %r = add <8 x i32> %0, %1 + ret <8 x i32> %r +} + +define internal i32 
@__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) { + %r = fadd <8 x double> %0, %1 + ret <8 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) { + %r = add <8 x i64> %0, %1 + ret <8 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x MASK> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i64>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old + store <8 x i64> %blend, <8 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i32>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old + store <8 x i32> %blend, <8 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i16>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old + store <8 x i16> %blend, <8 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 
x i8>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old + store <8 x i8> %blend, <8 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { + %r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll new file mode 100644 index 00000000..a75d8e3a --- /dev/null +++ b/builtins/target-sse4-8.ll @@ -0,0 +1,483 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
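;; For reference: llvm.x86.sse2.pavg.b / pavg.w compute the unsigned rounded
;; average (a + b + 1) >> 1 per lane with no intermediate overflow, which is
;; the __avg_up_uint8/__avg_up_uint16 contract. In the 8-wide target above
;; (target-sse4-16.ll), __avg_up_uint8 pads its <8 x i8> inputs to the full
;; 16-byte register with undef lanes before calling pavg.b, while
;; __avg_up_uint16 maps to pavg.w directly. A scalar sketch of the per-lane
;; math -- hypothetical helper, for illustration only:

define internal i8 @avg_up_uint8_scalar_sketch(i8 %a, i8 %b) nounwind readnone {
  ;; widen so a + b + 1 cannot overflow
  %wa = zext i8 %a to i16
  %wb = zext i8 %b to i16
  %sum = add i16 %wa, %wb
  %sum1 = add i16 %sum, 1
  ;; rounded average, then narrow back to i8
  %avg = lshr i16 %sum1, 1
  %r = trunc i16 %avg to i8
  ret i8 %r
}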
+ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`16') +define(`MASK',`i8') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> , %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> , %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> , %is_mul + ret <16 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind +alwaysinline { + unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to16(%0, 8) +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to16(%0, 9) +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to16(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <16 x double> @__round_varying_double(<16 x 
double>) nounwind readonly alwaysinline { +; XXXround2to4double(%0, 8) + ; FIXME: need round2to16double in util.m4... + ret <16 x double> undef +} + +define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 +; XXXround2to4double(%0, 9) + ret <16 x double> undef +} + +define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 +; XXXround2to4double(%0, 10) + ret <16 x double> undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <16 x float> %call +} + +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <16 x double> %ret +} + +define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %mne = icmp ne i32 %m, 0 + ret i1 %mne +} + +define i1 @__all(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 
@__none(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { + %r = fadd <16 x float> %0, %1 + ret <16 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline { + reduce16(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<16 x float>) nounwind readnone { + reduce16(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<16 x float>) nounwind readnone { + reduce16(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) { + %r = add <16 x i32> %0, %1 + ret <16 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) { + %r = fadd <16 x double> %0, %1 + ret <16 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<16 x double>) nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<16 x double>) nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<16 x double>) nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) { + %r = add <16 x i64> %0, %1 + ret <16 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + 
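A note on the byte reduction above: __reduce_add_int8 uses psad.bw against an all-zero vector. A sum of absolute differences with zero is simply the sum of the bytes, computed separately for each 8-byte half, so adding the two 64-bit halves gives the full horizontal sum. A scalar C sketch of what that computes (names are illustrative only, not the vector implementation):

    #include <stdint.h>
    #include <stdio.h>

    /* What psad.bw computes for one 8-byte group when the second operand is
       all zeros: sum of |b[i] - 0|, i.e. the sum of the bytes. */
    static uint64_t sad8_vs_zero(const uint8_t b[8]) {
        uint64_t s = 0;
        for (int i = 0; i < 8; ++i)
            s += b[i];
        return s;
    }

    /* Mirror of __reduce_add_int8: add the two per-half results and
       truncate to 16 bits. */
    static uint16_t reduce_add_int8(const uint8_t v[16]) {
        return (uint16_t)(sad8_vs_zero(v) + sad8_vs_zero(v + 8));
    }

    int main(void) {
        uint8_t v[16];
        for (int i = 0; i < 16; ++i)
            v[i] = (uint8_t)(i + 1);
        printf("%u\n", (unsigned)reduce_add_int8(v));  /* 136 = 1 + 2 + ... + 16 */
        return 0;
    }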
+define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i8> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i64>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old + store <16 x i64> %blend, <16 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i32>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old + store <16 x i32> %blend, <16 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i16>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old + store <16 x i16> %blend, <16 x i16>* %0, align 4 + ret void +} + +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x MASK> %mask) nounwind alwaysinline { + %old = load <16 x i8>* %0, align 4 + %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1, + <16 x i8> %mask) + store <16 x i8> %blend, <16 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) + %r1 
= call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ccae4d51..897a09eb 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -309,6 +223,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, 
<16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float) } @@ -629,3 +573,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index f622b839..5429b461 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -206,66 +206,6 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} 
- -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions @@ -299,6 +239,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { @@ -503,3 +473,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/util.m4 b/builtins/util.m4 index c19d4930..95e3844d 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,63 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector 
+define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. ;; $1 : name of variable to put the final value in @@ -156,10 +213,7 @@ define(`reduce16', ` ;; the final reduction define(`reduce8by4', ` - %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> - %v2 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> + v8tov4($1, %0, %v1, %v2) %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32> @@ -266,30 +320,66 @@ define(`binary2to4', ` ;; $4: 8-wide operand value define(`unary4to8', ` - %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` - %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2) - %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3) + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) - %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> - %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> - %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b, + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 
x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' @@ -411,6 +501,42 @@ define(`unary2to8', ` ' ) +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements @@ -432,12 +558,58 @@ define(`binary2to8', ` %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> 
undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> - %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> ' ) @@ -460,6 +632,26 @@ ret <8 x float> %ret ' ) +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> @@ -690,6 +882,75 @@ shuffles(i64, 8) ;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...) 
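The mask_converts helpers defined just below let global_atomic_associative turn the execution mask into a vector whose element width matches the values being operated on. Sign extension is the key property: an "on" lane (all ones, i.e. -1) stays all ones at any element width, and an "off" lane stays zero, so the subsequent and/or bit tricks work unchanged. A scalar C sketch of that property, for illustration only:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int8_t on8 = -1, off8 = 0;          /* an 8-bit mask lane: 0xFF or 0x00 */
        int32_t on32 = (int32_t)on8;        /* sign-extends to 0xFFFFFFFF */
        int32_t off32 = (int32_t)off8;      /* stays 0x00000000 */

        int32_t val = 42;
        /* the "zero out any lanes that are off" step: val & mask */
        printf("%d %d\n", val & on32, val & off32);  /* prints: 42 0 */
        return 0;
    }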
+define(`mask_converts', ` +define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i16_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} +') + +mask_converts(WIDTH) + define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, @@ -697,17 +958,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. - ; for the bit tricks below, we need the mask to be sign extended to be - ; the size of the element type. - ifelse( - MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', - $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', - $3,i32, ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' - ) + ; for the bit tricks below, we need the mask to have + ; the same element size as the element type. + %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m) + ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -1551,11 +1805,6 @@ declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() -define void @__pause() nounwind readnone { - call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind - ret void -} - ; This function declares placeholder masked store functions for the ; front-end to use.
; @@ -2440,13 +2689,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to - ret %se - ', ` - ret %0') + ifelse(MASK,i32, `ret %0', + `%se = sext %0 to + ret %se') } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset @@ -2830,17 +3078,11 @@ m4exit(`1') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock +declare i64 @llvm.readcyclecounter() + define i64 @__clock() nounwind { -entry: - tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind - %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind - %asmresult = extractvalue { i32, i32 } %0, 0 - %asmresult1 = extractvalue { i32, i32 } %0, 1 - %conv = zext i32 %asmresult1 to i64 - %shl = shl nuw i64 %conv, 32 - %conv2 = zext i32 %asmresult to i64 - %or = or i64 %shl, %conv2 - ret i64 %or + %r = call i64 @llvm.readcyclecounter() + ret i64 %r } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3201,8 +3443,8 @@ return: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { - per_lane(WIDTH, %2, ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') @@ -3378,10 +3620,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3432,10 +3674,10 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3544,10 +3786,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr - ifelse(MASK,i32, ` - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` @@ -3722,9 +3964,9 @@ pl_done: define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) 
nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3738,9 +3980,9 @@ define @__gather32_$1( %ptrs, ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3804,7 +4046,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3813,13 +4055,13 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, @@ -3835,7 +4077,7 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) 
@@ -3844,13 +4086,13 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, @@ -3876,27 +4118,27 @@ gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, - zeroinitializer, %vecmask) + zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, - i32 1, zeroinitializer, %vecmask) + i32 1, zeroinitializer, %vecmask) ret %v } @@ -3955,9 +4197,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3965,9 +4207,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, %offs define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
- per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3975,8 +4217,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, %offs ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -3987,8 +4229,8 @@ define void @__scatter32_$1( %ptrs, %values, ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -4044,3 +4286,109 @@ define i1 @__rdrand_i64(i64 * %ptr) { ret i1 %good } ') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define @__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() 
+define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..7d4b4cfc 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3704,6 +3704,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { case llvm::Intrinsic::sadd_with_overflow: case llvm::Intrinsic::trap: case llvm::Intrinsic::objectsize: + case llvm::Intrinsic::readcyclecounter: // We directly implement these intrinsics break; default: @@ -4056,6 +4057,9 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, return true; case llvm::Intrinsic::objectsize: return true; + case llvm::Intrinsic::readcyclecounter: + Out << "__clock()"; + return true; } } diff --git a/ctx.cpp b/ctx.cpp index 1e79c97b..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { for (unsigned int i = 0; i < at->getNumElements(); ++i) { llvm::Value *elt = ExtractInst(b, i); llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, - LLVMGetName(elt, "_to_boolvec32")); + LLVMGetName(elt, "_to_boolvec")); ret = InsertInst(ret, sext, i); } return ret; } else - return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32")); + return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec")); } @@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, // Figure out if we need a 8, 16, 32 or 64-bit masked store. llvm::Function *maskedStoreFunc = NULL; + llvm::Type *llvmValueType = value->getType(); const PointerType *pt = CastType(valueType); if (pt != NULL) { @@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) && - g->target->getMaskBitCount() == 1) { + else if (llvmValueType == LLVMTypes::Int1VectorType) { llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, "~mask"); llvm::Value *old = LoadInst(ptr); @@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, StoreInst(final, ptr); return; } - else if (Type::Equal(valueType, AtomicType::VaryingDouble)) { + else if (llvmValueType == LLVMTypes::DoubleVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt64) || - Type::Equal(valueType, AtomicType::VaryingUInt64)) { + else if (llvmValueType == LLVMTypes::Int64VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingFloat)) { + else if (llvmValueType == LLVMTypes::FloatVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) || - Type::Equal(valueType, AtomicType::VaryingInt32) || - Type::Equal(valueType, AtomicType::VaryingUInt32) || - CastType(valueType) != NULL) { + else if (llvmValueType == LLVMTypes::Int32VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt16) || - Type::Equal(valueType, AtomicType::VaryingUInt16)) { + else if (llvmValueType == LLVMTypes::Int16VectorType) { maskedStoreFunc 
= m->module->getFunction("__pseudo_masked_store_i16"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt8) || - Type::Equal(valueType, AtomicType::VaryingUInt8)) { + else if (llvmValueType == LLVMTypes::Int8VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/docs/build.sh b/docs/build.sh index a13f3231..4f4fbfe4 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -1,14 +1,16 @@ #!/bin/bash +rst2html=rst2html.py + for i in ispc perfguide faq; do - rst2html --template=template.txt --link-stylesheet \ + $rst2html --template=template.txt --link-stylesheet \ --stylesheet-path=css/style.css $i.rst > $i.html done -rst2html --template=template-news.txt --link-stylesheet \ +$rst2html --template=template-news.txt --link-stylesheet \ --stylesheet-path=css/style.css news.rst > news.html -rst2html --template=template-perf.txt --link-stylesheet \ +$rst2html --template=template-perf.txt --link-stylesheet \ --stylesheet-path=css/style.css perf.rst > perf.html #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex diff --git a/docs/ispc.rst b/docs/ispc.rst index c6c63172..26cf6be3 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -467,45 +467,100 @@ There are three options that affect the compilation target: ``--arch``, which sets the target architecture, ``--cpu``, which sets the target CPU, and ``--target``, which sets the target instruction set. -By default, the ``ispc`` compiler generates code for the 64-bit x86-64 -architecture (i.e. ``--arch=x86-64``.) To compile to a 32-bit x86 target, -supply ``--arch=x86`` on the command line: +If none of these options is specified, ``ispc`` generates code for the +architecture of the system the compiler is running on (i.e. 64-bit x86-64 +(``--arch=x86-64``) on x86 systems and ARM NEON on ARM systems. + +To compile to a 32-bit x86 target, for example, supply ``--arch=x86`` on +the command line: :: ispc foo.ispc -o foo.obj --arch=x86 -No other architectures are currently supported. +Currently-supported architectures are ``x86-64``, ``x86``, and ``arm``. The target CPU determines both the default instruction set used as well as which CPU architecture the code is tuned for. ``ispc --help`` provides a -list of a number of the supported CPUs. By default, the CPU type of the -system on which you're running ``ispc`` is used to determine the target -CPU. +list of all of the supported CPUs. By default, the CPU type of the system +on which you're running ``ispc`` is used to determine the target CPU. :: ispc foo.ispc -o foo.obj --cpu=corei7-avx -Finally, ``--target`` selects between the SSE2, SSE4, and AVX, and AVX2 -instruction sets. (As general context, SSE2 was first introduced in -processors that shipped in 2001, SSE4 was introduced in 2007, and -processors with AVX were introduced in 2010. AVX2 will be supported on -future CPUs based on Intel's "Haswell" architecture. Consult your CPU's -manual for specifics on which vector instruction set it supports.) +Finally, ``--target`` selects the target instruction set. The target +string is of the form ``[ISA]-i[mask size]x[gang size]``. For example, +``--target=avx2-i32x16`` specifies a target with the AVX2 instruction set, +a mask size of 32 bits, and a gang size of 16. 
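For example, reusing the placeholder file names from the earlier examples, the new 8-bit-mask, 16-wide SSE4 target added by this change would be selected with:

::

   ispc foo.ispc -o foo.obj --target=sse4-i8x16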
+ +The following target ISAs are supported: + +============ ========================================== +Target Description +------------ ------------------------------------------ +avx, avx1 AVX (2010-2011 era Intel CPUs) +avx1.1 AVX 1.1 (2012 era "Ivybridge" Intel CPUs) +avx2 AVX 2 target (2013- Intel "Haswell" CPUs) +neon ARM NEON +sse2 SSE2 (early 2000s era x86 CPUs) +sse4 SSE4 (generally 2008-2010 Intel CPUs) +============ ========================================== + +Consult your CPU's manual for specifics on which vector instruction set it +supports. + +The mask size may be 8, 16, or 32 bits, though not all combinations of ISAs +and mask sizes are supported. For best performance, the best general +approach is to choose a mask size equal to the size of the most common +datatype in your programs. For example, if most of your computation is on +32-bit floating-point values, an ``i32`` target is appropriate. However, +if you're mostly doing computation on 8-bit images, ``i8`` is a better choice. + +See `Basic Concepts: Program Instances and Gangs of Program Instances`_ for +more discussion of the "gang size" and its implications for program +execution. + +Running ``ispc --help`` and looking at the output for the ``--target`` +option gives the most up-to-date documentation about which targets your +compiler binary supports. + +The naming scheme for compilation targets changed in August 2013; the +following table shows the relationship between names in the old scheme and +in the new scheme: + +============= =========== +Target Former Name +------------- ----------- +avx1-i32x8 avx, avx1 +avx1-i32x16 avx-x2 +avx1.1-i32x8 avx1.1 +avx1.1-i32x16 avx1.1-x2 +avx2-i32x8 avx2 +avx2-i32x16 avx2-x2 +neon-8 n/a +neon-16 n/a +neon-32 n/a +sse2-i32x4 sse2 +sse2-i32x8 sse2-x2 +sse4-i32x4 sse4 +sse4-i32x8 sse4-x2 +sse4-i8x16 n/a +sse4-i16x8 n/a +============= =========== By default, the target instruction set is chosen based on the most capable one supported by the system on which you're running ``ispc``. You can override this choice with the ``--target`` flag; for example, to select -Intel® SSE2, use ``--target=sse2``. (As with the other options in this -section, see the output of ``ispc --help`` for a full list of supported -targets.) +Intel® SSE2 with a 32-bit mask and 4 program instances in a gang, use +``--target=sse2-i32x4``. (As with the other options in this section, see +the output of ``ispc --help`` for a full list of supported targets.) Generating Generic C++ Output ----------------------------- In addition to generating object files or assembly output for specific -targets like SSE2, SSE4, and AVX, ``ispc`` provides an option to generate +targets like NEON, SSE2, SSE4, and AVX, ``ispc`` provides an option to generate "generic" C++ output. This As an example, consider the following simple ``ispc`` program: @@ -659,7 +714,7 @@ preprocessor runs: * - ISPC - 1 - Detecting that the ``ispc`` compiler is processing the file - * - ISPC_TARGET_{SSE2,SSE4,AVX,AVX2} + * - ISPC_TARGET_{NEON_8,NEON_16,NEON_32,SSE2,SSE4,AVX,AVX11,AVX2,GENERIC} - 1 - One of these will be set, depending on the compilation target. * - ISPC_POINTER_SIZE @@ -3278,9 +3333,6 @@ for this argument. approximately 1.45e-6 over the range -10pi to 10pi.) * ``fast``: more efficient but lower accuracy versions of the default ``ispc`` implementations. -* ``svml``: use Intel "Short Vector Math Library". Use - ``icc`` to link your final executable so that the appropriate libraries - are linked. * ``system``: use the system's math library. 
On many systems, these functions are more accurate than both of ``ispc``'s implementations. Using these functions may be quite @@ -3365,6 +3417,31 @@ The ``isnan()`` functions test whether the given value is a floating-point uniform bool isnan(uniform double v) + +A number of functions are also available for performing operations on 8- and +16-bit quantities; these map to specialized instructions that perform these +operations on targets that support them. ``avg_up()`` computes the average +of the two values, rounding up if their average is halfway between two +integers (i.e., it computes ``(a+b+1)/2``). + +:: + + int8 avg_up(int8 a, int8 b) + unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) + int16 avg_up(int16 a, int16 b) + unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) + + +``avg_down()`` computes the average of the two values, rounding down (i.e., +it computes ``(a+b)/2``). + +:: + + int8 avg_down(int8 a, int8 b) + unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) + int16 avg_down(int16 a, int16 b) + unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) + + Transcendental Functions ------------------------ @@ -3711,29 +3788,44 @@ instances are added together by the ``reduce_add()`` function. :: - uniform float reduce_add(float x) - uniform int reduce_add(int x) - uniform unsigned int reduce_add(unsigned int x) + uniform int16 reduce_add(int8 x) + uniform unsigned int16 reduce_add(unsigned int8 x) + uniform int32 reduce_add(int16 x) + uniform unsigned int32 reduce_add(unsigned int16 x) + uniform int64 reduce_add(int32 x) + uniform unsigned int64 reduce_add(unsigned int32 x) + uniform int64 reduce_add(int64 x) + uniform unsigned int64 reduce_add(unsigned int64 x) -You can also use functions to compute the minimum and maximum value of the -given value across all of the currently-executing program instances. + uniform float reduce_add(float x) + uniform double reduce_add(double x) + +You can also use functions to compute the minimum value of the given value +across all of the currently-executing program instances. :: - uniform float reduce_min(float a) uniform int32 reduce_min(int32 a) uniform unsigned int32 reduce_min(unsigned int32 a) - uniform double reduce_min(double a) uniform int64 reduce_min(int64 a) uniform unsigned int64 reduce_min(unsigned int64 a) - uniform float reduce_max(float a) + uniform float reduce_min(float a) + uniform double reduce_min(double a) + +Equivalent functions are available to compute the maximum of the given +varying variable over the active program instances. + +:: + uniform int32 reduce_max(int32 a) uniform unsigned int32 reduce_max(unsigned int32 a) - uniform double reduce_max(double a) uniform int64 reduce_max(int64 a) uniform unsigned int64 reduce_max(unsigned int64 a) + uniform float reduce_max(float a) + uniform double reduce_max(double a) + Finally, you can check to see if a particular value has the same value in all of the currently-running program instances: @@ -3741,9 +3833,10 @@ all of the currently-running program instances: uniform bool reduce_equal(int32 v) uniform bool reduce_equal(unsigned int32 v) - uniform bool reduce_equal(float v) uniform bool reduce_equal(int64 v) uniform bool reduce_equal(unsigned int64 v) + + uniform bool reduce_equal(float v) uniform bool reduce_equal(double) There are also variants of these functions that return the value as a @@ -3758,10 +3851,11 @@ performance in the `Performance Guide`_.
uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval) uniform bool reduce_equal(unsigned int32 v, uniform unsigned int32 * uniform sameval) - uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval) uniform bool reduce_equal(unsigned int64 v, uniform unsigned int64 * uniform sameval) + + uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(double, uniform double * uniform sameval) If called when none of the program instances are running, diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 828c1ab4..d81101f7 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) @@ -1758,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 64b82cb1..531ed215 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -408,15 +408,15 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) { +static FORCEINLINE bool __any(__vec32_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) { - return (mask.v==0xFFFFFFFF); +static FORCEINLINE bool __all(__vec32_i1 mask) { + return (mask.v==0xFFFFFFFFul); } -static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) { +static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v==0); } @@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, 
__vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) @@ -1826,3 +1827,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // WIN32 + +#undef FORCEINLINE diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 7869faa5..bbeb007a 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -533,15 +533,15 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) { +static FORCEINLINE bool __any(__vec64_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) { - return (mask.v==0xFFFFFFFFFFFFFFFF); +static FORCEINLINE bool __all(__vec64_i1 mask) { + return (mask.v==0xFFFFFFFFFFFFFFFFull); } -static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) { +static FORCEINLINE bool __none(__vec64_i1 mask) { return (mask.v==0); } @@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >) @@ -1959,3 +1960,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t 
__clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif + +#undef FORCEINLINE diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index bf383c88..8baef8cb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { // reductions /////////////////////////////////////////////////////////////////////////// +static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) { + // TODO: improve this! + int16_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { + // TODO: improve this! + int32_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } @@ -2105,9 +2121,24 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - - - diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 0041a6c9..a1b1fc9d 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16) + static FORCEINLINE float __reduce_add_float(__vec32_f v) { return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); } @@ -2052,7 +2055,24 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32 } */ +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d4739d61..ff00d920 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { + // TODO: improve + int16_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + 
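The ``__clock()`` helpers added to these intrinsics headers pair a serializing ``cpuid`` with ``rdtsc`` so that earlier instructions cannot be reordered past the timestamp read. A minimal usage sketch follows (a hypothetical timing harness, GCC/Clang on x86-64 only; it is not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // Same cpuid + rdtsc pattern as the __clock() helpers above.
    static inline uint64_t read_cycles() {
        uint32_t low, high;
        __asm__ __volatile__("xorl %%eax,%%eax \n cpuid"
                             ::: "%rax", "%rbx", "%rcx", "%rdx");
        __asm__ __volatile__("rdtsc" : "=a"(low), "=d"(high));
        return ((uint64_t)high << 32) | low;
    }

    int main() {
        uint64_t start = read_cycles();
        volatile double sink = 0.;
        for (int i = 0; i < 1000000; ++i)
            sink += i * 0.5;                       // region being timed
        uint64_t end = read_cycles();
        printf("elapsed: %llu cycles\n", (unsigned long long)(end - start));
        return 0;
    }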
+static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { + // TODO: improve + int32_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + static FORCEINLINE float __reduce_add_float(__vec4_f v) { float r = bits_as_float(_mm_extract_ps(v.v, 0)); r += bits_as_float(_mm_extract_ps(v.v, 1)); @@ -3984,6 +4000,22 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include <intrin.h> +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE - - diff --git a/expr.cpp b/expr.cpp index fc3d295a..614cb5e5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -1911,6 +1911,40 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1, } +/* Returns true if shifting right by the given amount will lead to + inefficient code. (Assumes x86 target. May also warn inaccurately if + later optimizations simplify the shift amount more than we are able to + see at this point.) */ +static bool +lIsDifficultShiftAmount(Expr *expr) { + // Uniform shifts (of uniform values) are no problem. + if (expr->GetType()->IsVaryingType() == false) + return false; + + ConstExpr *ce = dynamic_cast<ConstExpr *>(expr); + if (ce) { + // If the shift is by a constant amount, *and* it's the same amount + // in all vector lanes, we're in good shape. + uint32_t amount[ISPC_MAX_NVEC]; + int count = ce->GetValues(amount); + for (int i = 1; i < count; ++i) + if (amount[i] != amount[0]) + return true; + return false; + } + + TypeCastExpr *tce = dynamic_cast<TypeCastExpr *>(expr); + if (tce && tce->expr) { + // Finally, if the shift amount is given by a uniform value that's + // been smeared out into a varying, we have the same shift for all + // lanes and are also in good shape. + return (tce->expr->GetType()->IsUniformType() == false); + } + + return true; +} + + llvm::Value * BinaryExpr::GetValue(FunctionEmitContext *ctx) const { if (!arg0 || !arg1) { @@ -1951,9 +1985,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const { case BitAnd: case BitXor: case BitOr: { - if (op == Shr && arg1->GetType()->IsVaryingType() && - dynamic_cast<ConstExpr *>(arg1) == NULL) - PerformanceWarning(pos, "Shift right is extremely inefficient for " + if (op == Shr && lIsDifficultShiftAmount(arg1)) + PerformanceWarning(pos, "Shift right is inefficient for " "varying shift amounts."); return lEmitBinaryBitOp(op, value0, value1, arg0->GetType()->IsUnsignedType(), ctx); @@ -2207,6 +2240,49 @@ lConstFoldBinaryIntOp(ConstExpr *constArg0, ConstExpr *constArg1, } +/* Returns true if the given arguments (which are assumed to be the + operands of a divide) represent a divide that can be performed by one of + the __fast_idiv functions. + */ +static bool +lCanImproveVectorDivide(Expr *arg0, Expr *arg1, int *divisor) { + const Type *type = arg0->GetType(); + if (!type) + return false; + + // The value being divided must be an int8/16/32. 
+ if (!(Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt32) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt32))) + return false; + + // The divisor must be the same compile-time constant value for all of + // the vector lanes. + ConstExpr *ce = dynamic_cast<ConstExpr *>(arg1); + if (!ce) + return false; + int64_t div[ISPC_MAX_NVEC]; + int count = ce->GetValues(div); + for (int i = 1; i < count; ++i) + if (div[i] != div[0]) + return false; + *divisor = div[0]; + + // And finally, the divisor must be >= 2 and <128 (for 8-bit divides), + // and <256 otherwise. + if (*divisor < 2) + return false; + if (Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8)) + return *divisor < 128; + else + return *divisor < 256; +} + + Expr * BinaryExpr::Optimize() { if (arg0 == NULL || arg1 == NULL) @@ -2269,6 +2345,32 @@ BinaryExpr::Optimize() { } } + int divisor; + if (op == Div && lCanImproveVectorDivide(arg0, arg1, &divisor)) { + Debug(pos, "Improving vector divide by constant %d", divisor); + + std::vector<Symbol *> idivFuns; + m->symbolTable->LookupFunction("__fast_idiv", &idivFuns); + if (idivFuns.size() == 0) { + Warning(pos, "Couldn't find __fast_idiv to optimize integer divide. " + "Are you compiling with --nostdlib?"); + return this; + } + + Expr *idivSymExpr = new FunctionSymbolExpr("__fast_idiv", idivFuns, pos); + ExprList *args = new ExprList(arg0, pos); + args->exprs.push_back(new ConstExpr(AtomicType::UniformInt32, divisor, arg1->pos)); + Expr *idivCall = new FunctionCallExpr(idivSymExpr, args, pos); + + idivCall = ::TypeCheck(idivCall); + if (idivCall == NULL) + return NULL; + + Assert(Type::EqualIgnoringConst(GetType(), idivCall->GetType())); + idivCall = new TypeCastExpr(GetType(), idivCall, pos); + return ::Optimize(idivCall); + } + // From here on out, we're just doing constant folding, so if both args // aren't constants then we're done... if (constArg0 == NULL || constArg1 == NULL) @@ -3021,6 +3123,14 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { +#if 0 // !defined(LLVM_3_1) + // Though it should be equivalent, this seems to cause non-trivial + // performance regressions versus the below. This may be related to + // http://llvm.org/bugs/show_bug.cgi?id=16941. 
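The rewrite above turns a varying divide by a compile-time constant into a call to the ``__fast_idiv`` standard-library routine. The usual trick behind such helpers is to replace the division with a multiply by a precomputed "magic" constant followed by a shift; the following standalone sketch (generic strength reduction, not ispc's actual ``__fast_idiv``) verifies that idea for unsigned 8-bit dividends and divisors from 2 to 255:

    #include <cstdint>
    #include <cstdio>

    int main() {
        int mismatches = 0;
        for (uint32_t d = 2; d < 256; ++d) {
            // Round-up 16-bit reciprocal ("magic") multiplier for dividing by d.
            uint32_t magic = (65536u + d - 1) / d;
            for (uint32_t x = 0; x < 256; ++x)
                if (((x * magic) >> 16) != x / d)
                    ++mismatches;
        }
        printf("mismatches: %d\n", mismatches);   // prints 0: multiply+shift == x/d
        return 0;
    }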
+ if (test->getType() != LLVMTypes::Int1VectorType) + test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); + return ctx->SelectInst(test, expr1, expr2, "select"); +#else llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); @@ -3029,6 +3139,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, PointerType::GetUniform(type)->LLVMType(g->ctx)); ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type)); return ctx->LoadInst(resultPtr, "selectexpr_final"); +#endif // !LLVM_3_1 } @@ -6059,9 +6170,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // If we have a bool vector of i32 elements, first truncate - // down to a single bit + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // If we have a bool vector of non-i1 elements, first + // truncate down to a single bit. exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); // And then do an unisgned int->float cast cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int @@ -6103,8 +6214,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // truncate i32 bool vector values to i1s + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // truncate bool vector values to i1s exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double exprVal, targetType, cOpName); @@ -6141,7 +6252,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6177,7 +6288,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6219,7 +6330,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6259,7 +6370,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6305,7 +6416,7 @@ 
lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6345,7 +6456,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6391,7 +6502,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6429,7 +6540,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6523,12 +6634,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, if (fromType->IsUniformType()) { if (toType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) { - // extend out to i32 bool values from i1 here. then we'll - // turn into a vector below, the way it does for everyone - // else... + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) { + // extend out to an bool as an i8/i16/i32 from the i1 here. + // Then we'll turn that into a vector below, the way it + // does for everyone else... cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(), - LLVMGetName(cast, "to_i32bool")); + LLVMGetName(cast, "to_i_bool")); } } else diff --git a/ispc.cpp b/ispc.cpp index 7743d6b2..8a0f16c6 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon"; + return "neon-i32x4"; #else int info[4]; __cpuid(info, 1); @@ -121,19 +121,19 @@ lGetSystemISA() { int info2[4]; __cpuidex(info2, 7, 0); if ((info2[1] & (1 << 5)) != 0) - return "avx2"; + return "avx2-i32x8"; else - return "avx1.1"; + return "avx1.1-i32x8"; } // Regular AVX - return "avx"; + return "avx-i32x8"; } else if ((info[2] & (1 << 19)) != 0) - return "sse4"; + return "sse4-i32x4"; else if ((info[3] & (1 << 26)) != 0) - return "sse2"; + return "sse2-i32x4"; else { - fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n"); + Error(SourcePos(), "Unable to detect supported SSE/AVX ISA. Exiting."); exit(1); } #endif @@ -186,22 +186,22 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // If a CPU was specified explicitly, try to pick the best // possible ISA based on that. 
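``lGetSystemISA()`` above maps the host CPU's capabilities (queried via ``cpuid``) onto one of the new target names. The same mapping can be illustrated with the GCC/Clang ``__builtin_cpu_supports()`` builtin; this is only a sketch of that logic (it is not code from the patch, and it omits the AVX 1.1 distinction):

    #include <cstdio>

    // Pick a default ispc target string from the host CPU's features (x86 only).
    static const char *pick_default_target() {
        if (__builtin_cpu_supports("avx2"))   return "avx2-i32x8";
        if (__builtin_cpu_supports("avx"))    return "avx-i32x8";
        if (__builtin_cpu_supports("sse4.1")) return "sse4-i32x4";
        if (__builtin_cpu_supports("sse2"))   return "sse2-i32x4";
        return NULL;  // no usable SSE/AVX ISA detected
    }

    int main() {
        const char *t = pick_default_target();
        printf("default target: %s\n", t ? t : "(none)");
        return 0;
    }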
if (!strcmp(cpu, "core-avx2")) - isa = "avx2"; + isa = "avx2-i32x8"; #ifdef ISPC_ARM_ENABLED else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) - isa = "neon"; + isa = "neon-i32x4"; #endif else if (!strcmp(cpu, "core-avx-i")) - isa = "avx1.1"; + isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx"; + isa = "avx-i32x8"; else if (!strcmp(cpu, "corei7") || !strcmp(cpu, "penryn")) - isa = "sse4"; + isa = "sse4-i32x4"; else - isa = "sse2"; + isa = "sse2-i32x4"; Warning(SourcePos(), "No --target specified on command-line. " "Using ISA \"%s\" based on specified CPU \"%s\".", isa, cpu); @@ -211,12 +211,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // supports. isa = lGetSystemISA(); Warning(SourcePos(), "No --target specified on command-line. " - "Using system ISA \"%s\".", isa); + "Using default system target \"%s\".", isa); } } #if defined(ISPC_ARM_ENABLED) && !defined(__arm__) - if (cpu == NULL && !strcmp(isa, "neon")) + if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... cpu = "cortex-a9"; @@ -241,8 +241,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } } if (foundCPU == false) { - fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: " - "%s.\n", cpu, SupportedTargetCPUs().c_str()); + Error(SourcePos(), "Error: CPU type \"%s\" unknown. Supported CPUs: " + "%s.", cpu, SupportedCPUs().c_str()); return; } } @@ -251,7 +251,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : if (arch == NULL) { #ifdef ISPC_ARM_ENABLED - if (!strcmp(isa, "neon")) + if (!strncmp(isa, "neon", 4)) arch = "arm"; else #endif @@ -283,7 +283,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // Check default LLVM generated targets - if (!strcasecmp(isa, "sse2")) { + if (!strcasecmp(isa, "sse2") || + !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -291,7 +292,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse2-x2")) { + else if (!strcasecmp(isa, "sse2-x2") || + !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; @@ -299,7 +301,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4")) { + else if (!strcasecmp(isa, "sse4") || + !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -308,7 +311,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { + else if (!strcasecmp(isa, "sse4x2") || + !strcasecmp(isa, "sse4-x2") || + !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; @@ -316,7 +321,24 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "generic-4")) { + else if (!strcasecmp(isa, "sse4-i8x16")) { + this->m_isa 
= Target::SSE4; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "sse4-i16x8")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "generic-4") || + !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -326,7 +348,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-8")) { + else if (!strcasecmp(isa, "generic-8") || + !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -336,7 +359,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-16")) { + else if (!strcasecmp(isa, "generic-16") || + !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -346,7 +370,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-32")) { + else if (!strcasecmp(isa, "generic-32") || + !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; this->m_vectorWidth = 32; @@ -356,7 +381,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-64")) { + else if (!strcasecmp(isa, "generic-64") || + !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; this->m_vectorWidth = 64; @@ -366,14 +392,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-1")) { + else if (!strcasecmp(isa, "generic-1") || + !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) { + else if (!strcasecmp(isa, "avx") || + !strcasecmp(isa, "avx1") || + !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -381,7 +410,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) { + else if (!strcasecmp(isa, "avx-x2") || + !strcasecmp(isa, "avx1-x2") || + !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -389,7 +420,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; 
this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx1.1")) { + else if (!strcasecmp(isa, "avx1.1") || + !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -402,7 +434,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx1.1-x2")) { + else if (!strcasecmp(isa, "avx1.1-x2") || + !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -415,7 +448,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx2")) { + else if (!strcasecmp(isa, "avx2") || + !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -433,7 +467,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "avx2-x2")) { + else if (!strcasecmp(isa, "avx2-x2") || + !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -452,8 +487,27 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #endif } #ifdef ISPC_ARM_ENABLED - else if (!strcasecmp(isa, "neon")) { - this->m_isa = Target::NEON; + else if (!strcasecmp(isa, "neon-i8x16")) { + this->m_isa = Target::NEON8; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "neon-i16x8")) { + this->m_isa = Target::NEON16; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "neon") || + !strcasecmp(isa, "neon-i32x4")) { + this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -463,8 +517,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } #endif else { - fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n", - isa, SupportedTargetISAs()); + Error(SourcePos(), "Target \"%s\" is unknown. 
Choices are: %s.", + isa, SupportedTargets()); error = true; } @@ -477,7 +531,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string featuresString = m_attributes; llvm::TargetOptions options; #ifdef ISPC_ARM_ENABLED - if (m_isa == Target::NEON) + if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || + m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; #endif #if !defined(LLVM_3_1) @@ -557,7 +612,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string -Target::SupportedTargetCPUs() { +Target::SupportedCPUs() { std::string ret; int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]); for (int i = 0; i < count; ++i) { @@ -570,7 +625,7 @@ Target::SupportedTargetCPUs() { const char * -Target::SupportedTargetArchs() { +Target::SupportedArchs() { return #ifdef ISPC_ARM_ENABLED "arm, " @@ -580,14 +635,18 @@ Target::SupportedTargetArchs() { const char * -Target::SupportedTargetISAs() { +Target::SupportedTargets() { return #ifdef ISPC_ARM_ENABLED - "neon, " + "neon-i8x16, neon-16x8, neon-32x4, " #endif - "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" - ", avx1.1, avx1.1-x2, avx2, avx2-x2" - ", generic-1, generic-4, generic-8, generic-16, generic-32"; + "sse2-i32x4, sse2-i32x8, " + "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " + "avx1-i32x8, avx1-i32x16, " + "avx1.1-i32x8, avx1.1-i32x16, " + "avx2-i32x8, avx2-i32x16, " + "generic-x1, generic-x4, generic-x8, generic-x16, " + "generic-x32, generic-x64"; } @@ -624,9 +683,13 @@ const char * Target::ISAToString(ISA isa) { switch (isa) { #ifdef ISPC_ARM_ENABLED - case Target::NEON: + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; #endif - return "neon"; case Target::SSE2: return "sse2"; case Target::SSE4: diff --git a/ispc.h b/ispc.h index bb9e2b31..fc78e415 100644 --- a/ispc.h +++ b/ispc.h @@ -181,9 +181,10 @@ public: added or the enumerant values are reordered. */ enum ISA { #ifdef ISPC_ARM_ENABLED - NEON, + NEON32, NEON16, NEON8, #endif - SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. Returns true if the @@ -191,16 +192,16 @@ public: Target(const char *arch, const char *cpu, const char *isa, bool pic); /** Returns a comma-delimited string giving the names of the currently - supported target ISAs. */ - static const char *SupportedTargetISAs(); + supported compilation targets. */ + static const char *SupportedTargets(); /** Returns a comma-delimited string giving the names of the currently - supported target CPUs. */ - static std::string SupportedTargetCPUs(); + supported CPUs. */ + static std::string SupportedCPUs(); /** Returns a comma-delimited string giving the names of the currently - supported target architectures. */ - static const char *SupportedTargetArchs(); + supported architectures. */ + static const char *SupportedArchs(); /** Returns a triple string specifying the target architecture, vendor, and environment. */ @@ -487,7 +488,7 @@ struct Globals { /** There are a number of math libraries that can be used for transcendentals and the like during program compilation. 
*/ - enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; + enum MathLib { Math_ISPC, Math_ISPCFast, Math_System }; MathLib mathLib; /** Records whether the ispc standard library should be made available diff --git a/ispc.vcxproj b/ispc.vcxproj index 36fbad5d..b4a8b764 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -51,10 +51,16 @@ + + + + - - + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -97,11 +103,13 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > $(Configuration)/gen-stdlib-x86.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > $(Configuration)/gen-stdlib-generic.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; - $(Configuration)/gen-stdlib-generic.cpp;$(Configuration)/gen-stdlib-x86.cpp - Building gen-stdlib-{generic,x86}.cpp + $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp + Building gen-stdlib-{mask1,8,16,32}.cpp @@ -131,6 +139,42 @@ Building gen-bitcode-sse4-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-64bit.cpp + + Document diff --git a/lex.ll b/lex.ll index f6633fce..8baa627a 100644 --- a/lex.ll +++ b/lex.ll @@ -77,6 +77,8 @@ static int allTokens[] = { TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, 
TOKEN_FLOAT_CONSTANT, + TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, + TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, @@ -150,6 +152,10 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; + tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; + tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; + tokenToName[TOKEN_UINT16_CONSTANT] = "TOKEN_UINT16_CONSTANT"; tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; @@ -260,6 +266,10 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; + tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; + tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; + tokenNameRemap["TOKEN_UINT16_CONSTANT"] = "unsigned int16 constant"; tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; @@ -599,7 +609,22 @@ lParseInteger(bool dotdotdot) { } else { // No u or l suffix - // First, see if we can fit this into a 32-bit integer... + // If we're compiling to an 8-bit mask target and the constant + // fits into 8 bits, return an 8-bit int. + if (g->target->getMaskBitCount() == 8) { + if (yylval.intVal <= 0x7fULL) + return TOKEN_INT8_CONSTANT; + else if (yylval.intVal <= 0xffULL) + return TOKEN_UINT8_CONSTANT; + } + // And similarly for 16-bit masks and constants + if (g->target->getMaskBitCount() == 16) { + if (yylval.intVal <= 0x7fffULL) + return TOKEN_INT16_CONSTANT; + else if (yylval.intVal <= 0xffffULL) + return TOKEN_UINT16_CONSTANT; + } + // Otherwise, see if we can fit this into a 32-bit integer... 
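With the lexer changes above, the default type of an unsuffixed integer literal now depends on the target's mask width, so small constants stay at the narrower width on 8- and 16-bit-mask targets. A standalone sketch of the classification rule (a hypothetical helper mirroring the ranges used here, not ispc source):

    #include <cstdint>
    #include <cstdio>

    // Default type an unsuffixed literal gets for a target with the given
    // mask width (8, 16, or anything else -> the 32-bit rule).
    static const char *literal_type(uint64_t v, int maskBits) {
        if (maskBits == 8) {
            if (v <= 0x7fu)   return "int8";
            if (v <= 0xffu)   return "unsigned int8";
        }
        if (maskBits == 16) {
            if (v <= 0x7fffu) return "int16";
            if (v <= 0xffffu) return "unsigned int16";
        }
        if (v <= 0x7fffffffu) return "int32";
        if (v <= 0xffffffffu) return "unsigned int32";
        return "int64 or unsigned int64";
    }

    int main() {
        printf("200 on sse4-i8x16: %s\n", literal_type(200, 8));   // unsigned int8
        printf("200 on sse4-i16x8: %s\n", literal_type(200, 16));  // int16
        printf("200 on sse4-i32x4: %s\n", literal_type(200, 32));  // int32
        return 0;
    }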
if (yylval.intVal <= 0x7fffffffULL) return TOKEN_INT32_CONSTANT; else if (yylval.intVal <= 0xffffffffULL) diff --git a/llvmutil.cpp b/llvmutil.cpp index 26c18bf5..180c8676 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -115,13 +115,25 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth()); - else { - Assert(target.getMaskBitCount() == 32); + break; + case 8: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth()); + break; + case 16: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth()); + break; + case 32: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); + break; + default: + FATAL("Unhandled mask width for initializing MaskType"); } LLVMTypes::Int1VectorType = @@ -154,12 +166,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, false /*unsigned*/); // 0x1 - else + break; + case 8: + onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1, + true /*signed*/); // 0xff + break; + case 16: + onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1, + true /*signed*/); // 0xffff + break; + case 32: onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff + break; + default: + FATAL("Unhandled mask width for onMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskOnes.push_back(onMask); @@ -167,13 +193,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, true /*signed*/); - else + break; + case 8: + offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0, + true /*signed*/); + break; + case 16: + offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0, + true /*signed*/); + break; + case 32: offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); - + break; + default: + FATAL("Unhandled mask width for offMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskZeros.push_back(offMask); LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros); @@ -444,9 +483,14 @@ LLVMBoolVector(bool b) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 
0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = b ? LLVMTrue : LLVMFalse; } @@ -465,9 +509,14 @@ LLVMBoolVector(const bool *bvec) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = bvec[i] ? LLVMTrue : LLVMFalse; } diff --git a/main.cpp b/main.cpp index b4a02b9f..58daa2d3 100644 --- a/main.cpp +++ b/main.cpp @@ -85,13 +85,16 @@ usage(int ret) { printf(" \t\taddressing calculations are done by default, even\n"); printf(" \t\ton 64-bit target architectures.)\n"); printf(" [--arch={%s}]\t\tSelect target architecture\n", - Target::SupportedTargetArchs()); + Target::SupportedArchs()); printf(" [--c++-include-file=]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n"); #ifndef ISPC_IS_WINDOWS printf(" [--colored-output]\t\tAlways use terminal colors in error/warning messages.\n"); #endif - printf(" [--cpu=]\t\t\tSelect target CPU type\n"); - printf(" ={%s}\n", Target::SupportedTargetCPUs().c_str()); + printf(" "); + char cpuHelp[2048]; + sprintf(cpuHelp, "[--cpu=]\t\t\tSelect target CPU type\n={%s}\n", + Target::SupportedCPUs().c_str()); + PrintWithWordBreaks(cpuHelp, 16, TerminalWidth(), stdout); printf(" [-D]\t\t\t\t#define given value when running preprocessor\n"); printf(" [--dev-stub ]\t\tEmit device-side offload stub functions to file\n"); printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); @@ -109,7 +112,6 @@ usage(int ret) { printf(" [--math-lib=