diff --git a/Makefile b/Makefile
index d5741435..09ec302d 100644
--- a/Makefile
+++ b/Makefile
@@ -40,8 +40,8 @@ LLVM_CONFIG=$(shell which llvm-config)
 CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
 
 # Enable ARM by request
-# To enable: make ARM_ENABLED=ON
-ARM_ENABLED=OFF
+# To enable: make ARM_ENABLED=1
+ARM_ENABLED=0
 
 # Add llvm bin to the path so any scripts run will go to the right llvm-config
 LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
@@ -65,7 +65,7 @@ LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
 ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
     LLVM_COMPONENTS+=option
 endif
-ifeq ($(ARM_ENABLED), ON)
+ifneq ($(ARM_ENABLED), 0)
     LLVM_COMPONENTS+=arm
 endif
 LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))
@@ -79,6 +79,10 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
 ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
     -lpthread
 
+ifeq ($(LLVM_VERSION),LLVM_3_4)
+    ISPC_LIBS += -lcurses
+endif
+
 ifeq ($(ARCH_OS),Linux)
     ISPC_LIBS += -ldl
 endif
@@ -109,9 +113,14 @@ CXX=g++
 CPP=cpp
 OPT=-O2
 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
-    -Wall $(LLVM_VERSION_DEF) \
-    -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
-ifeq ($(ARM_ENABLED), ON)
+    $(LLVM_VERSION_DEF) \
+    -Wall \
+    -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \
+    -Wno-sign-compare
+ifneq ($(LLVM_VERSION),LLVM_3_1)
+    CXXFLAGS+=-Werror
+endif
+ifneq ($(ARM_ENABLED), 0)
     CXXFLAGS+=-DISPC_ARM_ENABLED
 endif
 
@@ -132,10 +141,11 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
     type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
     opt.h stmt.h sym.h type.h util.h
-TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
+TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+    sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
     generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
-ifeq ($(ARM_ENABLED), ON)
-    TARGETS+=neon
+ifneq ($(ARM_ENABLED), 0)
+    TARGETS+=neon-32 neon-16 neon-8
 endif
 # These files need to be compiled in two versions - 32 and 64 bits.
 BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
@@ -145,12 +155,12 @@ BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bi
 BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o)))
 BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \
     $(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \
-    builtins-c-32.cpp builtins-c-64.cpp
+    builtins-c-32.cpp builtins-c-64.cpp
 
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll
 OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
-    stdlib_generic_ispc.o stdlib_x86_ispc.o \
+    stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
     $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
 
 default: ispc
@@ -256,12 +266,22 @@ objs/builtins-c-64.cpp: builtins/builtins.c
     @echo Creating C++ source from builtins definition file $<
     @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@
 
-objs/stdlib_generic_ispc.cpp: stdlib.ispc
-    @echo Creating C++ source from $< for generic
-    @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
-    python stdlib2cpp.py generic > $@
+objs/stdlib_mask1_ispc.cpp: stdlib.ispc
+    @echo Creating C++ source from $< for mask1
+    @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+    python stdlib2cpp.py mask1 > $@
 
-objs/stdlib_x86_ispc.cpp: stdlib.ispc
-    @echo Creating C++ source from $< for x86
-    @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
-    python stdlib2cpp.py x86 > $@
+objs/stdlib_mask8_ispc.cpp: stdlib.ispc
+    @echo Creating C++ source from $< for mask8
+    @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+    python stdlib2cpp.py mask8 > $@
+
+objs/stdlib_mask16_ispc.cpp: stdlib.ispc
+    @echo Creating C++ source from $< for mask16
+    @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+    python stdlib2cpp.py mask16 > $@
+
+objs/stdlib_mask32_ispc.cpp: stdlib.ispc
+    @echo Creating C++ source from $< for mask32
+    @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+    python stdlib2cpp.py mask32 > $@
diff --git a/builtins.cpp b/builtins.cpp
index 7d99abf9..886eec15 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
         return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
 
     // varying
-    if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
-        t == LLVMTypes::MaskType)
-        return AtomicType::VaryingBool;
-    else if (t == LLVMTypes::Int8VectorType)
+    if (t == LLVMTypes::Int8VectorType)
         return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
     else if (t == LLVMTypes::Int16VectorType)
         return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
@@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
         return AtomicType::VaryingDouble;
     else if (t == LLVMTypes::Int64VectorType)
         return intAsUnsigned ?
             AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
+    else if (t == LLVMTypes::MaskType)
+        return AtomicType::VaryingBool;
     // pointers to uniform
     else if (t == LLVMTypes::Int8PointerType)
@@ -488,7 +487,6 @@ lSetInternalFunctions(llvm::Module *module) {
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",
-        "__pause",
         "__popcnt_int32",
         "__popcnt_int64",
         "__prefetch_read_uniform_1",
@@ -502,6 +500,8 @@ lSetInternalFunctions(llvm::Module *module) {
         "__rdrand_i64",
         "__reduce_add_double",
         "__reduce_add_float",
+        "__reduce_add_int8",
+        "__reduce_add_int16",
         "__reduce_add_int32",
         "__reduce_add_int64",
         "__reduce_equal_double",
@@ -656,7 +656,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
     // the values for an ARM target. This maybe won't cause problems
    // in the generated code, since builtins.c doesn't do anything too
    // complex w.r.t. struct layouts, etc.
-    if (g->target->getISA() != Target::NEON)
+    if (g->target->getISA() != Target::NEON32 &&
+        g->target->getISA() != Target::NEON16 &&
+        g->target->getISA() != Target::NEON8)
 #endif // !__arm__
     {
         Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
@@ -819,13 +821,32 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     // Next, add the target's custom implementations of the various needed
     // builtin functions (e.g. __masked_store_32(), etc).
     switch (g->target->getISA()) {
+
 #ifdef ISPC_ARM_ENABLED
-    case Target::NEON: {
+    case Target::NEON8: {
         if (runtime32) {
-            EXPORT_MODULE(builtins_bitcode_neon_32bit);
+            EXPORT_MODULE(builtins_bitcode_neon_8_32bit);
         }
         else {
-            EXPORT_MODULE(builtins_bitcode_neon_64bit);
+            EXPORT_MODULE(builtins_bitcode_neon_8_64bit);
+        }
+        break;
+    }
+    case Target::NEON16: {
+        if (runtime32) {
+            EXPORT_MODULE(builtins_bitcode_neon_16_32bit);
+        }
+        else {
+            EXPORT_MODULE(builtins_bitcode_neon_16_64bit);
+        }
+        break;
+    }
+    case Target::NEON32: {
+        if (runtime32) {
+            EXPORT_MODULE(builtins_bitcode_neon_32_32bit);
+        }
+        else {
+            EXPORT_MODULE(builtins_bitcode_neon_32_64bit);
         }
         break;
     }
@@ -865,10 +886,31 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
             break;
         case 8:
             if (runtime32) {
-                EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+                if (g->target->getMaskBitCount() == 16) {
+                    EXPORT_MODULE(builtins_bitcode_sse4_16_32bit);
+                }
+                else {
+                    Assert(g->target->getMaskBitCount() == 32);
+                    EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+                }
             }
             else {
-                EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+                if (g->target->getMaskBitCount() == 16) {
+                    EXPORT_MODULE(builtins_bitcode_sse4_16_64bit);
+                }
+                else {
+                    Assert(g->target->getMaskBitCount() == 32);
+                    EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+                }
+            }
+            break;
+        case 16:
+            Assert(g->target->getMaskBitCount() == 8);
+            if (runtime32) {
+                EXPORT_MODULE(builtins_bitcode_sse4_8_32bit);
+            }
+            else {
+                EXPORT_MODULE(builtins_bitcode_sse4_8_64bit);
             }
             break;
         default:
@@ -1040,16 +1082,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     // If the user wants the standard library to be included, parse the
     // serialized version of the stdlib.ispc file to get its
     // definitions added.
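The hunks above and below make the choice of builtins module and serialized stdlib a function of the target's mask bit count as well as its ISA and vector width. As a reading aid, the C++ sketch below condenses the SSE4 part of that mapping into a standalone helper; the helper name and string return value are illustrative only (the real code dispatches through the EXPORT_MODULE macro to the generated builtins_bitcode_* modules, and the 4-wide case sits outside the hunk shown).

    #include <cassert>
    #include <cstdio>
    #include <string>

    // Illustrative-only condensation of the SSE4 module selection shown in the
    // DefineStdlib() hunk above; widths other than 8 and 16 are handled
    // elsewhere in the real switch.
    static std::string SelectSSE4BuiltinsModule(int vectorWidth, int maskBits,
                                                bool runtime32) {
        const char *bits = runtime32 ? "32bit" : "64bit";
        switch (vectorWidth) {
        case 8:
            // 8-wide SSE4 now comes in a 16-bit-mask flavor ("sse4-16") in
            // addition to the original 32-bit-mask "x2" flavor.
            if (maskBits == 16)
                return std::string("builtins_bitcode_sse4_16_") + bits;
            assert(maskBits == 32);
            return std::string("builtins_bitcode_sse4_x2_") + bits;
        case 16:
            // 16-wide SSE4 ("sse4-8") uses an 8-bit mask.
            assert(maskBits == 8);
            return std::string("builtins_bitcode_sse4_8_") + bits;
        default:
            assert(false && "vector width handled outside this sketch");
            return std::string();
        }
    }

    int main() {
        // Prints builtins_bitcode_sse4_16_32bit
        std::printf("%s\n", SelectSSE4BuiltinsModule(8, 16, true).c_str());
        return 0;
    }

The NEON8/NEON16/NEON32 cases follow the same pattern, with the mask width implied by the ISA enumerator rather than a separate getMaskBitCount() query.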
+ extern char stdlib_mask1_code[], stdlib_mask8_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[]; if (g->target->getISA() == Target::GENERIC && - g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib - extern char stdlib_generic_code[]; - yy_scan_string(stdlib_generic_code); - yyparse(); + g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib + yy_scan_string(stdlib_mask32_code); } else { - extern char stdlib_x86_code[]; - yy_scan_string(stdlib_x86_code); - yyparse(); + switch (g->target->getMaskBitCount()) { + case 1: + yy_scan_string(stdlib_mask1_code); + break; + case 8: + yy_scan_string(stdlib_mask8_code); + break; + case 16: + yy_scan_string(stdlib_mask16_code); + break; + case 32: + yy_scan_string(stdlib_mask32_code); + break; + default: + FATAL("Unhandled mask bit size for stdlib.ispc"); + } } + yyparse(); } } diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcbe0a66..1d317713 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8c6b7753..d9e0322b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -271,6 +271,33 @@ reduce_equal(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline { %s = add <16 x i32> %0, %1 diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e6ab3a4b..90e2f3ac 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -217,7 +217,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ret float %sum } - define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } @@ -229,6 +228,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) 
+ %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops @@ -257,20 +292,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint32 ops - define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) } - define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal double ops @@ -329,9 +358,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline { } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint64 ops - define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) } diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 9b747e2e..31ebcdd5 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -471,6 +471,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i8 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i8> %v, i32 0 + ret i8 %r +} + +define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i16> %v, i32 0 + ret i16 %r +} define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { %r = extractelement <1 x float> %v, i32 0 @@ -953,3 +962,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bbf1b842..2896c6b1 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -226,14 +226,16 @@ declare i1 @__any() nounwind readnone declare i1 @__all() nounwind readnone declare i1 @__none() nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone declare float @__reduce_max_float() nounwind readnone -declare i32 @__reduce_add_int32() nounwind readnone +declare i64 @__reduce_add_int32() nounwind readnone declare i32 
@__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone - declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -244,7 +246,6 @@ declare double @__reduce_max_double() nounwind readnone declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone - declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone @@ -379,3 +380,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..a0575927 --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,517 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? 
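The rounding builtins that follow reuse the SSE2-style sequence flagged in the FIXME above: clear the sign bit, add and then subtract 2^23 (8.388608e+06) so the float's mantissa can no longer hold fraction bits, and restore the sign with an xor; __floor_varying_float and __ceil_varying_float then adjust the rounded value by -1.0 or +1.0 where a comparison against the input shows the rounding went the wrong way. A scalar C++ sketch of the core trick, for reference only (the function name is ours and, like the vector builtins, it is only meaningful for |x| < 2^23):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Scalar illustration of the add/subtract-2^23 rounding trick used by
    // __round_varying_float below (valid for |x| < 2^23).
    static float round_magic(float x) {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        uint32_t sign = bits & 0x80000000u;           // isolate the sign bit
        uint32_t absBits = bits ^ sign;               // bits of |x|
        float absX;
        std::memcpy(&absX, &absBits, sizeof(absX));
        float r = (absX + 8388608.0f) - 8388608.0f;   // 2^23 forces integer rounding
        std::memcpy(&bits, &r, sizeof(bits));
        bits ^= sign;                                 // put the sign back
        std::memcpy(&r, &bits, sizeof(r));
        return r;
    }

    int main() {
        std::printf("%g %g\n", round_magic(1.3f), round_magic(-2.7f));  // 1 -3
        return 0;
    }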
+ +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) 
nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. +;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) 
+;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> 
@llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 @__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x 
i8> %1) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 60% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index e70b774b..30b062c9 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. 
Is there a better approach for NEON? -define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - 
-define i64 @__max_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -509,15 +297,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone { neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) } -define internal i32 @add_i32(i32, i32) { - %r = add i32 %0, %1 +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <8 x i32> + %a16 = call <4 x i16> 
@llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i32 @__reduce_add_int16() nounwind readnone { + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 ret i32 %r } -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone -define i32 @__reduce_add_int32() nounwind readnone { - neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32) +define i64 @__reduce_add_int32() nounwind readnone { + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r } declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone @@ -617,90 +428,60 @@ define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts +;; int8/int16 -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) +declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone -gen_masked_store(i8) -gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void +define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void +declare <4 x i8> 
@llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) +define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) +define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -packed_load_and_store(4) +define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -define_prefetches() +define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..2accfe53 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,583 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int8() nounwind readnone { + %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int16() nounwind readnone { + v16tov8(i16, %0, %va, %vb) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va) + %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32) + %sum = add <2 x i64> %a64, %b64 + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int32() nounwind readnone { + v16tov4(i32, %0, %va, %vb, %vc, %vd) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc) + %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd) + %ab = add <2 x i64> %a64, %b64 + %cd = add <2 x i64> %c64, %d64 + %sum = add <2 x i64> %ab, %cd + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> 
@llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..696b0748 --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,351 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
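For reference, the __avg_up_* / __avg_down_* builtins defined just above for the 16-wide NEON target map onto NEON's rounding and truncating halving adds (vrhadd/vhadd); the SSE targets later in this patch obtain the same operations via pavg and the define_avgs() macro. The scalar C sketch below is illustration only (not part of the patch) and assumes the usual definitions: "up" adds one before halving, "down" truncates.

/* Illustrative scalar model of the avg builtins; helper names are
 * hypothetical.  The vector code uses NEON vrhadd/vhadd (and SSE2
 * pavgb/pavgw elsewhere in the patch) instead. */
#include <stdint.h>

static inline uint8_t avg_up_uint8(uint8_t a, uint8_t b) {
    /* Widen so a + b cannot overflow, then round the halved sum up:
     * (a + b + 1) >> 1  -- matches vrhaddu / pavgb. */
    return (uint8_t)(((uint16_t)a + (uint16_t)b + 1u) >> 1);
}

static inline uint8_t avg_down_uint8(uint8_t a, uint8_t b) {
    /* Truncating variant: (a + b) >> 1  -- matches vhaddu. */
    return (uint8_t)(((uint16_t)a + (uint16_t)b) >> 1);
}

static inline int16_t avg_up_int16(int16_t a, int16_t b) {
    /* Signed version, widened before the shift; an arithmetic right
     * shift is assumed, as on typical targets. */
    return (int16_t)(((int32_t)a + (int32_t)b + 1) >> 1);
}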
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index c6a3afe2..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { ret i64 %val } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 73361720..da22a66c 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -367,6 +367,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <4 x float> @__vec4_add_float(<4 x float> %v0, <4 x float> %v1) nounwind readnone alwaysinline { %v = fadd <4 x float> %v0, %v1 diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 2bb06391..a6b206b6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone 
alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll new file mode 100644 index 00000000..d7f3833d --- /dev/null +++ b/builtins/target-sse4-16.ll @@ -0,0 +1,497 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
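The __reduce_add_int8 routines added across the SSE targets all lean on psadbw: a sum of absolute differences against an all-zero vector is just the sum of the unsigned bytes in each 64-bit lane, and adding the two lane results gives the full reduction. A scalar C model of that idea (illustration only, not part of the patch):

/* Scalar picture of the psadbw-based byte reduction. */
#include <stdint.h>

static uint16_t reduce_add_int8_model(const uint8_t v[16]) {
    uint64_t lo = 0, hi = 0;
    for (int i = 0; i < 8; ++i)  lo += v[i];   /* psadbw vs. zero, low 64-bit lane  */
    for (int i = 8; i < 16; ++i) hi += v[i];   /* psadbw vs. zero, high 64-bit lane */
    return (uint16_t)(lo + hi);                /* extract both lanes, add, truncate */
}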
+ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`8') +define(`MASK',`i16') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind +alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to8(%0, 8) +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to8(%0, 9) +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to8(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + 
round2to8double(%0, 8) +} + +define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 9) +} + +define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline { + %m8 = trunc <8 x MASK> %0 to <8 x i8> + %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer, + <16 x i32> + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<8 x MASK>) nounwind 
readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %mne = icmp ne i64 %m, 0 + ret i1 %mne +} + +define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { + %r = fadd <8 x float> %0, %1 + ret <8 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { + reduce8(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<8 x float>) nounwind readnone { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<8 x float>) nounwind readnone { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) { + %r = add <8 x i32> %0, %1 + ret <8 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) { + %r = fadd <8 x double> %0, %1 + ret <8 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <8 x i64> 
@__add_varying_int64(<8 x i64>, <8 x i64>) { + %r = add <8 x i64> %0, %1 + ret <8 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x MASK> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i64>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old + store <8 x i64> %blend, <8 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i32>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old + store <8 x i32> %blend, <8 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i16>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old + store <8 x i16> %blend, <8 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i8>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old + store <8 x i8> %blend, <8 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector 
<16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { + %r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll new file mode 100644 index 00000000..fd4b74d7 --- /dev/null +++ b/builtins/target-sse4-8.ll @@ -0,0 +1,498 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`16') +define(`MASK',`i8') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. 
- v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> , %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> , %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> , %is_mul + ret <16 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind +alwaysinline { + unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to16(%0, 8) +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to16(%0, 9) +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to16(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline { +; XXXround2to4double(%0, 8) + ; FIXME: need round2to16double in util.m4... 
+ ret <16 x double> undef +} + +define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 +; XXXround2to4double(%0, 9) + ret <16 x double> undef +} + +define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 +; XXXround2to4double(%0, 10) + ret <16 x double> undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <16 x float> %call +} + +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <16 x double> %ret +} + +define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<16 x i8>) nounwind readnone 
alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %mne = icmp ne i32 %m, 0 + ret i1 %mne +} + +define i1 @__all(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { + %r = fadd <16 x float> %0, %1 + ret <16 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline { + reduce16(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<16 x float>) nounwind readnone { + reduce16(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<16 x float>) nounwind readnone { + reduce16(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) { + %r = add <16 x i32> %0, %1 + ret <16 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) { + %r = fadd <16 x double> %0, %1 + ret <16 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<16 x double>) nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<16 x double>) nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double 
@__reduce_max_double(<16 x double>) nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) { + %r = add <16 x i64> %0, %1 + ret <16 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i8> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i64>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old + store <16 x i64> %blend, <16 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i32>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old + store <16 x i32> %blend, <16 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i16>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old + store <16 x i16> %blend, <16 x i16>* %0, align 4 + ret void +} + +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x MASK> %mask) nounwind alwaysinline { + %old = load <16 x i8>* %0, align 4 + %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1, + <16 x i8> %mask) + store <16 x i8> %blend, <16 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind 
readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ccae4d51..a7faddb3 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -309,6 +309,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float) } @@ -629,3 +659,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index f622b839..e05b865f 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -299,6 +299,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone define float 
@__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { @@ -503,3 +533,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/util.m4 b/builtins/util.m4 index c19d4930..95e3844d 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,63 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector +define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. 
;; $1 : name of variable to put the final value in @@ -156,10 +213,7 @@ define(`reduce16', ` ;; the final reduction define(`reduce8by4', ` - %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> - %v2 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> + v8tov4($1, %0, %v1, %v2) %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32> @@ -266,30 +320,66 @@ define(`binary2to4', ` ;; $4: 8-wide operand value define(`unary4to8', ` - %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` - %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2) - %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3) + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) - %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> - %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> - %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b, + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = 
shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' @@ -411,6 +501,42 @@ define(`unary2to8', ` ' ) +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements @@ -432,12 +558,58 @@ define(`binary2to8', ` %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 
= call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> - %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> ' ) @@ -460,6 +632,26 @@ ret <8 x float> %ret ' ) +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> @@ -690,6 +882,75 @@ shuffles(i64, 8) ;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...) +define(`mask_converts', ` +define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> 
%0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} +') + +mask_converts(WIDTH) + define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, @@ -697,17 +958,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. - ; for the bit tricks below, we need the mask to be sign extended to be - ; the size of the element type. - ifelse( - MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', - $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', - $3,i32, ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' - ) + ; for the bit tricks below, we need the mask to have the + ; the same element size as the element type. + %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m) + ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -1551,11 +1805,6 @@ declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() -define void @__pause() nounwind readnone { - call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind - ret void -} - ; This function declares placeholder masked store functions for the ; front-end to use. ; @@ -2440,13 +2689,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to - ret %se - ', ` - ret %0') + ifelse(MASK,i32, `ret %0', + `%se = sext %0 to + ret %se') } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset @@ -2830,17 +3078,11 @@ m4exit(`1') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock +declare i64 @llvm.readcyclecounter() + define i64 @__clock() nounwind { -entry: - tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind - %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind - %asmresult = extractvalue { i32, i32 } %0, 0 - %asmresult1 = extractvalue { i32, i32 } %0, 1 - %conv = zext i32 %asmresult1 to i64 - %shl = shl nuw i64 %conv, 32 - %conv2 = zext i32 %asmresult to i64 - %or = or i64 %shl, %conv2 - ret i64 %or + %r = call i64 @llvm.readcyclecounter() + ret i64 %r } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3201,8 +3443,8 @@ return: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { - per_lane(WIDTH, %2, ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') @@ -3378,10 +3620,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - 
%mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3432,10 +3674,10 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3544,10 +3786,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr - ifelse(MASK,i32, ` - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` @@ -3722,9 +3964,9 @@ pl_done: define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3738,9 +3980,9 @@ define @__gather32_$1( %ptrs, ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3804,7 +4046,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) 
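To make the comment above concrete, here is an illustrative ispc-level sketch (a hypothetical kernel, not code generated or added by this patch): an array read through a varying index is typically lowered to one of the __gather_* builtins defined here, and the requirement that element 0 of the gathered-from array always be readable is what lets the factored-base-offsets path blend the offsets of inactive lanes to zero instead of predicating each per-lane load.

    // Hypothetical ispc kernel; the varying index makes src[index[i]] a gather.
    export void permute(uniform float src[], uniform int index[],
                        uniform float dst[], uniform int count) {
        foreach (i = 0 ... count)
            dst[i] = src[index[i]];
    }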
@@ -3813,13 +4055,13 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, @@ -3835,7 +4077,7 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3844,13 +4086,13 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, @@ -3876,27 +4118,27 @@ gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, - zeroinitializer, %vecmask) + zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, - i32 1, zeroinitializer, %vecmask) + i32 1, zeroinitializer, %vecmask) ret %v } @@ -3955,9 +4197,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3965,9 +4207,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, %offs define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
- per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3975,8 +4217,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, %offs ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -3987,8 +4229,8 @@ define void @__scatter32_$1( %ptrs, %values, ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -4044,3 +4286,109 @@ define i1 @__rdrand_i64(i64 * %ptr) { ret i1 %good } ') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define @__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() 
+define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..7d4b4cfc 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3704,6 +3704,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { case llvm::Intrinsic::sadd_with_overflow: case llvm::Intrinsic::trap: case llvm::Intrinsic::objectsize: + case llvm::Intrinsic::readcyclecounter: // We directly implement these intrinsics break; default: @@ -4056,6 +4057,9 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, return true; case llvm::Intrinsic::objectsize: return true; + case llvm::Intrinsic::readcyclecounter: + Out << "__clock()"; + return true; } } diff --git a/ctx.cpp b/ctx.cpp index 1e79c97b..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { for (unsigned int i = 0; i < at->getNumElements(); ++i) { llvm::Value *elt = ExtractInst(b, i); llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, - LLVMGetName(elt, "_to_boolvec32")); + LLVMGetName(elt, "_to_boolvec")); ret = InsertInst(ret, sext, i); } return ret; } else - return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32")); + return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec")); } @@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, // Figure out if we need a 8, 16, 32 or 64-bit masked store. llvm::Function *maskedStoreFunc = NULL; + llvm::Type *llvmValueType = value->getType(); const PointerType *pt = CastType(valueType); if (pt != NULL) { @@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) && - g->target->getMaskBitCount() == 1) { + else if (llvmValueType == LLVMTypes::Int1VectorType) { llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, "~mask"); llvm::Value *old = LoadInst(ptr); @@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, StoreInst(final, ptr); return; } - else if (Type::Equal(valueType, AtomicType::VaryingDouble)) { + else if (llvmValueType == LLVMTypes::DoubleVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt64) || - Type::Equal(valueType, AtomicType::VaryingUInt64)) { + else if (llvmValueType == LLVMTypes::Int64VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingFloat)) { + else if (llvmValueType == LLVMTypes::FloatVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) || - Type::Equal(valueType, AtomicType::VaryingInt32) || - Type::Equal(valueType, AtomicType::VaryingUInt32) || - CastType(valueType) != NULL) { + else if (llvmValueType == LLVMTypes::Int32VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt16) || - Type::Equal(valueType, AtomicType::VaryingUInt16)) { + else if (llvmValueType == LLVMTypes::Int16VectorType) { maskedStoreFunc 
= m->module->getFunction("__pseudo_masked_store_i16"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt8) || - Type::Equal(valueType, AtomicType::VaryingUInt8)) { + else if (llvmValueType == LLVMTypes::Int8VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/decl.cpp b/decl.cpp index e7b3cdef..8a10543b 100644 --- a/decl.cpp +++ b/decl.cpp @@ -69,8 +69,15 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) { if (type == NULL) return NULL; - if ((typeQualifiers & TYPEQUAL_CONST) != 0) + if ((typeQualifiers & TYPEQUAL_CONST) != 0) { type = type->GetAsConstType(); + } + + if ( ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) + && ((typeQualifiers & TYPEQUAL_VARYING) != 0) ) { + Error(pos, "Type \"%s\" cannot be qualified with both uniform and varying.", + type->GetString().c_str()); + } if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) { if (Type::Equal(type, AtomicType::Void)) @@ -84,9 +91,10 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) { else type = type->GetAsVaryingType(); } - else + else { if (Type::Equal(type, AtomicType::Void) == false) type = type->GetAsUnboundVariabilityType(); + } if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) { if ((typeQualifiers & TYPEQUAL_SIGNED) != 0) @@ -124,6 +132,17 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) { typeQualifiers = tq; soaWidth = 0; vectorSize = 0; + if (t != NULL) { + if (m->symbolTable->ContainsType(t)) { + // Typedefs might have uniform/varying qualifiers inside. + if (t->IsVaryingType()) { + typeQualifiers |= TYPEQUAL_VARYING; + } + else if (t->IsUniformType()) { + typeQualifiers |= TYPEQUAL_UNIFORM; + } + } + } } @@ -229,6 +248,7 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p) void Declarator::InitFromDeclSpecs(DeclSpecs *ds) { const Type *baseType = ds->GetBaseType(pos); + InitFromType(baseType, ds); if (type == NULL) { @@ -591,6 +611,7 @@ Declaration::Declaration(DeclSpecs *ds, Declarator *d) { } + std::vector Declaration::GetVariableDeclarations() const { Assert(declSpecs->storageClass != SC_TYPEDEF); diff --git a/docs/build.sh b/docs/build.sh index a13f3231..4f4fbfe4 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -1,14 +1,16 @@ #!/bin/bash +rst2html=rst2html.py + for i in ispc perfguide faq; do - rst2html --template=template.txt --link-stylesheet \ + $rst2html --template=template.txt --link-stylesheet \ --stylesheet-path=css/style.css $i.rst > $i.html done -rst2html --template=template-news.txt --link-stylesheet \ +$rst2html --template=template-news.txt --link-stylesheet \ --stylesheet-path=css/style.css news.rst > news.html -rst2html --template=template-perf.txt --link-stylesheet \ +$rst2html --template=template-perf.txt --link-stylesheet \ --stylesheet-path=css/style.css perf.rst > perf.html #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex diff --git a/docs/ispc.rst b/docs/ispc.rst old mode 100755 new mode 100644 index c6c63172..ff07f6d8 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -467,45 +467,100 @@ There are three options that affect the compilation target: ``--arch``, which sets the target architecture, ``--cpu``, which sets the target CPU, and ``--target``, which sets the target instruction set. -By default, the ``ispc`` compiler generates code for the 64-bit x86-64 -architecture (i.e. ``--arch=x86-64``.) 
To compile to a 32-bit x86 target, -supply ``--arch=x86`` on the command line: +If none of these options is specified, ``ispc`` generates code for the +architecture of the system the compiler is running on (i.e. 64-bit x86-64 +(``--arch=x86-64``) on x86 systems and ARM NEON on ARM systems. + +To compile to a 32-bit x86 target, for example, supply ``--arch=x86`` on +the command line: :: ispc foo.ispc -o foo.obj --arch=x86 -No other architectures are currently supported. +Currently-supported architectures are ``x86-64``, ``x86``, and ``arm``. The target CPU determines both the default instruction set used as well as which CPU architecture the code is tuned for. ``ispc --help`` provides a -list of a number of the supported CPUs. By default, the CPU type of the -system on which you're running ``ispc`` is used to determine the target -CPU. +list of all of the supported CPUs. By default, the CPU type of the system +on which you're running ``ispc`` is used to determine the target CPU. :: ispc foo.ispc -o foo.obj --cpu=corei7-avx -Finally, ``--target`` selects between the SSE2, SSE4, and AVX, and AVX2 -instruction sets. (As general context, SSE2 was first introduced in -processors that shipped in 2001, SSE4 was introduced in 2007, and -processors with AVX were introduced in 2010. AVX2 will be supported on -future CPUs based on Intel's "Haswell" architecture. Consult your CPU's -manual for specifics on which vector instruction set it supports.) +Finally, ``--target`` selects the target instruction set. The target +string is of the form ``[ISA]-i[mask size]x[gang size]``. For example, +``--target=avx2-i32x16`` specifies a target with the AVX2 instruction set, +a mask size of 32 bits, and a gang size of 16. + +The following target ISAs are supported: + +============ ========================================== +Target Description +------------ ------------------------------------------ +avx, avx1 AVX (2010-2011 era Intel CPUs) +avx1.1 AVX 1.1 (2012 era "Ivybridge" Intel CPUs) +avx2 AVX 2 target (2013- Intel "Haswell" CPUs) +neon ARM NEON +sse2 SSE2 (early 2000s era x86 CPUs) +sse4 SSE4 (generally 2008-2010 Intel CPUs) +============ ========================================== + +Consult your CPU's manual for specifics on which vector instruction set it +supports. + +The mask size may be 8, 16, or 32 bits, though not all combinations of ISAs +and mask sizes are supported. For best performance, the best general +approach is to choose a mask size equal to the size of the most common +datatype in your programs. For example, if most of your computation is on +32-bit floating-point values, an ``i32`` target is appropriate. However, +if you're mostly doing computation on 8-bit images, ``i8`` is a better choice. + +See `Basic Concepts: Program Instances and Gangs of Program Instances`_ for +more discussion of the "gang size" and its implications for program +execution. + +Running ``ispc --help`` and looking at the output for the ``--target`` +option gives the most up-to-date documentation about which targets your +compiler binary supports. 
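For instance, a kernel that works mostly on 8-bit image data is a natural fit
for an ``i8`` target such as ``sse4-i8x16``. A minimal sketch (a hypothetical
kernel, not part of this change; ``avg_up()`` is the 8-bit averaging function
documented later in this section)::

    export void average_frames(uniform unsigned int8 a[], uniform unsigned int8 b[],
                               uniform unsigned int8 out[], uniform int count) {
        // All per-lane data here is 8 bits wide, so an 8-bit mask target
        // keeps the mask and data lane widths matched.
        foreach (i = 0 ... count)
            out[i] = avg_up(a[i], b[i]);
    }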
+ +The naming scheme for compilation targets changed in August 2013; the +following table shows the relationship between names in the old scheme and +in the new scheme: + +============= =========== +Target Former Name +------------- ----------- +avx1-i32x8 avx, avx1 +avx1-i32x16 avx-x2 +avx1.1-i32x8 avx1.1 +avx1.1-i32x16 avx1.1-x2 +avx2-i32x8 avx2 +avx2-i32x16 avx2-x2 +neon-8 n/a +neon-16 n/a +neon-32 n/a +sse2-i32x4 sse2 +sse2-i32x8 sse2-x2 +sse4-i32x4 sse4 +sse4-i32x8 sse4-x2 +sse4-i8x16 n/a +sse4-i16x8 n/a +============= =========== By default, the target instruction set is chosen based on the most capable one supported by the system on which you're running ``ispc``. You can override this choice with the ``--target`` flag; for example, to select -Intel® SSE2, use ``--target=sse2``. (As with the other options in this -section, see the output of ``ispc --help`` for a full list of supported -targets.) +Intel® SSE2 with a 32-bit mask and 4 program instances in a gang, use +``--target=sse2-i32x4``. (As with the other options in this section, see +the output of ``ispc --help`` for a full list of supported targets.) Generating Generic C++ Output ----------------------------- In addition to generating object files or assembly output for specific -targets like SSE2, SSE4, and AVX, ``ispc`` provides an option to generate +targets like NEON, SSE2, SSE4, and AVX, ``ispc`` provides an option to generate "generic" C++ output. This As an example, consider the following simple ``ispc`` program: @@ -659,7 +714,7 @@ preprocessor runs: * - ISPC - 1 - Detecting that the ``ispc`` compiler is processing the file - * - ISPC_TARGET_{SSE2,SSE4,AVX,AVX2} + * - ISPC_TARGET_{NEON_8,NEON_16,NEON_32,SSE2,SSE4,AVX,AVX11,AVX2,GENERIC} - 1 - One of these will be set, depending on the compilation target. * - ISPC_POINTER_SIZE @@ -3365,6 +3420,31 @@ The ``isnan()`` functions test whether the given value is a floating-point uniform bool isnan(uniform double v) +A number of functions are also available for performing operations on 8- and +16-bit quantities; these map to specialized instructions that perform these +operations on targets that support them. ``avg_up()`` computes the average +of the two values, rounding up if their average is halfway between two +integers (i.e., it computes ``(a+b+1)/2``). + +:: + + int8 avg_up(int8 a, int8 b) + unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) + int16 avg_up(int16 a, int16 b) + unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) + + +``avg_down()`` computes the average of the two values, rounding down (i.e., +it computes ``(a+b)/2``). + +:: + + int8 avg_down(int8 a, int8 b) + unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) + int16 avg_down(int16 a, int16 b) + unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) + + Transcendental Functions ------------------------ @@ -3582,7 +3662,7 @@ command-line argument. Cross-Program Instance Operations --------------------------------- -``ispc`` programs are often used to expresses independently-executing +``ispc`` programs are often used to express independently-executing programs performing computation on separate data elements. (i.e. pure data-parallelism). However, it's often the case where it's useful for the program instances to be able to cooperate in computing results. The @@ -3613,7 +3693,7 @@ the running program instances. The ``rotate()`` function allows each program instance to find the value of the given value that their neighbor ``offset`` steps away has. 
For -example, on an 8-wide target, if ``offset`` has the value (1, 2, 3, 4, 5, +example, on an 8-wide target, if ``value`` has the value (1, 2, 3, 4, 5, 6, 7, 8) across the gang of running program instances, then ``rotate(value, -1)`` causes the first program instance to get the value 8, the second program instance to get the value 1, the third 2, and so forth. The @@ -3692,7 +3772,7 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v`` Reductions ---------- -A number routines are available to evaluate conditions across the +A number of routines are available to evaluate conditions across the running program instances. For example, ``any()`` returns ``true`` if the given value ``v`` is ``true`` for any of the SPMD program instances currently running, ``all()`` returns ``true`` if it true @@ -3711,29 +3791,44 @@ instances are added together by the ``reduce_add()`` function. :: - uniform float reduce_add(float x) - uniform int reduce_add(int x) - uniform unsigned int reduce_add(unsigned int x) + uniform int16 reduce_add(int8 x) + uniform unsigned int16 reduce_add(unsigned int8 x) + uniform int32 reduce_add(int16 x) + uniform unsigned int32 reduce_add(unsigned 16int x) + uniform int64 reduce_add(int32 x) + uniform unsigned int64 reduce_add(unsigned int32 x) + uniform int64 reduce_add(int64 x) + uniform unsigned int64 reduce_add(unsigned int64 x) -You can also use functions to compute the minimum and maximum value of the -given value across all of the currently-executing program instances. + uniform float reduce_add(float x) + uniform double reduce_add(double x) + +You can also use functions to compute the minimum value of the given value +across all of the currently-executing program instances. :: - uniform float reduce_min(float a) uniform int32 reduce_min(int32 a) uniform unsigned int32 reduce_min(unsigned int32 a) - uniform double reduce_min(double a) uniform int64 reduce_min(int64 a) uniform unsigned int64 reduce_min(unsigned int64 a) - uniform float reduce_max(float a) + uniform float reduce_min(float a) + uniform double reduce_min(double a) + +Equivalent functions are available to comptue the maximum of the given +varying variable over the active program instances. + +:: + uniform int32 reduce_max(int32 a) uniform unsigned int32 reduce_max(unsigned int32 a) - uniform double reduce_max(double a) uniform int64 reduce_max(int64 a) uniform unsigned int64 reduce_max(unsigned int64 a) + uniform float reduce_max(float a) + uniform double reduce_max(double a) + Finally, you can check to see if a particular value has the same value in all of the currently-running program instances: @@ -3741,9 +3836,10 @@ all of the currently-running program instances: uniform bool reduce_equal(int32 v) uniform bool reduce_equal(unsigned int32 v) - uniform bool reduce_equal(float v) uniform bool reduce_equal(int64 v) uniform bool reduce_equal(unsigned int64 v) + + uniform bool reduce_equal(float v) uniform bool reduce_equal(double) There are also variants of these functions that return the value as a @@ -3758,10 +3854,11 @@ performance in the `Performance Guide`_. 
uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval) uniform bool reduce_equal(unsigned int32 v, uniform unsigned int32 * uniform sameval) - uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval) uniform bool reduce_equal(unsigned int64 v, uniform unsigned int64 * uniform sameval) + + uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(double, uniform double * uniform sameval) If called when none of the program instances are running, diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp index cbe75a0b..2286316d 100644 --- a/examples/aobench/ao.cpp +++ b/examples/aobench/ao.cpp @@ -138,7 +138,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc]:\t\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPC, width, height); savePPM("ao-ispc.ppm", width, height); @@ -158,7 +158,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc + tasks]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPCTasks, width, height); savePPM("ao-ispc-tasks.ppm", width, height); @@ -176,7 +176,7 @@ int main(int argc, char **argv) } // Report more results, save another image... - printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, + printf("[aobench serial]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeSerial, width, height); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp index 17bd3f42..4f2be879 100644 --- a/examples/deferred/main.cpp +++ b/examples/deferred/main.cpp @@ -130,7 +130,7 @@ int main(int argc, char** argv) { printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); #else - printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles); + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); #endif // __cilk DeleteInputData(input); diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 828c1ab4..d81101f7 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec16_i64, 
__reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) @@ -1758,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 64b82cb1..531ed215 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -408,15 +408,15 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) { +static FORCEINLINE bool __any(__vec32_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) { - return (mask.v==0xFFFFFFFF); +static FORCEINLINE bool __all(__vec32_i1 mask) { + return (mask.v==0xFFFFFFFFul); } -static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) { +static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v==0); } @@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) @@ -1826,3 +1827,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // WIN32 + +#undef FORCEINLINE diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 7869faa5..bbeb007a 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -533,15 +533,15 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) { 
+static FORCEINLINE bool __any(__vec64_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) { - return (mask.v==0xFFFFFFFFFFFFFFFF); +static FORCEINLINE bool __all(__vec64_i1 mask) { + return (mask.v==0xFFFFFFFFFFFFFFFFull); } -static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) { +static FORCEINLINE bool __none(__vec64_i1 mask) { return (mask.v==0); } @@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >) @@ -1959,3 +1960,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif + +#undef FORCEINLINE diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index bf383c88..8baef8cb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { // reductions /////////////////////////////////////////////////////////////////////////// +static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) { + // TODO: improve this! + int16_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { + // TODO: improve this! 
+ int32_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } @@ -2105,9 +2121,24 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - - - diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 0041a6c9..a1b1fc9d 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16) + static FORCEINLINE float __reduce_add_float(__vec32_f v) { return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); } @@ -2052,7 +2055,24 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32 } */ +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d4739d61..ff00d920 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { + // TODO: improve + int16_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + +static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { + // TODO: improve + int32_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + static FORCEINLINE float __reduce_add_float(__vec4_f v) { float r = bits_as_float(_mm_extract_ps(v.v, 0)); r += bits_as_float(_mm_extract_ps(v.v, 1)); @@ -3984,6 +4000,22 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE - - diff --git a/examples/mandelbrot/mandelbrot.cpp 
b/examples/mandelbrot/mandelbrot.cpp index 7e73768f..d2bebb96 100644 --- a/examples/mandelbrot/mandelbrot.cpp +++ b/examples/mandelbrot/mandelbrot.cpp @@ -109,7 +109,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp index dae22736..698daf0f 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp index 58552ce3..123f98c7 100644 --- a/examples/noise/noise.cpp +++ b/examples/noise/noise.cpp @@ -106,7 +106,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[noise serial]:\t\t\t[%.3f] millon cycles\n", minSerial); + printf("[noise serial]:\t\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "noise-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/perf.py b/examples/perf.py index f96ef9ec..4b661b39 100755 --- a/examples/perf.py +++ b/examples/perf.py @@ -10,12 +10,22 @@ import glob import string import platform +def print_debug(line): + if options.silent == False: + sys.stdout.write(line) + +def print_file(line): + if options.output != "": + output = open(options.output, 'w') + output.writelines(line) + output.close() + def build_test(): global build_log global is_windows if is_windows == False: os.system("make clean >> "+build_log) - return os.system("make >> "+build_log+" 2>> "+build_log) + return os.system("make CXX="+ref_compiler+" CC="+refc_compiler+" >> "+build_log+" 2>> "+build_log) else: os.system("msbuild /t:clean >> " + build_log) return os.system("msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log) @@ -30,7 +40,7 @@ def execute_test(command): return r #gathers all tests results and made an item test from answer structure -def run_test(command, c1, c2, test): +def run_test(command, c1, c2, test, b_serial): global perf_temp if build_test() != 0: sys.stdout.write("ERROR: Compilation fails\n") @@ -40,11 +50,13 @@ def run_test(command, c1, c2, test): return tasks = [] #list of results with tasks, it will be test[2] ispc = [] #list of results without tasks, it will be test[1] + absolute_tasks = [] #list of absolute results with tasks, it will be test[4] + absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] + serial = [] #list serial times, it will be test[5] j = 1 for line in open(perf_temp): # we take test output if "speedup" in line: # we are interested only in lines with speedup if j == c1: # we are interested only in lines with c1 numbers - sys.stdout.write(line) line = line.expandtabs(0) line = line.replace("("," ") line = line.split(",") @@ -57,9 +69,42 @@ def run_test(command, c1, c2, test): ispc.append(number) c1 = c1 + c2 j+=1 + if "million cycles" in line: + if j == c1: 
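
Note: perf.py below starts parsing the "million cycles" lines that the example programs print. For reference, a minimal C++ sketch of how such a number is produced with an rdtsc-based clock like the __clock() added above (GCC-style inline asm, x86 only, and without the cpuid serialization the patch uses; the kernel callback is purely illustrative):

    #include <cstdint>
    #include <cstdio>

    static inline uint64_t rdtsc() {
        uint32_t lo, hi;
        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
        return ((uint64_t)hi << 32) | lo;
    }

    static void report(const char *name, void (*kernel)()) {
        uint64_t start = rdtsc();
        kernel();
        uint64_t end = rdtsc();
        // The examples report elapsed cycles scaled down by 1e6.
        printf("[%s]:\t\t[%.3f] million cycles\n", name, (end - start) / 1e6);
    }
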
+ line = line.replace("]","[") + line = line.split("[") + number = float(line[3]) + if "tasks" in line[1]: + absolute_tasks.append(number) + else: + if "ispc" in line[1]: + absolute_ispc.append(number) + if "serial" in line[1]: + serial.append(number) + + if len(ispc) != 0: + if len(tasks) != 0: + print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n") + for i in range(0,len(serial)): + print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % + (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i])) + else: + print_debug("ISPC speedup / ISPC time / serial time\n") + for i in range(0,len(serial)): + print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i])) + else: + if len(tasks) != 0: + print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n") + for i in range(0,len(serial)): + print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i])) + test[1] = test[1] + ispc test[2] = test[2] + tasks - + test[3] = test[3] + absolute_ispc + test[4] = test[4] + absolute_tasks + if b_serial == True: + #if we concatenate outputs we should use only the first serial answer. + test[5] = test[5] + serial def cpu_get(): p = open("/proc/stat", 'r') @@ -113,30 +158,57 @@ def geomean(par): #test[0] - name of test #test[1] - list of results without tasks #test[2] - list of results with tasks -#test[1] or test[2] may be empty +#test[3] - list of absolute results without tasks +#test[4] - list of absolute results with tasks +#test[5] - list of absolute time without ISPC (serial) +#test[1..4] may be empty def print_answer(answer): - sys.stdout.write("Name of test:\t\tISPC:\tISPC + tasks:\n") - max_t = [0,0] - diff_t = [0,0] - geomean_t = [0,0] - list_of_max = [[],[]] + filelist = [] + print_debug("--------------------------------------------------------------------------\n") + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + + "ISPC time: ISPC + tasks time: serial:\n") + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + max_t = [0,0,0,0,0] + diff_t = [0,0,0,0,0] + geomean_t = [0,0,0,0,0] + list_of_max = [[],[],[],[],[]] for i in range(len(answer)): - for t in range(1,3): + for t in range(1,6): if len(answer[i][t]) == 0: max_t[t-1] = "n/a" diff_t[t-1] = "n/a" else: - list_of_max[t-1].append(max(answer[i][t])) - max_t[t-1] = str(max(answer[i][t])) - diff_t[t-1] = str(max(answer[i][t]) - min(answer[i][t])) - sys.stdout.write("%s:\n" % answer[i][0]) - sys.stdout.write("\t\tmax:\t%s\t%s\n" % (max_t[0], max_t[1])) - sys.stdout.write("\t\tdiff:\t%s\t%s\n" % (diff_t[0], diff_t[1])) + if t < 3: + mm = max(answer[i][t]) + else: + mm = min(answer[i][t]) + max_t[t-1] = '%.2f' % mm + list_of_max[t-1].append(mm) + diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) + print_debug("%s:\n" % answer[i][0]) + print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4])) + print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4])) + for t in range(0,5): + if max_t[t] == "n/a": + max_t[t] = "" + if diff_t[t] == "n/a": + diff_t[t] = "" + filelist.append(answer[i][0] + "," + + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + + max_t[4] + "," + diff_t[4] + "\n") + for i in range(0,5): + geomean_t[i] = geomean(list_of_max[i]) + 
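
Note: print_answer reduces each column with a geometric mean, which is the appropriate average for speedup ratios. A small C++ restatement of that reduction (the function and container names are illustrative, not taken from the script):

    #include <cmath>
    #include <vector>

    // Geometric mean: exp of the mean of logs; returns 0 for an empty list.
    static double geomean(const std::vector<double> &xs) {
        if (xs.empty())
            return 0.0;
        double logSum = 0.0;
        for (size_t i = 0; i < xs.size(); ++i)
            logSum += std::log(xs[i]);
        return std::exp(logSum / xs.size());
    }
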
print_debug("---------------------------------------------------------------------------------\n") + print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4])) + filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") + print_file(filelist) - geomean_t[0] = geomean(list_of_max[0]) - geomean_t[1] = geomean(list_of_max[1]) - sys.stdout.write("---------------------------------------------\n") - sys.stdout.write("Geomean:\t\t%s\t%s\n" % (geomean_t[0], geomean_t[1])) ###Main### # parsing options @@ -147,6 +219,12 @@ parser.add_option('-c', '--config', dest='config', help='config file of tests', default="./perf.ini") parser.add_option('-p', '--path', dest='path', help='path to examples directory', default="./") +parser.add_option('-s', '--silent', dest='silent', + help='silent mode, only table output', default=False, action="store_true") +parser.add_option('-o', '--output', dest='output', + help='output file for script reading', default="") +parser.add_option('--compiler', dest='compiler', + help='reference compiler', default="") (options, args) = parser.parse_args() global is_windows @@ -174,6 +252,14 @@ ref_compiler_exists = False if is_windows == False: compiler = "ispc" ref_compiler = "g++" + refc_compiler = "gcc" + if options.compiler != "": + if options.compiler == "clang" or options.compiler == "clang++": + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler == "icc" or options.compiler == "icpc": + ref_compiler = "icpc" + refc_compiler = "icc" else: compiler = "ispc.exe" ref_compiler = "cl.exe" @@ -222,12 +308,27 @@ perf_temp = pwd + "perf_temp" i = 0 answer = [] -sys.stdout.write("Okey go go go!\n\n") +print_debug("Okey go go go!\n\n") +os.system(compiler + " --version >" + build_log) +version = open(build_log) +print_debug("Using test compiler: " + version.readline()) +version.close() + +if is_windows == False: + os.system(ref_compiler + " --version >" + build_log) +else: + os.system(ref_compiler + " 2>" + build_log + " 1>&2") + +version = open(build_log) +print_debug("Using reference compiler: " + version.readline()) +version.close() + + # loop for all tests while i < length-2: # we read name of test - sys.stdout.write("%s" % lines[i]) - test = [lines[i][:-1],[],[]] + print_debug("%s" % lines[i]) + test = [lines[i][:-1],[],[],[],[],[]] # read location of test folder = lines[i+1] folder = folder[:-1] @@ -257,10 +358,10 @@ while i < length-2: c2 = 1 next_line = lines[i+3] if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(command, c1, c2, answer[len(answer)-1]) + run_test(command, c1, c2, answer[len(answer)-1], False) i = i+1 else: #we run this test and append it's result to answer structure - run_test(command, c1, c2, test) + run_test(command, c1, c2, test, True) answer.append(test) # preparing next loop iteration os.chdir(pwd) diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 9d5b3ee6..593d901f 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -130,7 +130,7 @@ int main() { minTimeSerial = std::min(minTimeSerial, dt); } - printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial); + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial 
/ minTimeISPCTasks); diff --git a/examples/volume_rendering/volume.cpp b/examples/volume_rendering/volume.cpp index 7d8b8e99..458cd407 100644 --- a/examples/volume_rendering/volume.cpp +++ b/examples/volume_rendering/volume.cpp @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[volume serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(image, width, height, "volume-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", diff --git a/expr.cpp b/expr.cpp index fc3d295a..614cb5e5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -1911,6 +1911,40 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1, } +/* Returns true if shifting right by the given amount will lead to + inefficient code. (Assumes x86 target. May also warn inaccurately if + later optimization simplify the shift amount more than we are able to + see at this point.) */ +static bool +lIsDifficultShiftAmount(Expr *expr) { + // Uniform shifts (of uniform values) are no problem. + if (expr->GetType()->IsVaryingType() == false) + return false; + + ConstExpr *ce = dynamic_cast(expr); + if (ce) { + // If the shift is by a constant amount, *and* it's the same amount + // in all vector lanes, we're in good shape. + uint32_t amount[ISPC_MAX_NVEC]; + int count = ce->GetValues(amount); + for (int i = 1; i < count; ++i) + if (amount[i] != amount[0]) + return true; + return false; + } + + TypeCastExpr *tce = dynamic_cast(expr); + if (tce && tce->expr) { + // Finally, if the shift amount is given by a uniform value that's + // been smeared out into a varying, we have the same shift for all + // lanes and are also in good shape. + return (tce->expr->GetType()->IsUniformType() == false); + } + + return true; +} + + llvm::Value * BinaryExpr::GetValue(FunctionEmitContext *ctx) const { if (!arg0 || !arg1) { @@ -1951,9 +1985,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const { case BitAnd: case BitXor: case BitOr: { - if (op == Shr && arg1->GetType()->IsVaryingType() && - dynamic_cast(arg1) == NULL) - PerformanceWarning(pos, "Shift right is extremely inefficient for " + if (op == Shr && lIsDifficultShiftAmount(arg1)) + PerformanceWarning(pos, "Shift right is inefficient for " "varying shift amounts."); return lEmitBinaryBitOp(op, value0, value1, arg0->GetType()->IsUnsignedType(), ctx); @@ -2207,6 +2240,49 @@ lConstFoldBinaryIntOp(ConstExpr *constArg0, ConstExpr *constArg1, } +/* Returns true if the given arguments (which are assumed to be the + operands of a divide) represent a divide that can be performed by one of + the __fast_idiv functions. + */ +static bool +lCanImproveVectorDivide(Expr *arg0, Expr *arg1, int *divisor) { + const Type *type = arg0->GetType(); + if (!type) + return false; + + // The value being divided must be an int8/16/32. + if (!(Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt32) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt32))) + return false; + + // The divisor must be the same compile-time constant value for all of + // the vector lanes. 
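
Note: the point of routing these divides to __fast_idiv is that a divisor which is the same compile-time constant in every lane can be turned into a multiply and a shift. A scalar sketch of the underlying trick for the unsigned 8-bit case (this is the general fixed-point-reciprocal idea, not necessarily the exact formulation the stdlib's __fast_idiv uses; the divisor bound checked below is what keeps the 16-bit reciprocal exact, and the tighter < 128 bound for signed 8-bit presumably accounts for the sign bit):

    #include <cassert>
    #include <cstdint>

    // Divide an unsigned 8-bit value by a constant d (2 <= d <= 255) using a
    // 16-bit fixed-point reciprocal m = floor(2^16 / d) + 1, so that
    // q = floor(x * m / 2^16). The overshoot is x * (m*d - 2^16) / (d * 2^16),
    // and x * (m*d - 2^16) <= 255 * 255 < 2^16, so the result is exact for
    // every x < 256.
    static uint8_t fast_div_u8(uint8_t x, uint32_t d) {
        assert(d >= 2 && d <= 255);
        uint32_t m = (1u << 16) / d + 1;
        return (uint8_t)((x * m) >> 16);
    }
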
+ ConstExpr *ce = dynamic_cast(arg1); + if (!ce) + return false; + int64_t div[ISPC_MAX_NVEC]; + int count = ce->GetValues(div); + for (int i = 1; i < count; ++i) + if (div[i] != div[0]) + return false; + *divisor = div[0]; + + // And finally, the divisor must be >= 2 and <128 (for 8-bit divides), + // and <256 otherwise. + if (*divisor < 2) + return false; + if (Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8)) + return *divisor < 128; + else + return *divisor < 256; +} + + Expr * BinaryExpr::Optimize() { if (arg0 == NULL || arg1 == NULL) @@ -2269,6 +2345,32 @@ BinaryExpr::Optimize() { } } + int divisor; + if (op == Div && lCanImproveVectorDivide(arg0, arg1, &divisor)) { + Debug(pos, "Improving vector divide by constant %d", divisor); + + std::vector idivFuns; + m->symbolTable->LookupFunction("__fast_idiv", &idivFuns); + if (idivFuns.size() == 0) { + Warning(pos, "Couldn't find __fast_idiv to optimize integer divide. " + "Are you compiling with --nostdlib?"); + return this; + } + + Expr *idivSymExpr = new FunctionSymbolExpr("__fast_idiv", idivFuns, pos); + ExprList *args = new ExprList(arg0, pos); + args->exprs.push_back(new ConstExpr(AtomicType::UniformInt32, divisor, arg1->pos)); + Expr *idivCall = new FunctionCallExpr(idivSymExpr, args, pos); + + idivCall = ::TypeCheck(idivCall); + if (idivCall == NULL) + return NULL; + + Assert(Type::EqualIgnoringConst(GetType(), idivCall->GetType())); + idivCall = new TypeCastExpr(GetType(), idivCall, pos); + return ::Optimize(idivCall); + } + // From here on out, we're just doing constant folding, so if both args // aren't constants then we're done... if (constArg0 == NULL || constArg1 == NULL) @@ -3021,6 +3123,14 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { +#if 0 // !defined(LLVM_3_1) + // Though it should be equivalent, this seems to cause non-trivial + // performance regressions versus the below. This may be related to + // http://llvm.org/bugs/show_bug.cgi?id=16941. + if (test->getType() != LLVMTypes::Int1VectorType) + test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); + return ctx->SelectInst(test, expr1, expr2, "select"); +#else llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); @@ -3029,6 +3139,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, PointerType::GetUniform(type)->LLVMType(g->ctx)); ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type)); return ctx->LoadInst(resultPtr, "selectexpr_final"); +#endif // !LLVM_3_1 } @@ -6059,9 +6170,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // If we have a bool vector of i32 elements, first truncate - // down to a single bit + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // If we have a bool vector of non-i1 elements, first + // truncate down to a single bit. 
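
Note: for the non-i1 mask layouts introduced in this patch (i8/i16/i32 per lane, all-ones for true), converting a varying bool to a numeric type has to squeeze each lane back down to one bit before the unsigned cast; otherwise a true lane (0xff...) would convert to 255 rather than 1. A rough IRBuilder-based sketch of the same trunc + uitofp sequence (the real code goes through ispc's FunctionEmitContext, and the include path plus the two-argument VectorType::get are the LLVM 3.x spellings assumed elsewhere in the patch):

    #include "llvm/IR/IRBuilder.h"

    // mask: <N x iK> with K in {8,16,32}, lanes all-ones or all-zeros.
    // Result: <N x float> holding 1.0f or 0.0f per lane.
    static llvm::Value *lBoolVecToFloat(llvm::IRBuilder<> &b, llvm::Value *mask,
                                        unsigned width) {
        llvm::Type *i1Vec  = llvm::VectorType::get(b.getInt1Ty(), width);
        llvm::Type *f32Vec = llvm::VectorType::get(b.getFloatTy(), width);
        llvm::Value *bits = mask;
        if (mask->getType() != i1Vec)
            bits = b.CreateTrunc(mask, i1Vec, "mask_to_i1"); // keep only the low bit
        return b.CreateUIToFP(bits, f32Vec, "bool_to_float");
    }
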
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); // And then do an unisgned int->float cast cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int @@ -6103,8 +6214,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // truncate i32 bool vector values to i1s + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // truncate bool vector values to i1s exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double exprVal, targetType, cOpName); @@ -6141,7 +6252,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6177,7 +6288,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6219,7 +6330,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6259,7 +6370,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6305,7 +6416,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6345,7 +6456,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6391,7 +6502,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != 
LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6429,7 +6540,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6523,12 +6634,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, if (fromType->IsUniformType()) { if (toType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) { - // extend out to i32 bool values from i1 here. then we'll - // turn into a vector below, the way it does for everyone - // else... + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) { + // extend out to an bool as an i8/i16/i32 from the i1 here. + // Then we'll turn that into a vector below, the way it + // does for everyone else... cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(), - LLVMGetName(cast, "to_i32bool")); + LLVMGetName(cast, "to_i_bool")); } } else diff --git a/ispc.cpp b/ispc.cpp index 7743d6b2..6d4b063d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon"; + return "neon-i32x4"; #else int info[4]; __cpuid(info, 1); @@ -121,19 +121,19 @@ lGetSystemISA() { int info2[4]; __cpuidex(info2, 7, 0); if ((info2[1] & (1 << 5)) != 0) - return "avx2"; + return "avx2-i32x8"; else - return "avx1.1"; + return "avx1.1-i32x8"; } // Regular AVX - return "avx"; + return "avx-i32x8"; } else if ((info[2] & (1 << 19)) != 0) - return "sse4"; + return "sse4-i32x4"; else if ((info[3] & (1 << 26)) != 0) - return "sse2"; + return "sse2-i32x4"; else { - fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n"); + Error(SourcePos(), "Unable to detect supported SSE/AVX ISA. Exiting."); exit(1); } #endif @@ -186,22 +186,22 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // If a CPU was specified explicitly, try to pick the best // possible ISA based on that. if (!strcmp(cpu, "core-avx2")) - isa = "avx2"; + isa = "avx2-i32x8"; #ifdef ISPC_ARM_ENABLED else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) - isa = "neon"; + isa = "neon-i32x4"; #endif else if (!strcmp(cpu, "core-avx-i")) - isa = "avx1.1"; + isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx"; + isa = "avx-i32x8"; else if (!strcmp(cpu, "corei7") || !strcmp(cpu, "penryn")) - isa = "sse4"; + isa = "sse4-i32x4"; else - isa = "sse2"; + isa = "sse2-i32x4"; Warning(SourcePos(), "No --target specified on command-line. " "Using ISA \"%s\" based on specified CPU \"%s\".", isa, cpu); @@ -211,12 +211,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // supports. isa = lGetSystemISA(); Warning(SourcePos(), "No --target specified on command-line. 
" - "Using system ISA \"%s\".", isa); + "Using default system target \"%s\".", isa); } } #if defined(ISPC_ARM_ENABLED) && !defined(__arm__) - if (cpu == NULL && !strcmp(isa, "neon")) + if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... cpu = "cortex-a9"; @@ -241,8 +241,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } } if (foundCPU == false) { - fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: " - "%s.\n", cpu, SupportedTargetCPUs().c_str()); + Error(SourcePos(), "Error: CPU type \"%s\" unknown. Supported CPUs: " + "%s.", cpu, SupportedCPUs().c_str()); return; } } @@ -251,7 +251,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : if (arch == NULL) { #ifdef ISPC_ARM_ENABLED - if (!strcmp(isa, "neon")) + if (!strncmp(isa, "neon", 4)) arch = "arm"; else #endif @@ -283,40 +283,98 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // Check default LLVM generated targets - if (!strcasecmp(isa, "sse2")) { + if (!strcasecmp(isa, "sse2") || + !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" +#if defined(LLVM_3_4) + ",-sse4.1,-sse4.2" +#else + ",-sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse2-x2")) { + else if (!strcasecmp(isa, "sse2-x2") || + !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" +#if defined(LLVM_3_4) + ",-sse4.1,-sse4.2" +#else + ",-sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4")) { + else if (!strcasecmp(isa, "sse4") || + !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? 
- this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { + else if (!strcasecmp(isa, "sse4x2") || + !strcasecmp(isa, "sse4-x2") || + !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "generic-4")) { + else if (!strcasecmp(isa, "sse4-i8x16")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "sse4-i16x8")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "generic-4") || + !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -326,7 +384,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-8")) { + else if (!strcasecmp(isa, "generic-8") || + !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -336,7 +395,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-16")) { + else if (!strcasecmp(isa, "generic-16") || + !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -346,7 +406,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-32")) { + else if (!strcasecmp(isa, "generic-32") || + !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; this->m_vectorWidth = 32; @@ -356,7 +417,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-64")) { + else if (!strcasecmp(isa, "generic-64") || + !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; this->m_vectorWidth = 64; @@ -366,14 +428,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } 
- else if (!strcasecmp(isa, "generic-1")) { + else if (!strcasecmp(isa, "generic-1") || + !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) { + else if (!strcasecmp(isa, "avx") || + !strcasecmp(isa, "avx1") || + !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -381,7 +446,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) { + else if (!strcasecmp(isa, "avx-x2") || + !strcasecmp(isa, "avx1-x2") || + !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -389,11 +456,18 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx1.1")) { + else if (!strcasecmp(isa, "avx1.1") || + !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -402,11 +476,18 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx1.1-x2")) { + else if (!strcasecmp(isa, "avx1.1-x2") || + !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -415,11 +496,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx2")) { + else if (!strcasecmp(isa, "avx2") || + !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 @@ -433,11 +520,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "avx2-x2")) { + else if (!strcasecmp(isa, "avx2-x2") || + !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 @@ -452,8 +545,27 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #endif } #ifdef ISPC_ARM_ENABLED - else if (!strcasecmp(isa, "neon")) { - this->m_isa = Target::NEON; + else if (!strcasecmp(isa, "neon-i8x16")) { + this->m_isa = 
Target::NEON8; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "neon-i16x8")) { + this->m_isa = Target::NEON16; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "neon") || + !strcasecmp(isa, "neon-i32x4")) { + this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -463,8 +575,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } #endif else { - fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n", - isa, SupportedTargetISAs()); + Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.", + isa, SupportedTargets()); error = true; } @@ -477,7 +589,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string featuresString = m_attributes; llvm::TargetOptions options; #ifdef ISPC_ARM_ENABLED - if (m_isa == Target::NEON) + if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || + m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; #endif #if !defined(LLVM_3_1) @@ -557,7 +670,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string -Target::SupportedTargetCPUs() { +Target::SupportedCPUs() { std::string ret; int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]); for (int i = 0; i < count; ++i) { @@ -570,7 +683,7 @@ Target::SupportedTargetCPUs() { const char * -Target::SupportedTargetArchs() { +Target::SupportedArchs() { return #ifdef ISPC_ARM_ENABLED "arm, " @@ -580,14 +693,18 @@ Target::SupportedTargetArchs() { const char * -Target::SupportedTargetISAs() { +Target::SupportedTargets() { return #ifdef ISPC_ARM_ENABLED - "neon, " + "neon-i8x16, neon-16x8, neon-32x4, " #endif - "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" - ", avx1.1, avx1.1-x2, avx2, avx2-x2" - ", generic-1, generic-4, generic-8, generic-16, generic-32"; + "sse2-i32x4, sse2-i32x8, " + "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " + "avx1-i32x8, avx1-i32x16, " + "avx1.1-i32x8, avx1.1-i32x16, " + "avx2-i32x8, avx2-i32x16, " + "generic-x1, generic-x4, generic-x8, generic-x16, " + "generic-x32, generic-x64"; } @@ -624,9 +741,13 @@ const char * Target::ISAToString(ISA isa) { switch (isa) { #ifdef ISPC_ARM_ENABLED - case Target::NEON: + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; #endif - return "neon"; case Target::SSE2: return "sse2"; case Target::SSE4: diff --git a/ispc.h b/ispc.h index bb9e2b31..4804832f 100644 --- a/ispc.h +++ b/ispc.h @@ -181,9 +181,10 @@ public: added or the enumerant values are reordered. */ enum ISA { #ifdef ISPC_ARM_ENABLED - NEON, + NEON32, NEON16, NEON8, #endif - SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. Returns true if the @@ -191,16 +192,16 @@ public: Target(const char *arch, const char *cpu, const char *isa, bool pic); /** Returns a comma-delimited string giving the names of the currently - supported target ISAs. */ - static const char *SupportedTargetISAs(); + supported compilation targets. 
*/ + static const char *SupportedTargets(); /** Returns a comma-delimited string giving the names of the currently - supported target CPUs. */ - static std::string SupportedTargetCPUs(); + supported CPUs. */ + static std::string SupportedCPUs(); /** Returns a comma-delimited string giving the names of the currently - supported target architectures. */ - static const char *SupportedTargetArchs(); + supported architectures. */ + static const char *SupportedArchs(); /** Returns a triple string specifying the target architecture, vendor, and environment. */ diff --git a/ispc.vcxproj b/ispc.vcxproj index 36fbad5d..b4a8b764 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -51,10 +51,16 @@ + + + + - - + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -97,11 +103,13 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > $(Configuration)/gen-stdlib-x86.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > $(Configuration)/gen-stdlib-generic.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; - $(Configuration)/gen-stdlib-generic.cpp;$(Configuration)/gen-stdlib-x86.cpp - Building gen-stdlib-{generic,x86}.cpp + $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp + Building gen-stdlib-{mask1,8,16,32}.cpp @@ -131,6 +139,42 @@ Building gen-bitcode-sse4-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + 
builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-64bit.cpp + + Document diff --git a/lex.ll b/lex.ll index f6633fce..8baa627a 100644 --- a/lex.ll +++ b/lex.ll @@ -77,6 +77,8 @@ static int allTokens[] = { TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, TOKEN_FLOAT_CONSTANT, + TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, + TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, @@ -150,6 +152,10 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; + tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; + tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; + tokenToName[TOKEN_UINT16_CONSTANT] = "TOKEN_UINT16_CONSTANT"; tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; @@ -260,6 +266,10 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; + tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; + tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; + tokenNameRemap["TOKEN_UINT16_CONSTANT"] = "unsigned int16 constant"; tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; @@ -599,7 +609,22 @@ lParseInteger(bool dotdotdot) { } else { // No u or l suffix - // First, see if we can fit this into a 32-bit integer... + // If we're compiling to an 8-bit mask target and the constant + // fits into 8 bits, return an 8-bit int. + if (g->target->getMaskBitCount() == 8) { + if (yylval.intVal <= 0x7fULL) + return TOKEN_INT8_CONSTANT; + else if (yylval.intVal <= 0xffULL) + return TOKEN_UINT8_CONSTANT; + } + // And similarly for 16-bit masks and constants + if (g->target->getMaskBitCount() == 16) { + if (yylval.intVal <= 0x7fffULL) + return TOKEN_INT16_CONSTANT; + else if (yylval.intVal <= 0xffffULL) + return TOKEN_UINT16_CONSTANT; + } + // Otherwise, see if we can fit this into a 32-bit integer... 
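
Note: the lexer change below sizes undecorated integer literals to the mask width of the target, so that e.g. comparing a varying int8 against 10 on an 8-bit-mask target stays an 8-bit operation. A compact C++ restatement of that classification (token names collapsed into an enum; the thresholds are the ones used in lParseInteger):

    #include <cstdint>

    enum IntConstKind { kInt8, kUInt8, kInt16, kUInt16, kInt32, kUInt32, kInt64 };

    static IntConstKind lClassifyIntConstant(uint64_t v, int maskBits) {
        if (maskBits == 8) {
            if (v <= 0x7full)   return kInt8;
            if (v <= 0xffull)   return kUInt8;
        }
        if (maskBits == 16) {
            if (v <= 0x7fffull) return kInt16;
            if (v <= 0xffffull) return kUInt16;
        }
        if (v <= 0x7fffffffull) return kInt32;
        if (v <= 0xffffffffull) return kUInt32;
        return kInt64;          // (the real lexer also distinguishes uint64)
    }
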
if (yylval.intVal <= 0x7fffffffULL) return TOKEN_INT32_CONSTANT; else if (yylval.intVal <= 0xffffffffULL) diff --git a/llvm_patches/r183327-AVX2-GATHER.patch b/llvm_patches/r183327-AVX2-GATHER.patch old mode 100755 new mode 100644 diff --git a/llvmutil.cpp b/llvmutil.cpp index 26c18bf5..180c8676 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -115,13 +115,25 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth()); - else { - Assert(target.getMaskBitCount() == 32); + break; + case 8: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth()); + break; + case 16: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth()); + break; + case 32: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); + break; + default: + FATAL("Unhandled mask width for initializing MaskType"); } LLVMTypes::Int1VectorType = @@ -154,12 +166,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, false /*unsigned*/); // 0x1 - else + break; + case 8: + onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1, + true /*signed*/); // 0xff + break; + case 16: + onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1, + true /*signed*/); // 0xffff + break; + case 32: onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff + break; + default: + FATAL("Unhandled mask width for onMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskOnes.push_back(onMask); @@ -167,13 +193,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, true /*signed*/); - else + break; + case 8: + offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0, + true /*signed*/); + break; + case 16: + offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0, + true /*signed*/); + break; + case 32: offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); - + break; + default: + FATAL("Unhandled mask width for offMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskZeros.push_back(offMask); LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros); @@ -444,9 +483,14 @@ LLVMBoolVector(bool b) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 
0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = b ? LLVMTrue : LLVMFalse; } @@ -465,9 +509,14 @@ LLVMBoolVector(const bool *bvec) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = bvec[i] ? LLVMTrue : LLVMFalse; } diff --git a/main.cpp b/main.cpp index c6786c39..21a47de8 100644 --- a/main.cpp +++ b/main.cpp @@ -85,13 +85,16 @@ usage(int ret) { printf(" \t\taddressing calculations are done by default, even\n"); printf(" \t\ton 64-bit target architectures.)\n"); printf(" [--arch={%s}]\t\tSelect target architecture\n", - Target::SupportedTargetArchs()); + Target::SupportedArchs()); printf(" [--c++-include-file=]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n"); #ifndef ISPC_IS_WINDOWS printf(" [--colored-output]\t\tAlways use terminal colors in error/warning messages.\n"); #endif - printf(" [--cpu=]\t\t\tSelect target CPU type\n"); - printf(" ={%s}\n", Target::SupportedTargetCPUs().c_str()); + printf(" "); + char cpuHelp[2048]; + sprintf(cpuHelp, "[--cpu=]\t\t\tSelect target CPU type\n={%s}\n", + Target::SupportedCPUs().c_str()); + PrintWithWordBreaks(cpuHelp, 16, TerminalWidth(), stdout); printf(" [-D]\t\t\t\t#define given value when running preprocessor\n"); printf(" [--dev-stub ]\t\tEmit device-side offload stub functions to file\n"); printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); @@ -127,7 +130,11 @@ usage(int ret) { printf(" [--pic]\t\t\t\tGenerate position-independent code\n"); #endif // !ISPC_IS_WINDOWS printf(" [--quiet]\t\t\t\tSuppress all output\n"); - printf(" [--target=]\t\t\tSelect target ISA. 
={%s}\n", Target::SupportedTargetISAs()); + printf(" "); + char targetHelp[2048]; + sprintf(targetHelp, "[--target=]\t\t\tSelect target ISA and width.\n" + "={%s}", Target::SupportedTargets()); + PrintWithWordBreaks(targetHelp, 24, TerminalWidth(), stdout); printf(" [--version]\t\t\t\tPrint ispc version\n"); printf(" [--werror]\t\t\t\tTreat warnings as errors\n"); printf(" [--woff]\t\t\t\tDisable warnings\n"); @@ -322,7 +329,6 @@ int main(int Argc, char *Argv[]) { // as we're parsing below g = new Globals; - bool debugSet = false, optSet = false; Module::OutputType ot = Module::Object; bool generatePIC = false; const char *arch = NULL, *cpu = NULL, *target = NULL; @@ -365,7 +371,6 @@ int main(int Argc, char *Argv[]) { g->emitInstrumentation = true; else if (!strcmp(argv[i], "-g")) { g->generateDebuggingSymbols = true; - debugSet = true; } else if (!strcmp(argv[i], "--emit-asm")) ot = Module::Asm; @@ -492,12 +497,10 @@ int main(int Argc, char *Argv[]) { } else if (!strcmp(argv[i], "-O0")) { g->opt.level = 0; - optSet = true; } else if (!strcmp(argv[i], "-O") || !strcmp(argv[i], "-O1") || !strcmp(argv[i], "-O2") || !strcmp(argv[i], "-O3")) { g->opt.level = 1; - optSet = true; } else if (!strcmp(argv[i], "-")) ; @@ -571,12 +574,6 @@ int main(int Argc, char *Argv[]) { } } - // If the user specified -g, then the default optimization level is 0. - // If -g wasn't specified, the default optimization level is 1 (full - // optimization). - if (debugSet && !optSet) - g->opt.level = 0; - if (g->enableFuzzTest) { if (g->fuzzTestSeed == -1) { #ifdef ISPC_IS_WINDOWS diff --git a/module.cpp b/module.cpp index 562a7f5c..3f197c1b 100644 --- a/module.cpp +++ b/module.cpp @@ -1869,6 +1869,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre char *p = targetMacro; while (*p) { *p = toupper(*p); + if (*p == '-') *p = '_'; ++p; } opts.addMacroDef(targetMacro); diff --git a/opt.cpp b/opt.cpp index 4602da43..75eae20c 100644 --- a/opt.cpp +++ b/opt.cpp @@ -88,6 +88,7 @@ #include #include #include +#include #if defined(LLVM_3_1) #include #else @@ -111,7 +112,8 @@ #endif static llvm::Pass *CreateIntrinsicsOptPass(); -static llvm::Pass *CreateVSelMovmskOptPass(); +static llvm::Pass *CreateInstructionSimplifyPass(); +static llvm::Pass *CreatePeepholePass(); static llvm::Pass *CreateImproveMemoryOpsPass(); static llvm::Pass *CreateGatherCoalescePass(); @@ -516,6 +518,9 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createDeadInstEliminationPass()); optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createAggressiveDCEPass()); + if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { optPM.add(llvm::createInstructionCombiningPass(), 210); @@ -523,7 +528,7 @@ Optimize(llvm::Module *module, int optLevel) { } if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass(), 215); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } optPM.add(llvm::createDeadInstEliminationPass(), 220); @@ -557,6 +562,7 @@ Optimize(llvm::Module *module, int optLevel) { // InstructionCombiningPass. See r184459 for details. 
optPM.add(llvm::createSimplifyLibCallsPass(), 240); #endif + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass(), 241); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCFGSimplificationPass()); @@ -566,7 +572,7 @@ Optimize(llvm::Module *module, int optLevel) { if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass(), 250); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } if (g->opt.disableGatherScatterOptimizations == false && @@ -586,7 +592,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createFunctionInliningPass(), 265); optPM.add(llvm::createConstantPropagationPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { @@ -596,26 +602,28 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createIPSCCPPass(), 275); optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); if (g->opt.disableHandlePseudoMemoryOps == false) { optPM.add(CreateReplacePseudoMemoryOpsPass(),280); } - optPM.add(CreateIntrinsicsOptPass(),281); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createArgumentPromotionPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createLoopRotatePass()); optPM.add(llvm::createLICMPass()); optPM.add(llvm::createLoopUnswitchPass(false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createIndVarSimplifyPass()); optPM.add(llvm::createLoopIdiomPass()); optPM.add(llvm::createLoopDeletionPass()); @@ -626,17 +634,22 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createMemCpyOptPass()); optPM.add(llvm::createSCCPPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCorrelatedValuePropagationPass()); optPM.add(llvm::createDeadStoreEliminationPass()); optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); + optPM.add(CreatePeepholePass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createStripDeadPrototypesPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); @@ -720,14 +733,17 @@ IntrinsicsOpt::IntrinsicsOpt() // All of the mask instructions we may encounter. Note that even if // compiling for AVX, we may still encounter the regular 4-wide SSE // MOVMSK instruction. 
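
Note: the rewritten IntrinsicsOpt / InstructionSimplify passes below fold __movmsk calls on compile-time-constant masks and now also track the byte-wise pmovmskb form used by the 8-bit-mask targets. As a reminder of what those intrinsics compute, a small user-level sketch with SSE intrinsics (assumes SSE2; this is not code from the compiler itself):

    #include <emmintrin.h>   // SSE2: _mm_movemask_epi8, _mm_movemask_ps

    // Collapse a 4 x float mask (lanes all-ones or all-zeros) to 4 bits.
    static inline int movmsk_f32x4(__m128 m)  { return _mm_movemask_ps(m); }

    // Collapse a 16 x i8 mask to 16 bits, one per byte lane.
    static inline int movmsk_i8x16(__m128i m) { return _mm_movemask_epi8(m); }

    // With the scalar bitmask in hand, any/all/none become integer compares,
    // which is what the all-on / all-off mask optimizations rely on.
    static inline bool any_i8x16(__m128i m)  { return movmsk_i8x16(m) != 0; }
    static inline bool all_i8x16(__m128i m)  { return movmsk_i8x16(m) == 0xffff; }
    static inline bool none_i8x16(__m128i m) { return movmsk_i8x16(m) == 0; }
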
- llvm::Function *sseMovmsk = + llvm::Function *ssei8Movmsk = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse2_pmovmskb_128); + maskInstructions.push_back(ssei8Movmsk); + llvm::Function *sseFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps); - maskInstructions.push_back(sseMovmsk); + maskInstructions.push_back(sseFloatMovmsk); maskInstructions.push_back(m->module->getFunction("__movmsk")); - llvm::Function *avxMovmsk = + llvm::Function *avxFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256); - Assert(avxMovmsk != NULL); - maskInstructions.push_back(avxMovmsk); + Assert(avxFloatMovmsk != NULL); + maskInstructions.push_back(avxFloatMovmsk); // And all of the blend instructions blendInstructions.push_back(BlendInstruction( @@ -974,80 +990,153 @@ CreateIntrinsicsOptPass() { @todo The better thing to do would be to submit a patch to LLVM to get these; they're presumably pretty simple patterns to match. */ -class VSelMovmskOpt : public llvm::BasicBlockPass { +class InstructionSimplifyPass : public llvm::BasicBlockPass { public: - VSelMovmskOpt() + InstructionSimplifyPass() : BasicBlockPass(ID) { } const char *getPassName() const { return "Vector Select Optimization"; } bool runOnBasicBlock(llvm::BasicBlock &BB); static char ID; + +private: + static bool simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter); + static llvm::Value *simplifyBoolVec(llvm::Value *value); + static bool simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter); }; -char VSelMovmskOpt::ID = 0; +char InstructionSimplifyPass::ID = 0; + + +llvm::Value * +InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) { + llvm::TruncInst *trunc = llvm::dyn_cast(value); + if (trunc != NULL) { + // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector) + llvm::SExtInst *sext = llvm::dyn_cast(value); + if (sext && + sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return sext->getOperand(0); + + llvm::ZExtInst *zext = llvm::dyn_cast(value); + if (zext && + zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return zext->getOperand(0); + } + + llvm::ICmpInst *icmp = llvm::dyn_cast(value); + if (icmp != NULL) { + // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo + if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) { + llvm::Value *op1 = icmp->getOperand(1); + if (llvm::isa(op1)) { + llvm::Value *op0 = icmp->getOperand(0); + llvm::SExtInst *sext = llvm::dyn_cast(op0); + if (sext) + return sext->getOperand(0); + llvm::ZExtInst *zext = llvm::dyn_cast(op0); + if (zext) + return zext->getOperand(0); + } + } + } + return NULL; +} bool -VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { - DEBUG_START_PASS("VSelMovmaskOpt"); +InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter) { + if (selectInst->getType()->isVectorTy() == false) + return false; + + llvm::Value *factor = selectInst->getOperand(0); + + // Simplify all-on or all-off mask values + MaskStatus maskStatus = lGetMaskStatus(factor); + llvm::Value *value = NULL; + if (maskStatus == ALL_ON) + // Mask all on -> replace with the first select value + value = selectInst->getOperand(1); + else if (maskStatus == ALL_OFF) + // Mask all off -> replace with the second select value + value = selectInst->getOperand(2); + if (value != NULL) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, value); + return true; + } + + // Sometimes 
earlier LLVM optimization passes generate unnecessarily + // complex expressions for the selection vector, which in turn confuses + // the code generators and leads to sub-optimal code (particularly for + // 8 and 16-bit masks). We'll try to simplify them out here so that + // the code generator patterns match.. + if ((factor = simplifyBoolVec(factor)) != NULL) { + llvm::Instruction *newSelect = + llvm::SelectInst::Create(factor, selectInst->getOperand(1), + selectInst->getOperand(2), + selectInst->getName()); + llvm::ReplaceInstWithInst(selectInst, newSelect); + return true; + } + + return false; +} + + +bool +InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter) { + llvm::Function *calledFunc = callInst->getCalledFunction(); + + // Turn a __movmsk call with a compile-time constant vector into the + // equivalent scalar value. + if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) + return false; + + uint64_t mask; + if (lGetMask(callInst->getArgOperand(0), &mask) == true) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, LLVMInt64(mask)); + return true; + } + return false; +} + + +bool +InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("InstructionSimplify"); bool modifiedAny = false; restart: for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { llvm::SelectInst *selectInst = llvm::dyn_cast(&*iter); - if (selectInst != NULL && selectInst->getType()->isVectorTy()) { - llvm::Value *factor = selectInst->getOperand(0); - - MaskStatus maskStatus = lGetMaskStatus(factor); - llvm::Value *value = NULL; - if (maskStatus == ALL_ON) - // Mask all on -> replace with the first select value - value = selectInst->getOperand(1); - else if (maskStatus == ALL_OFF) - // Mask all off -> replace with the second select value - value = selectInst->getOperand(2); - - if (value != NULL) { - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, value); - modifiedAny = true; - goto restart; - } + if (selectInst && simplifySelect(selectInst, iter)) { + modifiedAny = true; + goto restart; } - llvm::CallInst *callInst = llvm::dyn_cast(&*iter); - if (callInst == NULL) - continue; - - llvm::Function *calledFunc = callInst->getCalledFunction(); - if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) - continue; - - uint64_t mask; - if (lGetMask(callInst->getArgOperand(0), &mask) == true) { -#if 0 - fprintf(stderr, "mask %d\n", mask); - callInst->getArgOperand(0)->dump(); - fprintf(stderr, "-----------\n"); -#endif - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, LLVMInt64(mask)); + if (callInst && simplifyCall(callInst, iter)) { modifiedAny = true; goto restart; } } - DEBUG_END_PASS("VSelMovMskOpt"); + DEBUG_END_PASS("InstructionSimplify"); return modifiedAny; } static llvm::Pass * -CreateVSelMovmskOptPass() { - return new VSelMovmskOpt; +CreateInstructionSimplifyPass() { + return new InstructionSimplifyPass; } @@ -4359,6 +4448,14 @@ char MakeInternalFuncsStaticPass::ID = 0; bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { + "__avg_up_uint8", + "__avg_up_int8", + "__avg_up_uint16", + "__avg_up_int16", + "__avg_down_uint8", + "__avg_down_int8", + "__avg_down_uint16", + "__avg_down_int16", "__fast_masked_vload", "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i32", 
"__gather_factored_base_offsets32_i64", @@ -4438,3 +4535,391 @@ static llvm::Pass * CreateMakeInternalFuncsStaticPass() { return new MakeInternalFuncsStaticPass; } + + +/////////////////////////////////////////////////////////////////////////// +// PeepholePass + +class PeepholePass : public llvm::BasicBlockPass { +public: + PeepholePass(); + + const char *getPassName() const { return "Peephole Optimizations"; } + bool runOnBasicBlock(llvm::BasicBlock &BB); + + static char ID; +}; + +char PeepholePass::ID = 0; + +PeepholePass::PeepholePass() + : BasicBlockPass(ID) { +} + +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) + +using namespace llvm::PatternMatch; + +template +struct CastClassTypes_match { + Op_t Op; + const llvm::Type *fromType, *toType; + + CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f, + const llvm::Type *t) + : Op(OpMatch), fromType(f), toType(t) {} + + template + bool match(OpTy *V) { + if (llvm::Operator *O = llvm::dyn_cast(V)) + return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) && + O->getType() == toType && + O->getOperand(0)->getType() == fromType); + return false; + } +}; + +template +inline CastClassTypes_match +m_SExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + +template +inline CastClassTypes_match +m_ZExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc16To8(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int8VectorType); +} + +template +inline CastClassTypes_match +m_SExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + +template +inline CastClassTypes_match +m_ZExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc32To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int32VectorType, + LLVMTypes::Int16VectorType); +} + +template +struct UDiv2_match { + Op_t Op; + + UDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::UDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::LShr: + // shift left by 1 + return (apInt.isIntN(1) && Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline UDiv2_match +m_UDiv2(const V &v) { + return UDiv2_match(v); +} + +template +struct SDiv2_match { + Op_t Op; + + SDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::SDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::AShr: + // shift left by 1 + return (apInt.isIntN(1) && 
Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline SDiv2_match +m_SDiv2(const V &v) { + return SDiv2_match(v); +} + +// Returns true if the given function has a call to an intrinsic function +// in its definition. +static bool +lHasIntrinsicInDefinition(llvm::Function *func) { + llvm::Function::iterator bbiter = func->begin(); + for (; bbiter != func->end(); ++bbiter) { + for (llvm::BasicBlock::iterator institer = bbiter->begin(); + institer != bbiter->end(); ++institer) { + if (llvm::isa(institer)) + return true; + } + } + return false; +} + +static llvm::Instruction * +lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) { + llvm::Function *func = m->module->getFunction(name); + Assert(func != NULL); + + // Make sure that the definition of the llvm::Function has a call to an + // intrinsic function in its instructions; otherwise we will generate + // infinite loops where we "helpfully" turn the default implementations + // of target builtins like __avg_up_uint8 that are implemented with plain + // arithmetic ops into recursive calls to themselves. + if (lHasIntrinsicInDefinition(func)) + return lCallInst(func, opa, opb, name); + else + return NULL; +} + +////////////////////////////////////////////////// + +static llvm::Instruction * +lMatchAvgUpUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt8To16(m_Value(opa)), + m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)), + m_ZExt8To16(m_Value(opb)))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_UDiv2( + m_Add(m_ZExt8To16(m_Value(opa)), + m_ZExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt16To32(m_Value(opa)), + m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)), + m_ZExt16To32(m_Value(opb)))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_UDiv2( + m_Add(m_ZExt16To32(m_Value(opa)), + m_ZExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgUpInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, 
m_Trunc16To8(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt8To16(m_Value(opa)), + m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)), + m_SExt8To16(m_Value(opb)))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_int8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_SDiv2( + m_Add(m_SExt8To16(m_Value(opa)), + m_SExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt16To32(m_Value(opa)), + m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)), + m_SExt16To32(m_Value(opb)))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_int16", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgDownInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_SDiv2( + m_Add(m_SExt16To32(m_Value(opa)), + m_SExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int16", opa, opb); + } + return NULL; +} +#endif // !LLVM_3_1 && !LLVM_3_2 + +bool +PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("PeepholePass"); + + bool modifiedAny = false; + restart: + for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { + llvm::Instruction *inst = &*iter; + + llvm::Instruction *builtinCall = NULL; +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) + if (!builtinCall) + builtinCall = lMatchAvgUpUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt16(inst); +#endif // !LLVM_3_1 && !LLVM_3_2 + if (builtinCall != NULL) { + llvm::ReplaceInstWithInst(inst, builtinCall); + modifiedAny = true; + goto restart; + } + } + + DEBUG_END_PASS("PeepholePass"); + + return modifiedAny; +} + +static llvm::Pass * +CreatePeepholePass() { + return new PeepholePass; +} diff --git a/parse.yy b/parse.yy index 3ad815cf..5fc01cb0 100644 --- a/parse.yy +++ b/parse.yy @@ -179,6 +179,8 @@ struct ForeachDimension { } +%token TOKEN_INT8_CONSTANT TOKEN_UINT8_CONSTANT +%token TOKEN_INT16_CONSTANT TOKEN_UINT16_CONSTANT %token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT @@ -291,6 +293,22 @@ primary_expression Error(@1, "Undeclared symbol \"%s\".%s", name, alts.c_str()); } } + | TOKEN_INT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt8->GetAsConstType(), + (int8_t)yylval.intVal, @1); 
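For reference, the arithmetic identity the __avg_up_* / __avg_down_* matchers above look for is easy to check exhaustively at 8 bits. A small standalone sketch (the reference helper names are illustrative only):

    #include <cstdint>
    #include <cassert>

    // (uint8)(((uint16)a + (uint16)b + 1) / 2): average rounded up,
    // which is what e.g. the x86 pavgb instruction computes.
    static uint8_t avg_up_reference(uint8_t a, uint8_t b) {
        return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) / 2);
    }

    // (uint8)(((uint16)a + (uint16)b) / 2): average rounded down.
    static uint8_t avg_down_reference(uint8_t a, uint8_t b) {
        return (uint8_t)(((uint16_t)a + (uint16_t)b) / 2);
    }

    int main() {
        for (int a = 0; a < 256; ++a)
            for (int b = 0; b < 256; ++b) {
                assert(avg_up_reference(a, b) == ((a + b + 1) >> 1));
                assert(avg_down_reference(a, b) == ((a + b) >> 1));
            }
        return 0;
    }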
+ } + | TOKEN_UINT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt8->GetAsConstType(), + (uint8_t)yylval.intVal, @1); + } + | TOKEN_INT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt16->GetAsConstType(), + (int16_t)yylval.intVal, @1); + } + | TOKEN_UINT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt16->GetAsConstType(), + (uint16_t)yylval.intVal, @1); + } | TOKEN_INT32_CONSTANT { $$ = new ConstExpr(AtomicType::UniformInt32->GetAsConstType(), (int32_t)yylval.intVal, @1); @@ -1233,7 +1251,10 @@ declarator ; int_constant - : TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + : TOKEN_INT8_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT16_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT64_CONSTANT { $$ = yylval.intVal; } ; direct_declarator @@ -2148,8 +2169,24 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = g->target->getMaskBitCount() == 1 ? - AtomicType::VaryingBool : AtomicType::VaryingUInt32; + const Type *t = NULL; + switch (g->target->getMaskBitCount()) { + case 1: + t = AtomicType::VaryingBool; + break; + case 8: + t = AtomicType::VaryingUInt8; + break; + case 16: + t = AtomicType::VaryingUInt16; + break; + case 32: + t = AtomicType::VaryingUInt32; + break; + default: + FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); + } + t = t->GetAsConstType(); Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); @@ -2241,7 +2278,11 @@ lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) { Error(pos, "%s must be representable with a 32-bit integer.", usage); return false; } - *value = (int)ci->getZExtValue(); + const Type *type = expr->GetType(); + if (type->IsUnsignedType()) + *value = (int)ci->getZExtValue(); + else + *value = (int)ci->getSExtValue(); return true; } } diff --git a/run_tests.py b/run_tests.py index 7c6b1eb8..9729930f 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', @@ -55,6 +55,8 @@ parser.add_option('--wrap-exe', dest='wrapexe', default="") parser.add_option('--time', dest='time', help='Enable time output', default=False, action="store_true") +parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', + default=False, action="store_true") (options, args) = parser.parse_args() @@ -162,14 +164,15 @@ total_tests = 0 # finished. Should be called with the lock held.. def update_progress(fn, total_tests_arg, counter, max_test_length_arg): counter.value += 1 - progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) - # spaces to clear out detrius from previous printing... 
- spaces_needed = max_test_length_arg - len(fn) - for x in range(spaces_needed): - progress_str += ' ' - progress_str += '\r' - sys.stdout.write(progress_str) - sys.stdout.flush() + if options.non_interactive == False: + progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) + # spaces to clear out detrius from previous printing... + spaces_needed = max_test_length_arg - len(fn) + for x in range(spaces_needed): + progress_str += ' ' + progress_str += '\r' + sys.stdout.write(progress_str) + sys.stdout.flush() def run_command(cmd): if options.verbose: @@ -231,7 +234,7 @@ def add_prefix(path): else: input_prefix = "" path = input_prefix + path - path = os.path.normpath(path) + path = os.path.abspath(path) return path @@ -294,7 +297,7 @@ def run_test(testname): firstline = firstline.rstrip() file.close() - if (output.find(firstline) == -1): + if re.search(firstline, output) == None: sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ (firstline, testname, output)) return (1, 0) @@ -489,11 +492,10 @@ if __name__ == '__main__': # (i.e. return 0 if all is ok) for t in task_threads: t.join() - sys.stdout.write("\n") + if options.non_interactive == False: + sys.stdout.write("\n") elapsed_time = time.time() - start_time - if options.time: - sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) while not qret.empty(): (c, r, s) = qret.get() @@ -501,6 +503,8 @@ if __name__ == '__main__': run_error_files += r skip_files += s + if options.non_interactive: + sys.stdout.write(" Done %d / %d\n" % (finished_tests_counter.value, total_tests)) if len(skip_files) > 0: skip_files.sort() sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests)) @@ -517,4 +521,7 @@ if __name__ == '__main__': for f in run_error_files: sys.stdout.write("\t%s\n" % f) + if options.time: + sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) + sys.exit(len(compile_error_files) + len(run_error_files)) diff --git a/stdlib.ispc b/stdlib.ispc index 4e06f5da..e4f8844f 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,12 +38,20 @@ ispc code */ -#ifdef ISPC_TARGET_GENERIC -#define IntMaskType bool -#define UIntMaskType bool +#if (ISPC_MASK_BITS == 1) + #define IntMaskType bool + #define UIntMaskType bool +#elif (ISPC_MASK_BITS == 8) + #define IntMaskType int8 + #define UIntMaskType unsigned int8 +#elif (ISPC_MASK_BITS == 16) + #define IntMaskType int16 + #define UIntMaskType unsigned int16 +#elif (ISPC_MASK_BITS == 32) + #define IntMaskType int32 + #define UIntMaskType unsigned int32 #else -#define IntMaskType int32 -#define UIntMaskType unsigned int32 + #error Unknown value of ISPC_MASK_BITS #endif /////////////////////////////////////////////////////////////////////////// @@ -335,14 +343,15 @@ static inline int32 sign_extend(bool v) { return __sext_varying_bool(v); } + __declspec(safe) static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. 
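A minimal scalar model of how any() and all() fold the execution mask in (lane width and types here are placeholders, not the stdlib implementation):

    #include <cstdio>
    #include <vector>

    // any(): a lane counts only if it is active, i.e. __any(v & __mask).
    static bool any_model(const std::vector<bool> &v, const std::vector<bool> &mask) {
        for (size_t i = 0; i < v.size(); ++i)
            if (v[i] && mask[i])
                return true;
        return false;
    }

    // all(): inactive lanes must not veto the result, i.e. __all(v | !__mask).
    static bool all_model(const std::vector<bool> &v, const std::vector<bool> &mask) {
        for (size_t i = 0; i < v.size(); ++i)
            if (!v[i] && mask[i])
                return false;
        return true;
    }

    int main() {
        std::vector<bool> v    = { true, false, true, false };
        std::vector<bool> mask = { true, false, false, false };  // only lane 0 active
        // Without the mask, all(v) would be false; with only lane 0 active it is true.
        std::printf("any=%d all=%d\n", (int)any_model(v, mask), (int)all_model(v, mask));
        return 0;
    }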
-#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __any(v & __mask); #else - return __any(__sext_varying_bool(v) & __mask); + return __any((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -350,11 +359,10 @@ __declspec(safe) static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __all(v | !__mask); #else - return __all(__sext_varying_bool(v) | !__mask); + return __all((UIntMaskType)__sext_varying_bool(v) | !__mask); #endif } @@ -362,11 +370,10 @@ __declspec(safe) static inline uniform bool none(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __none(v & __mask); #else - return __none(__sext_varying_bool(v) & __mask); + return __none((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -399,10 +406,10 @@ static inline int popcnt(int64 v) { __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __popcnt_int64(__movmsk(v & __mask)); #else - return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask)); + return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif } @@ -880,21 +887,45 @@ static inline uniform double select(uniform bool c, uniform double a, /////////////////////////////////////////////////////////////////////////// // Horizontal ops / reductions +__declspec(safe) +static inline uniform int16 reduce_add(int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform unsigned int16 reduce_add(unsigned int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform int32 reduce_add(int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + +__declspec(safe) +static inline uniform unsigned int32 reduce_add(unsigned int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + __declspec(safe) static inline uniform float reduce_add(float x) { // zero the lanes where the mask is off return __reduce_add_float(__mask ? x : 0.); } - __declspec(safe) static inline uniform float reduce_min(float v) { // For the lanes where the mask is off, replace the given value with // infinity, so that it doesn't affect the result. int iflt_max = 0x7f800000; // infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_float(__mask ? v : __floatbits_varying_int32(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_min_float(test ? v : floatbits(iflt_max)); + } + return result; } __declspec(safe) @@ -902,13 +933,18 @@ static inline uniform float reduce_max(float v) { // For the lanes where the mask is off, replace the given value with // negative infinity, so that it doesn't affect the result. const int iflt_neg_max = 0xff800000; // -infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_float(__mask ? 
v : __floatbits_varying_int32(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_max_float(test ? v : floatbits(iflt_neg_max)); + } + return result; } __declspec(safe) -static inline uniform int reduce_add(int x) { +static inline uniform int64 reduce_add(int32 x) { // Zero out the values for lanes that aren't running return __reduce_add_int32(__mask ? x : 0); } @@ -930,7 +966,7 @@ static inline uniform int reduce_max(int v) { } __declspec(safe) -static inline uniform unsigned int reduce_add(unsigned int x) { +static inline uniform unsigned int64 reduce_add(unsigned int32 x) { // Set values for non-running lanes to zero so they don't affect the // result. return __reduce_add_int32(__mask ? x : 0); @@ -960,17 +996,27 @@ static inline uniform double reduce_add(double x) { __declspec(safe) static inline uniform double reduce_min(double v) { int64 iflt_max = 0x7ff0000000000000; // infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_double(__mask ? v : __doublebits_varying_int64(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_min_double(test ? v : doublebits(iflt_max)); + } + return result; } __declspec(safe) static inline uniform double reduce_max(double v) { const int64 iflt_neg_max = 0xfff0000000000000; // -infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_max_double(test ? v : doublebits(iflt_neg_max)); + } + return result; } __declspec(safe) @@ -1325,88 +1371,88 @@ static inline uniform double max(uniform double a, uniform double b) { // int8 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 min(uniform unsigned int8 a, uniform unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 max(uniform unsigned int8 a, uniform unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 min(uniform int8 a, uniform int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 max(uniform int8 a, uniform int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 min(int8 a, int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 max(int8 a, int8 b) { return (a > b) ? 
a : b; } // int16 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 min(uniform unsigned int16 a, uniform unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 max(uniform unsigned int16 a, uniform unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 min(uniform int16 a, uniform int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 max(uniform int16 a, uniform int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 min(int16 a, int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 max(int16 a, int16 b) { return (a > b) ? a : b; } @@ -3119,7 +3165,7 @@ static inline void __range_reduce_log(float input, varying float * uniform reduc static const int nonexponent_mask = 0x807FFFFF; // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126 - static const int exponent_neg1 = (126 << 23); + static const int exponent_neg1 = (126l << 23); // NOTE(boulos): We don't need to mask anything out since we know // the sign bit has to be 0. If it's 1, we need to return infinity/nan // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). @@ -3142,7 +3188,7 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo uniform int int_version = intbits(input); static const uniform int nonexponent_mask = 0x807FFFFF; - static const uniform int exponent_neg1 = (126 << 23); + static const uniform int exponent_neg1 = (126ul << 23); uniform int biased_exponent = int_version >> 23; uniform int offset_exponent = biased_exponent + 1; *exponent = offset_exponent - 127; // get the real value @@ -3640,18 +3686,18 @@ static inline uniform float half_to_float(uniform unsigned int16 h) { else { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - static const uniform unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + static const uniform unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift uniform int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits uniform unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (uniform int32)(127 - 15) << 23; // exponent adjust // handle exponent special cases if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust + o += (uniform unsigned int32)(128 - 16) << 23; // extra exp adjust else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = intbits(floatbits(o) - floatbits(113 << 23)); // renormalize + o += 1ul << 23; // extra exp adjust + o = intbits(floatbits(o) - floatbits(113ul << 23)); // renormalize } o |= ((int32)(h & 0x8000)) << 16; // sign bit @@ -3668,17 +3714,17 @@ static inline float half_to_float(unsigned int16 h) { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. 
- const unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + const unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift - int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits + int32 o = ((int32)(h & 0x7ffful)) << 13; // exponent/mantissa bits unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (int32)(127 - 15) << 23; // exponent adjust - int32 infnan_val = o + ((128 - 16) << 23); - int32 zerodenorm_val = intbits(floatbits(o + (1<<23)) - floatbits(113 << 23)); + int32 infnan_val = o + ((int32)(128 - 16) << 23); + int32 zerodenorm_val = intbits(floatbits(o + (1ul<<23)) - floatbits(113ul << 23)); int32 reg_val = (exp == 0) ? zerodenorm_val : o; - int32 sign_bit = ((int32)(h & 0x8000)) << 16; + int32 sign_bit = ((int32)(h & 0x8000ul)) << 16; return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit); } } @@ -3708,16 +3754,16 @@ static inline uniform int16 float_to_half(uniform float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - uniform int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + uniform int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const uniform unsigned int32 round_mask = ~0xfffu; - const uniform int32 magic = 15 << 23; - const uniform int32 f16infty = 31 << 23; + const uniform unsigned int32 round_mask = ~0xffful; + const uniform int32 magic = 15ul << 23; + const uniform int32 f16infty = 31ul << 23; uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask; fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed @@ -3754,16 +3800,16 @@ static inline int16 float_to_half(float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const unsigned int32 round_mask = ~0xfffu; - const int32 magic = 15 << 23; - const int32 f16infty = 31 << 23; + const unsigned int32 round_mask = ~0xffful; + const int32 magic = 15ul << 23; + const int32 f16infty = 31ul << 23; // Shift exponent down, denormalize if necessary. // NOTE This represents half-float denormals using single precision denormals. @@ -3782,7 +3828,7 @@ static inline int16 float_to_half(float f) { // FP16 denormals are rare in practice, I don't know. Whatever slow path your HW // may or may not have for denormals, this may well hit it. 
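To see why the (127 - 15) << 23 adjustment in half_to_float() works: a half stores a 5-bit exponent biased by 15, a float an 8-bit exponent biased by 127, so once the half's exponent/mantissa bits are shifted up by 13 the bias can be rewritten with a single add. A standalone check for the normal-number path only (no Inf/NaN/denormal handling):

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    int main() {
        uint16_t h = 0x3C00;                          // 1.0 in IEEE half precision
        uint32_t o = ((uint32_t)(h & 0x7fff)) << 13;  // exponent/mantissa bits
        o += (uint32_t)(127 - 15) << 23;              // re-bias the exponent
        o |= ((uint32_t)(h & 0x8000)) << 16;          // sign bit
        float f;
        std::memcpy(&f, &o, sizeof(f));
        std::printf("%f\n", f);                       // prints 1.000000
        return 0;
    }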
float fscale = floatbits(fint & round_mask) * floatbits(magic); - fscale = min(fscale, floatbits((31 << 23) - 0x1000)); + fscale = min(fscale, floatbits((31ul << 23) - 0x1000ul)); int32 fint2 = intbits(fscale) - round_mask; if (fint < f32infty) @@ -3949,7 +3995,7 @@ float_to_srgb8(float inval) // Do the table lookup and unpack bias, scale unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; unsigned int bias = (tab >> 16) << 9; - unsigned int scale = tab & 0xffff; + unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -3999,7 +4045,7 @@ float_to_srgb8(uniform float inval) // Do the table lookup and unpack bias, scale uniform unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; uniform unsigned int bias = (tab >> 16) << 9; - uniform unsigned int scale = tab & 0xffff; + uniform unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation uniform unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4046,14 +4092,14 @@ static inline uniform unsigned int random(uniform RNGState * uniform state) static inline float frandom(varying RNGState * uniform state) { unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } static inline uniform float frandom(uniform RNGState * uniform state) { uniform unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } @@ -4061,18 +4107,18 @@ static inline void seed_rng(varying RNGState * uniform state, unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } @@ -4090,7 +4136,7 @@ static inline uniform bool rdrand(float * uniform ptr) { uniform int32 irand; uniform bool success = __rdrand_i32(&irand); if (success) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; } return success; @@ -4110,7 +4156,7 @@ static inline bool rdrand(varying float * uniform ptr) { // in vector form. However, we need to be careful to not // clobber any existing already-set values in *ptr with // inactive lanes here... 
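The bit trick used by frandom() above, in isolation: 23 random bits become the mantissa of a float in [1, 2), and subtracting 1.0 maps that to [0, 1). A small standalone sketch:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    static float bits_to_unit_float(uint32_t irand) {
        irand &= (1u << 23) - 1;              // keep 23 mantissa bits
        uint32_t bits = 0x3F800000u | irand;  // sign/exponent pattern of 1.0f
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f - 1.0f;                      // [1, 2) -> [0, 1)
    }

    int main() {
        std::printf("%f %f %f\n",
                    bits_to_unit_float(0u),           // 0.000000
                    bits_to_unit_float(0x400000u),    // 0.500000
                    bits_to_unit_float(0x7FFFFFu));   // just under 1.0
        return 0;
    }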
- irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4130,7 +4176,7 @@ static inline bool rdrand(float * ptr) { foreach_active (index) { uniform int32 irand; if (__rdrand_i32(&irand)) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4264,3 +4310,720 @@ static inline bool rdrand(int64 * ptr) { return success; } } + +/////////////////////////////////////////////////////////////////////////// +// Fast vector integer division + +/* These tables and the algorithms in the __fast_idiv() functions below are + from Halide; the idea is based on the paper "Division by Invariant + Integers using Multiplication" by Granlund and Montgomery. + + Copyright (c) 2012 MIT CSAIL + + Developed by: + + The Halide team + MIT CSAIL + http://halide-lang.org + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
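The Halide / Granlund-Montgomery scheme encoded in the tables below replaces division by a fixed divisor with a multiply-high and a shift; the entries appear to be {method, multiplier, post-shift}, indexed by divisor. A spot check of the unsigned 8-bit entry for divisor 3, {1, 171, 1}, i.e. x / 3 == ((x * 171) >> 8) >> 1. This only exercises the simple method-1 case and is meant as an illustration of the idea, not the full __fast_idiv dispatch:

    #include <cstdint>
    #include <cassert>

    static uint8_t div3_by_mul(uint8_t x) {
        uint16_t hi = ((uint16_t)x * 171u) >> 8;  // 8-bit "multiply high"
        return (uint8_t)(hi >> 1);                // post-shift from the table entry
    }

    int main() {
        for (int x = 0; x < 256; ++x)
            assert(div3_by_mul((uint8_t)x) == x / 3);
        return 0;
    }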
+ */ + +static const uniform int64 __idiv_table_u8[][3] = { + {0, 0LL, 1}, {1, 171LL, 1}, {0, 0LL, 2}, + {1, 205LL, 2}, {1, 171LL, 2}, {2, 37LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 205LL, 3}, + {2, 117LL, 3}, {1, 171LL, 3}, {1, 79LL, 2}, + {2, 37LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 241LL, 4}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 205LL, 4}, {2, 135LL, 4}, {2, 117LL, 4}, + {2, 101LL, 4}, {1, 171LL, 4}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {2, 37LL, 4}, + {2, 27LL, 4}, {1, 137LL, 4}, {2, 9LL, 4}, + {0, 0LL, 5}, {1, 249LL, 5}, {1, 241LL, 5}, + {1, 235LL, 5}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {2, 165LL, 5}, {1, 205LL, 5}, + {1, 25LL, 2}, {2, 135LL, 5}, {1, 191LL, 5}, + {1, 187LL, 5}, {2, 109LL, 5}, {2, 101LL, 5}, + {1, 175LL, 5}, {1, 171LL, 5}, {2, 79LL, 5}, + {1, 41LL, 3}, {1, 161LL, 5}, {1, 79LL, 4}, + {1, 155LL, 5}, {1, 19LL, 2}, {1, 149LL, 5}, + {2, 37LL, 5}, {1, 9LL, 1}, {2, 27LL, 5}, + {1, 139LL, 5}, {1, 137LL, 5}, {2, 13LL, 5}, + {2, 9LL, 5}, {2, 5LL, 5}, {0, 0LL, 6}, + {1, 253LL, 6}, {1, 249LL, 6}, {1, 245LL, 6}, + {1, 121LL, 5}, {1, 119LL, 5}, {1, 235LL, 6}, + {1, 231LL, 6}, {1, 57LL, 4}, {1, 225LL, 6}, + {1, 111LL, 5}, {1, 219LL, 6}, {1, 27LL, 3}, + {1, 213LL, 6}, {2, 165LL, 6}, {1, 13LL, 2}, + {1, 205LL, 6}, {1, 203LL, 6}, {1, 25LL, 3}, + {1, 99LL, 5}, {2, 135LL, 6}, {1, 193LL, 6}, + {1, 191LL, 6}, {1, 189LL, 6}, {1, 187LL, 6}, + {1, 185LL, 6}, {1, 183LL, 6}, {1, 181LL, 6}, + {1, 179LL, 6}, {1, 177LL, 6}, {1, 175LL, 6}, + {1, 173LL, 6}, {1, 171LL, 6}, {1, 169LL, 6}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 161LL, 6}, {2, 63LL, 6}, + {1, 79LL, 5}, {2, 57LL, 6}, {1, 155LL, 6}, + {2, 51LL, 6}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 149LL, 6}, {1, 37LL, 4}, {2, 37LL, 6}, + {1, 145LL, 6}, {1, 9LL, 2}, {1, 143LL, 6}, + {2, 27LL, 6}, {2, 25LL, 6}, {1, 139LL, 6}, + {1, 69LL, 5}, {1, 137LL, 6}, {2, 15LL, 6}, + {2, 13LL, 6}, {2, 11LL, 6}, {2, 9LL, 6}, + {2, 7LL, 6}, {2, 5LL, 6}, {2, 3LL, 6}, + {0, 0LL, 7}, {1, 255LL, 7}, {1, 127LL, 6}, + {1, 63LL, 5}, {1, 125LL, 6}, {1, 31LL, 4}, + {1, 123LL, 6}, {1, 61LL, 5}, {1, 121LL, 6}, + {1, 15LL, 3}, {1, 119LL, 6}, {1, 59LL, 5}, + {1, 235LL, 7}, {1, 117LL, 6}, {1, 29LL, 4}, + {1, 115LL, 6}, {1, 57LL, 5}, {1, 113LL, 6}, + {1, 225LL, 7}, {1, 7LL, 2}, {1, 111LL, 6}, + {1, 55LL, 5}, {1, 219LL, 7}, {1, 109LL, 6}, + {1, 27LL, 4}, {1, 215LL, 7}, {1, 107LL, 6}, + {1, 53LL, 5}, {1, 211LL, 7}, {1, 105LL, 6}, + {1, 13LL, 3}, {1, 207LL, 7}, {1, 103LL, 6}, + {1, 51LL, 5}, {1, 203LL, 7}, {1, 101LL, 6}, + {1, 25LL, 4}, {1, 199LL, 7}, {1, 99LL, 6}, + {1, 197LL, 7}, {1, 49LL, 5}, {1, 97LL, 6}, + {1, 193LL, 7}, {1, 3LL, 1}, {1, 191LL, 7}, + {1, 95LL, 6}, {1, 189LL, 7}, {1, 47LL, 5}, + {1, 187LL, 7}, {1, 93LL, 6}, {1, 185LL, 7}, + {1, 23LL, 4}, {1, 183LL, 7}, {1, 91LL, 6}, + {1, 181LL, 7}, {1, 45LL, 5}, {1, 179LL, 7}, + {1, 89LL, 6}, {1, 177LL, 7}, {1, 11LL, 3}, + {1, 175LL, 7}, {1, 87LL, 6}, {1, 173LL, 7}, + {1, 43LL, 5}, {1, 171LL, 7}, {1, 85LL, 6}, + {1, 169LL, 7}, {2, 81LL, 7}, {1, 21LL, 4}, + {1, 167LL, 7}, {1, 83LL, 6}, {1, 165LL, 7}, + {1, 41LL, 5}, {2, 71LL, 7}, {1, 163LL, 7}, + {1, 81LL, 6}, {1, 161LL, 7}, {1, 5LL, 2}, + {2, 63LL, 7}, {1, 159LL, 7}, {1, 79LL, 6}, + {1, 157LL, 7}, {2, 57LL, 7}, {1, 39LL, 5}, + {1, 155LL, 7}, {1, 77LL, 6}, {2, 51LL, 7}, + {1, 153LL, 7}, {1, 19LL, 4}, {2, 47LL, 7}, + {1, 151LL, 7}, {1, 75LL, 6}, {1, 149LL, 7}, + {2, 41LL, 7}, {1, 37LL, 5}, {1, 147LL, 7}, + {2, 37LL, 7}, {1, 73LL, 6}, {1, 145LL, 7}, + {2, 33LL, 7}, {1, 9LL, 3}, {2, 31LL, 7}, + {1, 143LL, 7}, {1, 71LL, 6}, {2, 27LL, 7}, + {1, 
141LL, 7}, {2, 25LL, 7}, {1, 35LL, 5}, + {1, 139LL, 7}, {2, 21LL, 7}, {1, 69LL, 6}, + {2, 19LL, 7}, {1, 137LL, 7}, {1, 17LL, 4}, + {2, 15LL, 7}, {1, 135LL, 7}, {2, 13LL, 7}, + {1, 67LL, 6}, {2, 11LL, 7}, {1, 133LL, 7}, + {2, 9LL, 7}, {1, 33LL, 5}, {2, 7LL, 7}, + {1, 131LL, 7}, {2, 5LL, 7}, {1, 65LL, 6}, + {2, 3LL, 7}, {1, 129LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s8[][3] = { + {0, 0LL, 1}, {1, 86LL, 0}, {0, 0LL, 2}, + {1, 103LL, 1}, {1, 43LL, 0}, {1, 147LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 103LL, 2}, + {1, 187LL, 3}, {1, 43LL, 1}, {1, 79LL, 2}, + {1, 147LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 121LL, 3}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 103LL, 3}, {1, 49LL, 2}, {1, 187LL, 4}, + {1, 179LL, 4}, {1, 43LL, 2}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {1, 147LL, 4}, + {1, 71LL, 3}, {1, 137LL, 4}, {1, 133LL, 4}, + {0, 0LL, 5}, {1, 125LL, 4}, {1, 121LL, 4}, + {1, 59LL, 3}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {1, 211LL, 5}, {1, 103LL, 4}, + {1, 25LL, 2}, {1, 49LL, 3}, {1, 6LL, 0}, + {1, 47LL, 3}, {1, 23LL, 2}, {1, 45LL, 3}, + {1, 11LL, 1}, {1, 43LL, 3}, {1, 21LL, 2}, + {1, 41LL, 3}, {1, 81LL, 4}, {1, 79LL, 4}, + {1, 39LL, 3}, {1, 19LL, 2}, {1, 75LL, 4}, + {1, 147LL, 5}, {1, 9LL, 1}, {1, 71LL, 4}, + {1, 35LL, 3}, {1, 137LL, 5}, {1, 135LL, 5}, + {1, 133LL, 5}, {1, 131LL, 5}, {0, 0LL, 6}, + {1, 127LL, 5}, {1, 63LL, 4}, {1, 31LL, 3}, + {1, 61LL, 4}, {1, 15LL, 2}, {1, 59LL, 4}, + {1, 29LL, 3}, {1, 57LL, 4}, {1, 113LL, 5}, + {1, 7LL, 1}, {1, 55LL, 4}, {1, 27LL, 3}, + {1, 107LL, 5}, {1, 53LL, 4}, {1, 13LL, 2}, + {1, 103LL, 5}, {1, 51LL, 4}, {1, 25LL, 3}, + {1, 99LL, 5}, {1, 49LL, 4}, {1, 97LL, 5}, + {1, 3LL, 0}, {1, 95LL, 5}, {1, 47LL, 4}, + {1, 93LL, 5}, {1, 23LL, 3}, {1, 91LL, 5}, + {1, 45LL, 4}, {1, 89LL, 5}, {1, 11LL, 2}, + {1, 87LL, 5}, {1, 43LL, 4}, {1, 85LL, 5}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 81LL, 5}, {1, 5LL, 1}, + {1, 79LL, 5}, {1, 157LL, 6}, {1, 39LL, 4}, + {1, 77LL, 5}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 75LL, 5}, {1, 37LL, 4}, {1, 147LL, 6}, + {1, 73LL, 5}, {1, 9LL, 2}, {1, 143LL, 6}, + {1, 71LL, 5}, {1, 141LL, 6}, {1, 35LL, 4}, + {1, 69LL, 5}, {1, 137LL, 6}, {1, 17LL, 3}, + {1, 135LL, 6}, {1, 67LL, 5}, {1, 133LL, 6}, + {1, 33LL, 4}, {1, 131LL, 6}, {1, 65LL, 5}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 
0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, +}; +static const uniform int64 __idiv_table_u16[][3] = { + {0, 0LL, 1}, {1, 43691LL, 1}, {0, 0LL, 2}, + {1, 52429LL, 2}, {1, 43691LL, 2}, {2, 9363LL, 2}, + {0, 0LL, 3}, {1, 58255LL, 3}, {1, 52429LL, 3}, + {1, 47663LL, 3}, {1, 43691LL, 3}, {1, 20165LL, 2}, + {2, 9363LL, 3}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 61681LL, 4}, {1, 58255LL, 4}, {1, 55189LL, 4}, + {1, 52429LL, 4}, {2, 34329LL, 4}, {1, 47663LL, 4}, + {2, 25645LL, 4}, {1, 43691LL, 4}, {2, 18351LL, 4}, + {1, 20165LL, 3}, {2, 12137LL, 4}, {2, 9363LL, 4}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {2, 2115LL, 4}, + {0, 0LL, 5}, {1, 63551LL, 5}, {1, 61681LL, 5}, + {1, 59919LL, 5}, {1, 58255LL, 5}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {2, 42011LL, 5}, {1, 52429LL, 5}, + {2, 36765LL, 5}, {2, 34329LL, 5}, {1, 48771LL, 5}, + {1, 47663LL, 5}, {1, 11651LL, 3}, {2, 25645LL, 5}, + {2, 23705LL, 5}, {1, 43691LL, 5}, {2, 20063LL, 5}, + {2, 18351LL, 5}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 39569LL, 5}, {2, 12137LL, 5}, {2, 10725LL, 5}, + {2, 9363LL, 5}, {2, 8049LL, 5}, {1, 18079LL, 4}, + {1, 35545LL, 5}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {2, 2115LL, 5}, {2, 1041LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 63551LL, 6}, {1, 31301LL, 5}, + {1, 61681LL, 6}, {2, 56039LL, 6}, {1, 59919LL, 6}, + {1, 59075LL, 6}, {1, 58255LL, 6}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {2, 46313LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {2, 42011LL, 6}, {1, 53093LL, 6}, + {1, 52429LL, 6}, {1, 25891LL, 5}, {2, 36765LL, 6}, + {1, 25267LL, 5}, {2, 34329LL, 6}, {1, 49345LL, 6}, + {1, 48771LL, 6}, {1, 48211LL, 6}, {1, 47663LL, 6}, + {2, 28719LL, 6}, {1, 11651LL, 4}, {2, 26647LL, 6}, + {2, 25645LL, 6}, {2, 24665LL, 6}, {2, 23705LL, 6}, + {1, 44151LL, 6}, {1, 43691LL, 6}, {2, 20945LL, 6}, + {2, 20063LL, 6}, {1, 42367LL, 6}, {2, 18351LL, 6}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 39569LL, 6}, + {2, 12863LL, 6}, {2, 12137LL, 6}, {1, 2405LL, 2}, + {2, 10725LL, 6}, {1, 37787LL, 6}, {2, 9363LL, 6}, + {1, 18559LL, 5}, {2, 8049LL, 6}, {2, 7409LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 35545LL, 6}, + {2, 4957LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {2, 2665LL, 6}, {2, 2115LL, 6}, + {2, 1573LL, 6}, {2, 1041LL, 6}, {2, 517LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 63551LL, 7}, {1, 63073LL, 7}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 61681LL, 7}, + {1, 61231LL, 7}, {2, 56039LL, 7}, {1, 30175LL, 6}, + {1, 59919LL, 7}, {1, 29747LL, 6}, {1, 59075LL, 7}, + {1, 29331LL, 6}, {1, 58255LL, 7}, {1, 57853LL, 7}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {2, 46313LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {2, 42705LL, 7}, {2, 42011LL, 7}, {1, 53431LL, 7}, + {1, 53093LL, 7}, {1, 52759LL, 7}, {1, 52429LL, 7}, + {2, 38671LL, 7}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {2, 36765LL, 7}, {2, 36145LL, 7}, {1, 25267LL, 6}, + {2, 34927LL, 7}, {2, 34329LL, 7}, {1, 49637LL, 7}, + {1, 49345LL, 7}, {2, 32577LL, 7}, {1, 48771LL, 7}, + {2, 31443LL, 7}, {1, 48211LL, 7}, {1, 47935LL, 
7}, + {1, 47663LL, 7}, {2, 29251LL, 7}, {2, 28719LL, 7}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {2, 26647LL, 7}, {1, 2865LL, 3}, {2, 25645LL, 7}, + {1, 1417LL, 2}, {2, 24665LL, 7}, {1, 44859LL, 7}, + {2, 23705LL, 7}, {2, 23233LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 43691LL, 7}, {2, 21393LL, 7}, + {2, 20945LL, 7}, {1, 43019LL, 7}, {2, 20063LL, 7}, + {1, 21291LL, 6}, {1, 42367LL, 7}, {1, 21077LL, 6}, + {2, 18351LL, 7}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {2, 17111LL, 7}, {1, 41121LL, 7}, {2, 16305LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 40137LL, 7}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 39569LL, 7}, {2, 13231LL, 7}, {2, 12863LL, 7}, + {1, 39017LL, 7}, {2, 12137LL, 7}, {2, 11779LL, 7}, + {1, 2405LL, 3}, {2, 11073LL, 7}, {2, 10725LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {2, 9699LL, 7}, + {2, 9363LL, 7}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {2, 8373LL, 7}, {2, 8049LL, 7}, {1, 4579LL, 4}, + {2, 7409LL, 7}, {2, 7093LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {2, 5857LL, 7}, + {1, 35545LL, 7}, {1, 35395LL, 7}, {2, 4957LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {2, 3507LL, 7}, {1, 8595LL, 5}, + {2, 2943LL, 7}, {2, 2665LL, 7}, {1, 16981LL, 6}, + {2, 2115LL, 7}, {2, 1843LL, 7}, {2, 1573LL, 7}, + {1, 33421LL, 7}, {2, 1041LL, 7}, {1, 33157LL, 7}, + {2, 517LL, 7}, {1, 32897LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s16[][3] = { + {0, 0LL, 1}, {1, 21846LL, 0}, {0, 0LL, 2}, + {1, 26215LL, 1}, {1, 10923LL, 0}, {1, 18725LL, 1}, + {0, 0LL, 3}, {1, 7282LL, 0}, {1, 26215LL, 2}, + {1, 5958LL, 0}, {1, 10923LL, 1}, {1, 20165LL, 2}, + {1, 18725LL, 2}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 30841LL, 3}, {1, 3641LL, 0}, {1, 55189LL, 4}, + {1, 26215LL, 3}, {1, 49933LL, 4}, {1, 2979LL, 0}, + {1, 45591LL, 4}, {1, 10923LL, 2}, {1, 5243LL, 1}, + {1, 20165LL, 3}, {1, 38837LL, 4}, {1, 18725LL, 3}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {1, 16913LL, 3}, + {0, 0LL, 5}, {1, 1986LL, 0}, {1, 30841LL, 4}, + {1, 3745LL, 1}, {1, 3641LL, 1}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {1, 26887LL, 4}, {1, 26215LL, 4}, + {1, 51151LL, 5}, {1, 49933LL, 5}, {1, 12193LL, 3}, + {1, 2979LL, 1}, {1, 11651LL, 3}, {1, 45591LL, 5}, + {1, 44621LL, 5}, {1, 10923LL, 3}, {1, 2675LL, 1}, + {1, 5243LL, 2}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 19785LL, 4}, {1, 38837LL, 5}, {1, 38131LL, 5}, + {1, 18725LL, 4}, {1, 36793LL, 5}, {1, 18079LL, 4}, + {1, 17773LL, 4}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {1, 16913LL, 4}, {1, 33289LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 993LL, 0}, {1, 31301LL, 5}, + {1, 30841LL, 5}, {1, 15197LL, 4}, {1, 3745LL, 2}, + {1, 14769LL, 4}, {1, 3641LL, 2}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {1, 55925LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {1, 26887LL, 5}, {1, 26547LL, 5}, + {1, 26215LL, 5}, {1, 25891LL, 5}, {1, 51151LL, 6}, + {1, 25267LL, 5}, {1, 49933LL, 6}, {1, 24673LL, 5}, + {1, 12193LL, 4}, {1, 48211LL, 6}, {1, 2979LL, 2}, + {1, 5891LL, 3}, {1, 11651LL, 4}, {1, 11523LL, 4}, + {1, 45591LL, 6}, {1, 45101LL, 6}, {1, 44621LL, 6}, + {1, 44151LL, 6}, {1, 10923LL, 4}, {1, 43241LL, 6}, + {1, 2675LL, 2}, {1, 662LL, 0}, {1, 5243LL, 3}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 19785LL, 5}, + {1, 1225LL, 1}, {1, 38837LL, 6}, {1, 2405LL, 2}, + {1, 38131LL, 6}, {1, 37787LL, 6}, {1, 18725LL, 5}, + {1, 18559LL, 5}, {1, 36793LL, 6}, {1, 36473LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 17773LL, 5}, + {1, 35247LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {1, 
34101LL, 6}, {1, 16913LL, 5}, + {1, 33555LL, 6}, {1, 33289LL, 6}, {1, 33027LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 993LL, 1}, {1, 31537LL, 6}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 30841LL, 6}, + {1, 3827LL, 3}, {1, 15197LL, 5}, {1, 30175LL, 6}, + {1, 3745LL, 3}, {1, 29747LL, 6}, {1, 14769LL, 5}, + {1, 29331LL, 6}, {1, 3641LL, 3}, {1, 28927LL, 6}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {1, 55925LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {1, 54121LL, 7}, {1, 26887LL, 6}, {1, 6679LL, 4}, + {1, 26547LL, 6}, {1, 6595LL, 4}, {1, 26215LL, 6}, + {1, 6513LL, 4}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {1, 51151LL, 7}, {1, 50841LL, 7}, {1, 25267LL, 6}, + {1, 6279LL, 4}, {1, 49933LL, 7}, {1, 24819LL, 6}, + {1, 24673LL, 6}, {1, 49057LL, 7}, {1, 12193LL, 5}, + {1, 24245LL, 6}, {1, 48211LL, 7}, {1, 749LL, 1}, + {1, 2979LL, 3}, {1, 23697LL, 6}, {1, 5891LL, 4}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {1, 11523LL, 5}, {1, 2865LL, 3}, {1, 45591LL, 7}, + {1, 1417LL, 2}, {1, 45101LL, 7}, {1, 11215LL, 5}, + {1, 44621LL, 7}, {1, 44385LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 10923LL, 5}, {1, 43465LL, 7}, + {1, 43241LL, 7}, {1, 43019LL, 7}, {1, 2675LL, 3}, + {1, 21291LL, 6}, {1, 331LL, 0}, {1, 21077LL, 6}, + {1, 5243LL, 4}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {1, 10331LL, 5}, {1, 41121LL, 7}, {1, 40921LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 20069LL, 6}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 19785LL, 6}, {1, 4923LL, 4}, {1, 1225LL, 2}, + {1, 39017LL, 7}, {1, 38837LL, 7}, {1, 19329LL, 6}, + {1, 2405LL, 3}, {1, 38305LL, 7}, {1, 38131LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {1, 18809LL, 6}, + {1, 18725LL, 6}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {1, 36955LL, 7}, {1, 36793LL, 7}, {1, 4579LL, 4}, + {1, 36473LL, 7}, {1, 36315LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {1, 35697LL, 7}, + {1, 17773LL, 6}, {1, 8849LL, 5}, {1, 35247LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {1, 17261LL, 6}, {1, 8595LL, 5}, + {1, 535LL, 1}, {1, 34101LL, 7}, {1, 16981LL, 6}, + {1, 16913LL, 6}, {1, 16845LL, 6}, {1, 33555LL, 7}, + {1, 33421LL, 7}, {1, 33289LL, 7}, {1, 33157LL, 7}, + {1, 33027LL, 7}, {1, 32897LL, 7}, {1, 32769LL, 7}, +}; +static const uniform int64 __idiv_table_u32[][3] = { + {0, 0LL, 1}, {1, 2863311531LL, 1}, {0, 0LL, 2}, + {1, 3435973837LL, 2}, {1, 2863311531LL, 2}, {2, 613566757LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 3435973837LL, 3}, + {1, 3123612579LL, 3}, {1, 2863311531LL, 3}, {1, 1321528399LL, 2}, + {2, 613566757LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 4042322161LL, 4}, {1, 954437177LL, 2}, {2, 2938661835LL, 4}, + {1, 3435973837LL, 4}, {2, 2249744775LL, 4}, {1, 3123612579LL, 4}, + {1, 2987803337LL, 4}, {1, 2863311531LL, 4}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {2, 795364315LL, 4}, {2, 613566757LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {2, 138547333LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 4042322161LL, 5}, + {2, 3558687189LL, 5}, {1, 954437177LL, 3}, {2, 3134165325LL, 5}, + {2, 2938661835LL, 5}, {2, 2753184165LL, 5}, {1, 3435973837LL, 5}, + {1, 3352169597LL, 5}, {2, 2249744775LL, 5}, {1, 799063683LL, 3}, + {1, 3123612579LL, 5}, {2, 1813430637LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 2863311531LL, 5}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {2, 891408307LL, 5}, {2, 795364315LL, 5}, {2, 702812831LL, 5}, + {2, 
613566757LL, 5}, {2, 527452125LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {2, 138547333LL, 5}, {2, 68174085LL, 5}, {0, 0LL, 6}, + {1, 4228890877LL, 6}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 4042322161LL, 6}, {1, 1991868891LL, 5}, {2, 3558687189LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {2, 3235934265LL, 6}, + {2, 3134165325LL, 6}, {1, 458129845LL, 3}, {2, 2938661835LL, 6}, + {1, 892460737LL, 4}, {2, 2753184165LL, 6}, {1, 3479467177LL, 6}, + {1, 3435973837LL, 6}, {1, 3393554407LL, 6}, {1, 3352169597LL, 6}, + {1, 827945503LL, 4}, {2, 2249744775LL, 6}, {1, 3233857729LL, 6}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 3123612579LL, 6}, + {1, 3088515809LL, 6}, {2, 1813430637LL, 6}, {2, 1746305385LL, 6}, + {1, 2987803337LL, 6}, {1, 2955676419LL, 6}, {1, 2924233053LL, 6}, + {2, 1491936009LL, 6}, {1, 2863311531LL, 6}, {2, 1372618415LL, 6}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {2, 1148159575LL, 6}, {1, 2694881441LL, 6}, {2, 1042467791LL, 6}, + {1, 1321528399LL, 5}, {2, 940802361LL, 6}, {2, 891408307LL, 6}, + {2, 842937507LL, 6}, {2, 795364315LL, 6}, {2, 748664025LL, 6}, + {2, 702812831LL, 6}, {2, 657787785LL, 6}, {2, 613566757LL, 6}, + {2, 570128403LL, 6}, {2, 527452125LL, 6}, {2, 485518043LL, 6}, + {1, 2369637129LL, 6}, {2, 403800345LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {2, 248469183LL, 6}, + {1, 1126548799LL, 5}, {2, 174592167LL, 6}, {2, 138547333LL, 6}, + {1, 274877907LL, 3}, {2, 68174085LL, 6}, {2, 33818641LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 4228890877LL, 7}, + {1, 4196609267LL, 7}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 4042322161LL, 7}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {2, 3558687189LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 3844446251LL, 7}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {2, 3235934265LL, 7}, {1, 3739835469LL, 7}, {2, 3134165325LL, 7}, + {1, 3689636335LL, 7}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {2, 2938661835LL, 7}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 3546811703LL, 7}, {2, 2753184165LL, 7}, {1, 875407347LL, 5}, + {1, 3479467177LL, 7}, {2, 2620200175LL, 7}, {1, 3435973837LL, 7}, + {1, 3414632385LL, 7}, {1, 3393554407LL, 7}, {1, 3372735055LL, 7}, + {1, 3352169597LL, 7}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {2, 2249744775LL, 7}, {1, 1626496491LL, 6}, + {1, 3233857729LL, 7}, {2, 2134925265LL, 7}, {1, 799063683LL, 5}, + {2, 2060591247LL, 7}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 3123612579LL, 7}, {2, 1916962805LL, 7}, {1, 3088515809LL, 7}, + {2, 1847555765LL, 7}, {2, 1813430637LL, 7}, {1, 3037324939LL, 7}, + {2, 1746305385LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {2, 1648338801LL, 7}, {1, 2955676419LL, 7}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {2, 1522554545LL, 7}, {2, 1491936009LL, 7}, + {1, 2878302691LL, 7}, {1, 2863311531LL, 7}, {1, 356059465LL, 4}, + {2, 1372618415LL, 7}, {2, 1343553873LL, 7}, {1, 1402438301LL, 6}, + {2, 1286310003LL, 7}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {2, 1148159575LL, 7}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {2, 1042467791LL, 7}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {2, 940802361LL, 7}, {1, 2605477791LL, 7}, + {2, 891408307LL, 7}, {1, 2581013211LL, 7}, {2, 842937507LL, 7}, + {1, 1278501893LL, 6}, {2, 
795364315LL, 7}, {2, 771906565LL, 7}, + {2, 748664025LL, 7}, {2, 725633745LL, 7}, {2, 702812831LL, 7}, + {2, 680198441LL, 7}, {2, 657787785LL, 7}, {2, 635578121LL, 7}, + {2, 613566757LL, 7}, {1, 2443359173LL, 7}, {2, 570128403LL, 7}, + {2, 548696263LL, 7}, {2, 527452125LL, 7}, {1, 1200340205LL, 6}, + {2, 485518043LL, 7}, {2, 464823301LL, 7}, {1, 2369637129LL, 7}, + {2, 423966729LL, 7}, {2, 403800345LL, 7}, {2, 383805589LL, 7}, + {1, 582368447LL, 5}, {2, 344322273LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {2, 248469183LL, 7}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {2, 192835267LL, 7}, {2, 174592167LL, 7}, {2, 156496785LL, 7}, + {2, 138547333LL, 7}, {2, 120742053LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {2, 68174085LL, 7}, {1, 2172947881LL, 7}, + {2, 33818641LL, 7}, {1, 2155905153LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s32[][3] = { + {0, 0LL, 1}, {1, 1431655766LL, 0}, {0, 0LL, 2}, + {1, 1717986919LL, 1}, {1, 715827883LL, 0}, {1, 2454267027LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 1717986919LL, 2}, + {1, 780903145LL, 1}, {1, 715827883LL, 1}, {1, 1321528399LL, 2}, + {1, 2454267027LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 2021161081LL, 3}, {1, 954437177LL, 2}, {1, 1808407283LL, 3}, + {1, 1717986919LL, 3}, {1, 818089009LL, 2}, {1, 780903145LL, 2}, + {1, 2987803337LL, 4}, {1, 715827883LL, 2}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {1, 1272582903LL, 3}, {1, 2454267027LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {1, 2216757315LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 2021161081LL, 4}, + {1, 3926827243LL, 5}, {1, 954437177LL, 3}, {1, 3714566311LL, 5}, + {1, 1808407283LL, 4}, {1, 3524075731LL, 5}, {1, 1717986919LL, 4}, + {1, 1676084799LL, 4}, {1, 818089009LL, 3}, {1, 799063683LL, 3}, + {1, 780903145LL, 3}, {1, 3054198967LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 715827883LL, 3}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {1, 1296593901LL, 4}, {1, 1272582903LL, 4}, {1, 156180629LL, 1}, + {1, 2454267027LL, 5}, {1, 2411209711LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {1, 2216757315LL, 5}, {1, 2181570691LL, 5}, {0, 0LL, 6}, + {1, 2114445439LL, 5}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 2021161081LL, 5}, {1, 1991868891LL, 5}, {1, 3926827243LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {1, 3765450781LL, 6}, + {1, 3714566311LL, 6}, {1, 458129845LL, 3}, {1, 1808407283LL, 5}, + {1, 892460737LL, 4}, {1, 3524075731LL, 6}, {1, 1739733589LL, 5}, + {1, 1717986919LL, 5}, {1, 424194301LL, 3}, {1, 1676084799LL, 5}, + {1, 827945503LL, 4}, {1, 818089009LL, 4}, {1, 1616928865LL, 5}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 780903145LL, 4}, + {1, 3088515809LL, 6}, {1, 3054198967LL, 6}, {1, 3020636341LL, 6}, + {1, 2987803337LL, 6}, {1, 738919105LL, 4}, {1, 2924233053LL, 6}, + {1, 2893451653LL, 6}, {1, 715827883LL, 4}, {1, 354224107LL, 3}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {1, 680390859LL, 4}, {1, 2694881441LL, 6}, {1, 333589693LL, 3}, + {1, 1321528399LL, 5}, {1, 2617884829LL, 6}, {1, 1296593901LL, 5}, + {1, 1284476201LL, 5}, {1, 1272582903LL, 5}, {1, 2521815661LL, 6}, + {1, 156180629LL, 2}, {1, 2476377541LL, 6}, {1, 2454267027LL, 6}, + {1, 1216273925LL, 5}, {1, 2411209711LL, 6}, {1, 1195121335LL, 5}, + {1, 2369637129LL, 6}, {1, 2349383821LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, 
{1, 70991195LL, 1}, + {1, 1126548799LL, 5}, {1, 558694933LL, 4}, {1, 2216757315LL, 6}, + {1, 274877907LL, 3}, {1, 2181570691LL, 6}, {1, 2164392969LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 2114445439LL, 6}, + {1, 1049152317LL, 5}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 2021161081LL, 6}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {1, 3926827243LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 961111563LL, 5}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {1, 3765450781LL, 7}, {1, 1869917735LL, 6}, {1, 3714566311LL, 7}, + {1, 230602271LL, 3}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {1, 1808407283LL, 6}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 443351463LL, 4}, {1, 3524075731LL, 7}, {1, 875407347LL, 5}, + {1, 1739733589LL, 6}, {1, 432197967LL, 4}, {1, 1717986919LL, 6}, + {1, 3414632385LL, 7}, {1, 424194301LL, 4}, {1, 210795941LL, 3}, + {1, 1676084799LL, 6}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {1, 818089009LL, 5}, {1, 1626496491LL, 6}, + {1, 1616928865LL, 6}, {1, 3214946281LL, 7}, {1, 799063683LL, 5}, + {1, 397222409LL, 4}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 780903145LL, 5}, {1, 3105965051LL, 7}, {1, 3088515809LL, 7}, + {1, 3071261531LL, 7}, {1, 3054198967LL, 7}, {1, 759331235LL, 5}, + {1, 3020636341LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {1, 2971653049LL, 7}, {1, 738919105LL, 5}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {1, 2908760921LL, 7}, {1, 2893451653LL, 7}, + {1, 2878302691LL, 7}, {1, 715827883LL, 5}, {1, 356059465LL, 4}, + {1, 354224107LL, 4}, {1, 2819260585LL, 7}, {1, 1402438301LL, 6}, + {1, 1395319325LL, 6}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {1, 680390859LL, 5}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {1, 333589693LL, 4}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {1, 2617884829LL, 7}, {1, 81421181LL, 2}, + {1, 1296593901LL, 6}, {1, 2581013211LL, 7}, {1, 1284476201LL, 6}, + {1, 1278501893LL, 6}, {1, 1272582903LL, 6}, {1, 2533436931LL, 7}, + {1, 2521815661LL, 7}, {1, 2510300521LL, 7}, {1, 156180629LL, 3}, + {1, 2487582869LL, 7}, {1, 2476377541LL, 7}, {1, 2465272709LL, 7}, + {1, 2454267027LL, 7}, {1, 2443359173LL, 7}, {1, 1216273925LL, 6}, + {1, 605457945LL, 5}, {1, 2411209711LL, 7}, {1, 1200340205LL, 6}, + {1, 1195121335LL, 6}, {1, 2379895299LL, 7}, {1, 2369637129LL, 7}, + {1, 2359467013LL, 7}, {1, 2349383821LL, 7}, {1, 2339386443LL, 7}, + {1, 582368447LL, 5}, {1, 2319644785LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {1, 70991195LL, 2}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {1, 1121950641LL, 6}, {1, 558694933LL, 5}, {1, 2225732041LL, 7}, + {1, 2216757315LL, 7}, {1, 2207854675LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {1, 2181570691LL, 7}, {1, 2172947881LL, 7}, + {1, 2164392969LL, 7}, {1, 2155905153LL, 7}, {1, 2147483649LL, 7}, +}; + +__declspec(safe) +static unmasked inline unsigned int8 +__fast_idiv(unsigned int8 numerator, uniform unsigned int8 divisor) { + uniform int64 method = __idiv_table_u8[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; + uniform int64 shift = __idiv_table_u8[divisor-2][2]; + + unsigned int16 mult = multiplier; + unsigned int16 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (8 + shift); + else { + val *= mult; + val >>= 8; + 
val += (numerator-val)>>1; + return (val >> shift); + } +} + +__declspec(safe) +static unmasked inline int8 __fast_idiv(int8 numerator, uniform int8 divisor) { + uniform int8 method = __idiv_table_s8[divisor-2][0]; + uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; + uniform int8 shift = __idiv_table_s8[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int8 sign = numerator >> 7; + numerator ^= sign; + int16 mul = (int16)numerator * (int16)multiplier; + mul >>= 8 + shift; + return (int8)mul ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { + uniform int64 method = __idiv_table_u16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; + uniform int64 shift = __idiv_table_u16[divisor-2][2]; + + unsigned int32 mult = multiplier; + unsigned int32 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (16 + shift); + else { + val *= mult; + val >>= 16; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked inline int16 __fast_idiv(int16 numerator, uniform int16 divisor) { + uniform int64 method = __idiv_table_s16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; + uniform int64 shift = __idiv_table_s16[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int16 sign = numerator >> 15; + numerator ^= sign; + int32 mul = (int32)numerator * (int32)multiplier; + mul >>= 16 + shift; + int16 result = mul; + return result ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { + uniform int64 method = __idiv_table_u32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; + uniform int64 shift = __idiv_table_u32[divisor-2][2]; + + unsigned int64 mult = multiplier; + unsigned int64 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (32 + shift); + else { + val *= mult; + val >>= 32; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked inline int32 __fast_idiv(int32 numerator, uniform int32 divisor) { + uniform int64 method = __idiv_table_s32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; + uniform int64 shift = __idiv_table_s32[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int32 sign = numerator >> 31; + numerator ^= sign; + int64 mul = (int64)numerator * (int64)multiplier; + mul >>= 32 + shift; + int32 result = mul; + return result ^ sign; + } +} + +/////////////////////////////////////////////////////////////////////////// +// Saturating int8/int16 ops + +__declspec(safe) +static unmasked inline unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) { + return __avg_up_uint8(a, b); +} + +__declspec(safe) +static unmasked inline int8 avg_up(int8 a, int8 b) { + return __avg_up_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) { + return __avg_up_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_up(int16 a, int16 b) { + return __avg_up_int16(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) { + return __avg_down_uint8(a, b); +} + +__declspec(safe)
+static unmasked inline int8 avg_down(int8 a, int8 b) { + return __avg_down_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) { + return __avg_down_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_down(int16 a, int16 b) { + return __avg_down_int16(a, b); +} diff --git a/sym.cpp b/sym.cpp index f16f5e11..05f9996a 100644 --- a/sym.cpp +++ b/sym.cpp @@ -214,6 +214,17 @@ SymbolTable::LookupType(const char *name) const { return NULL; } +bool +SymbolTable::ContainsType(const Type *type) const { + TypeMapType::const_iterator iter = types.begin(); + while (iter != types.end()) { + if (iter->second == type) { + return true; + } + iter++; + } + return false; +} std::vector<std::string> SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const { diff --git a/sym.h b/sym.h index efb532a3..761c3612 100644 --- a/sym.h +++ b/sym.h @@ -219,6 +219,12 @@ public: @return Pointer to the Type, if found; otherwise NULL is returned. */ const Type *LookupType(const char *name) const; + + /** Check whether the given type is already present in the symbol table. + + @return True if the type is found, false otherwise. + */ + bool ContainsType(const Type * type) const; /** This method returns zero or more strings with the names of symbols in the symbol table that nearly (but not exactly) match the given diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc index 59964d6d..32d3bcba 100644 --- a/tests/aossoa-1.ispc +++ b/tests/aossoa-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc index 9ff82226..df8eae5c 100644 --- a/tests/aossoa-2.ispc +++ b/tests/aossoa-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc index eb4fed3a..d6346455 100644 --- a/tests/aossoa-5.ispc +++ b/tests/aossoa-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc index b64cd10b..7c177fde 100644 --- a/tests/aossoa-6.ispc +++ b/tests/aossoa-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc index c27ad99c..d6359555 100644 --- a/tests/atomics-12.ispc +++ b/tests/atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 30 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(30,
programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc index 86faaddb..dea3bfc3 100644 --- a/tests/atomics-13.ispc +++ b/tests/atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 32 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max((int32)b)); } diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc index 30b343d1..ac746ad2 100644 --- a/tests/atomics-4.ispc +++ b/tests/atomics-4.ispc @@ -5,10 +5,10 @@ uniform int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_or_global(&s, (1< 0 ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/coalesce-1.ispc b/tests/coalesce-1.ispc index acfe8cdf..39a79a91 100644 --- a/tests/coalesce-1.ispc +++ b/tests/coalesce-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; assert(programIndex <= 64); diff --git a/tests/coalesce-2.ispc b/tests/coalesce-2.ispc index 88b952a4..a047e456 100644 --- a/tests/coalesce-2.ispc +++ b/tests/coalesce-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; RET[programIndex] = buf[programIndex & 1]; diff --git a/tests/coalesce-3.ispc b/tests/coalesce-3.ispc index 7a05963f..c1718b4f 100644 --- a/tests/coalesce-3.ispc +++ b/tests/coalesce-3.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; RET[programIndex] = buf[(programIndex >> 2) * 16 + (programIndex & 3)]; diff --git a/tests/coalesce-4.ispc b/tests/coalesce-4.ispc index 1ddd4b89..182a4d4f 100644 --- a/tests/coalesce-4.ispc +++ b/tests/coalesce-4.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[2*programIndex]; diff --git a/tests/coalesce-5.ispc b/tests/coalesce-5.ispc index 2dd8d44e..385e8526 100644 --- a/tests/coalesce-5.ispc +++ b/tests/coalesce-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + 
uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-6.ispc b/tests/coalesce-6.ispc index 2a54a2db..8c630a45 100644 --- a/tests/coalesce-6.ispc +++ b/tests/coalesce-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-7.ispc b/tests/coalesce-7.ispc index 8ed628bd..29b56b8d 100644 --- a/tests/coalesce-7.ispc +++ b/tests/coalesce-7.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-8.ispc b/tests/coalesce-8.ispc index dfefaa19..f01ca9c3 100644 --- a/tests/coalesce-8.ispc +++ b/tests/coalesce-8.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; int index = (programIndex < 4) ? (programIndex & 1) : diff --git a/tests/count-leading-trailing-zeros-1.ispc b/tests/count-leading-trailing-zeros-1.ispc index 221d066d..3f12c07d 100644 --- a/tests/count-leading-trailing-zeros-1.ispc +++ b/tests/count-leading-trailing-zeros-1.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - RET[programIndex] = count_trailing_zeros(0xf0); + RET[programIndex] = count_trailing_zeros(0xf0ul); } export void result(uniform float RET[]) { diff --git a/tests/count-leading-trailing-zeros-4.ispc b/tests/count-leading-trailing-zeros-4.ispc index 475c18ca..4b849018 100644 --- a/tests/count-leading-trailing-zeros-4.ispc +++ b/tests/count-leading-trailing-zeros-4.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - int32 i = (1 << (programIndex % 28)); + int32 i = (1ul << (programIndex % 28)); RET[programIndex] = count_leading_zeros(i); } diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc index 5d2bcd1f..b742a91e 100644 --- a/tests/exclusive-scan-and-2.ispc +++ b/tests/exclusive-scan-and-2.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { RET[programIndex] = -1; - int32 a = ~(1 << programIndex); + int32 a = ~(1ul << programIndex); if ((programIndex < 32) && (programIndex & 1) == 0) { RET[programIndex] = exclusive_scan_and(a); } @@ -15,7 +15,7 @@ export void result(uniform float RET[]) { if ((programIndex & 1) == 0 && programIndex > 0 && programIndex < 32) { int val = 0xffffffff; for (int i = 0; i < programIndex-1; i += 2) - val &= ~(1< 32) break; + } + } + } + + for (int8 num = 0; num < 
127; ++num) { + for (uniform int8 div = 2; div < 127; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 64) break; + } + } + } + + for (int16 num = 0; num < 32767; ++num) { + for (uniform int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 96) break; + } + } + } + + for (unsigned int16 num = 0; num < 0xffff; ++num) { + for (uniform unsigned int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 128) break; + } + } + } + + // randomly sample int32s... + uniform RNGState state; + seed_rng(&state, 1234); + for (uniform int i = 0; i < 64k; ++i) { + unsigned int32 num = random(&state); + for (uniform unsigned int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 160) break; + } + } + } + + for (uniform int64 i = 0; i < 64k; ++i) { + int32 num = random(&state); + if (num < 0) + continue; + for (uniform int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 192) break; + } + } + } + + RET[programIndex] = errorCount; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/tests/kilo-mega-giga-2.ispc b/tests/kilo-mega-giga-2.ispc index 77e201ef..42545b8d 100644 --- a/tests/kilo-mega-giga-2.ispc +++ b/tests/kilo-mega-giga-2.ispc @@ -8,5 +8,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { } export void result(uniform float RET[]) { - RET[programIndex] = 2*1024*1024 + 5; + RET[programIndex] = 2ul*1024ul*1024ul + 5; } diff --git a/tests/ldexp-double.ispc b/tests/ldexp-double.ispc index 6b3ed734..e1b7a59f 100644 --- a/tests/ldexp-double.ispc +++ b/tests/ldexp-double.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - double a = 1 << (programIndex % 28); + double a = 1ul << (programIndex % 28); if (programIndex & 1) a = -a; RET[programIndex] = ldexp(a, 2); @@ -11,7 +11,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { export void result(uniform float RET[]) { int pi = programIndex % 28; - RET[programIndex] = (1 << (pi + 2)); + RET[programIndex] = (1ul << (pi + 2)); if (programIndex & 1) RET[programIndex] = -RET[programIndex]; } diff --git a/tests/ldexp-float.ispc b/tests/ldexp-float.ispc index a2ec9a27..305ae106 100644 --- a/tests/ldexp-float.ispc +++ b/tests/ldexp-float.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - float a = 1 << (programIndex % 28); + float a = 1ul << (programIndex % 28); if (programIndex & 1) a = -a; RET[programIndex] = ldexp(a, 2); @@ -11,7 +11,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { export void result(uniform float RET[]) { int pi = programIndex % 28; - RET[programIndex] = (1 << (pi + 2)); + RET[programIndex] = (1ul << (pi + 2)); if (programIndex & 1) RET[programIndex] = -RET[programIndex]; } diff --git a/tests/local-atomics-12.ispc 
b/tests/local-atomics-12.ispc index 23a30af5..358ffd34 100644 --- a/tests/local-atomics-12.ispc +++ b/tests/local-atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29 && (programIndex & 1)) - b = atomic_or_local(&s, (1 << programIndex)); + b = atomic_or_local(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(programCount, 29); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/local-atomics-13.ispc b/tests/local-atomics-13.ispc index 36fd1f1c..b9d35d09 100644 --- a/tests/local-atomics-13.ispc +++ b/tests/local-atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 28 && (programIndex & 1)) - b = atomic_or_local(&s, (1 << programIndex)); + b = atomic_or_local(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max(b)); } diff --git a/tests/local-atomics-14.ispc b/tests/local-atomics-14.ispc index 4cf81809..25c52e60 100644 --- a/tests/local-atomics-14.ispc +++ b/tests/local-atomics-14.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 32 && (programIndex & 1)) - b = atomic_or_local(&s, (1 << programIndex)); + b = atomic_or_local(&s, (1ul << programIndex)); RET[programIndex] = (s>>20); } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(32, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20; } diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc index f7f6a04a..b3648ab5 100644 --- a/tests/local-atomics-4.ispc +++ b/tests/local-atomics-4.ispc @@ -7,10 +7,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29) - atomic_or_local(&s, (1< struct Foo" for assignment operator is not possible +// Type conversion from "const uniform int[0-9]*" to "soa<4> struct Foo" for assignment operator is not possible struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-12.ispc b/tests_errors/soa-12.ispc index e2cd3242..c0420614 100644 --- a/tests_errors/soa-12.ispc +++ b/tests_errors/soa-12.ispc @@ -1,4 +1,4 @@ -// Can't convert between types "const uniform int32" and "soa<4> float" with different SOA widths +// Can't convert between types "const uniform int[0-9]*" and "soa<4> float" with different SOA widths struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-3.ispc b/tests_errors/soa-3.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-3.ispc +++ b/tests_errors/soa-3.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-4.ispc b/tests_errors/soa-4.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-4.ispc +++ b/tests_errors/soa-4.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-9.ispc b/tests_errors/soa-9.ispc index 7c6a1df9..e9e7509a 100644 --- a/tests_errors/soa-9.ispc +++ b/tests_errors/soa-9.ispc @@ 
-1,4 +1,4 @@ -// Can't convert from pointer to SOA type "soa<8> struct A * uniform" to pointer to non-SOA type "void * varying" +// Can't convert from pointer to SOA type "soa<8> struct A \* uniform" to pointer to non-SOA type "void \* varying" struct A { float a, b; }; diff --git a/tests_errors/struct_arith.ispc b/tests_errors/struct_arith.ispc index 9d942880..df729d02 100644 --- a/tests_errors/struct_arith.ispc +++ b/tests_errors/struct_arith.ispc @@ -1,4 +1,4 @@ -// Assignment operator "+=" is illegal with struct type +// Assignment operator "\+=" is illegal with struct type struct Point { float x, y, z; }; diff --git a/tests_errors/vec-size-compile-constant.ispc b/tests_errors/vec-size-compile-constant.ispc index b9e61721..0eb6f90e 100644 --- a/tests_errors/vec-size-compile-constant.ispc +++ b/tests_errors/vec-size-compile-constant.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected identifier, expecting int32 constant +// syntax error, unexpected identifier, expecting int void foo(uniform int i) { float a; diff --git a/util.cpp b/util.cpp index dbea9517..6b121988 100644 --- a/util.cpp +++ b/util.cpp @@ -79,8 +79,8 @@ compiler under a debuffer; in this case, just return a reasonable default. */ -static int -lTerminalWidth() { +int +TerminalWidth() { if (g->disableLineWrap) return 1<<30; @@ -228,8 +228,8 @@ lFindIndent(int numColons, const char *buf) { /** Print the given string to the given FILE, assuming the given output column width. Break words as needed to avoid words spilling past the last column. */ -static void -lPrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { +void +PrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { #ifdef ISPC_IS_WINDOWS fputs(buf, out); fputs("\n", out); @@ -375,7 +375,7 @@ lPrint(const char *type, bool isError, SourcePos p, const char *fmt, return; printed.insert(formattedBuf); - lPrintWithWordBreaks(formattedBuf, indent, lTerminalWidth(), stderr); + PrintWithWordBreaks(formattedBuf, indent, TerminalWidth(), stderr); lPrintFileLineContext(p); free(errorBuf); diff --git a/util.h b/util.h index b247b8bd..7edf71f7 100644 --- a/util.h +++ b/util.h @@ -156,4 +156,18 @@ void GetDirectoryAndFileName(const std::string &currentDir, bool VerifyDataLayoutCompatibility(const std::string &module_dl, const std::string &lib_dl); +/** Print the given string to the given FILE, assuming the given output + column width. Break words as needed to avoid words spilling past the + last column. */ +void PrintWithWordBreaks(const char *buf, int indent, int columnWidth, + FILE *out); + +/** Returns the width of the terminal where the compiler is running. + Finding this out may fail in a variety of reasonable situations (piping + compiler output to 'less', redirecting output to a file, running the + compiler under a debugger); in this case, just return a reasonable + default. + */ +int TerminalWidth(); + #endif // ISPC_UTIL_H
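The __fast_idiv overloads added to stdlib.ispc above all follow the same pattern: a per-divisor table row (method, multiplier, shift) selects one of three evaluations. Method 0 means the divisor is a power of two and a plain right shift suffices; method 1 widens the numerator, multiplies by the precomputed magic constant, and shifts right by the element width plus the tabulated shift; method 2, which only the unsigned overloads handle, adds a rounding correction before the final shift. The host-side C++ sketch below is not part of the patch: it re-implements the int32 evaluation using the first rows of __idiv_table_s32 shown above (divisors 2 through 8, which use only methods 0 and 1) and brute-checks non-negative numerators against ordinary division, mirroring the patch's own test loops. The type and function names in the sketch are illustrative only.

// Host-side check of the multiply/shift ("magic number") division scheme
// used by __fast_idiv; the table rows are copied from __idiv_table_s32.
#include <cstdint>
#include <cstdio>

struct IdivEntry { int64_t method, multiplier, shift; };

// Rows for divisors 2..8 from the start of __idiv_table_s32.
static const IdivEntry kTable[] = {
    {0, 0LL, 1},          // /2: power of two, shift only
    {1, 1431655766LL, 0}, // /3
    {0, 0LL, 2},          // /4: power of two, shift only
    {1, 1717986919LL, 1}, // /5
    {1, 715827883LL, 0},  // /6
    {1, 2454267027LL, 2}, // /7
    {0, 0LL, 3},          // /8: power of two, shift only
};

static int32_t fastIdivS32(int32_t numerator, int32_t divisor) {
    const IdivEntry &e = kTable[divisor - 2];
    if (e.method == 0)                            // divisor is a power of two
        return numerator >> e.shift;
    uint32_t sign = (uint32_t)(numerator >> 31);  // 0 for non-negative input
    numerator ^= (int32_t)sign;
    int64_t mul = (int64_t)numerator * e.multiplier;
    mul >>= 32 + e.shift;                         // multiply-high, then shift
    return (int32_t)mul ^ (int32_t)sign;
}

int main() {
    int mismatches = 0;
    for (int32_t n = 0; n < (1 << 20); ++n)       // non-negative numerators only
        for (int32_t d = 2; d <= 8; ++d)
            if (fastIdivS32(n, d) != n / d) {
                if (++mismatches <= 10)
                    std::printf("mismatch: %d / %d\n", (int)n, (int)d);
            }
    std::printf("total mismatches: %d\n", mismatches);
    return mismatches != 0;
}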