diff --git a/Makefile b/Makefile
index 54734f39..f2e18543 100644
--- a/Makefile
+++ b/Makefile
@@ -62,14 +62,17 @@
 CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
 	util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
-	builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
+TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
+BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
+	builtins/dispatch.ll
+BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
+	builtins-c-32.cpp builtins-c-64.cpp
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll
 
-OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
-	builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
-	$(FLEX_SRC:.ll=.o))
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
+	stdlib_generic_ispc.o stdlib_x86_ispc.o \
+	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
 
 default: ispc
 
@@ -104,6 +107,10 @@ objs/%.o: %.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
 
+objs/%.o: objs/%.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
 objs/parse.cc: parse.yy
 	@echo Running bison on $<
 	@$(YACC) -o $@ $<
@@ -120,41 +127,24 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
 
-objs/builtins-%.cpp: builtins-%.ll
-	@echo Creating C++ source from builtin definitions file $<
-	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
-
-objs/builtins-%.o: objs/builtins-%.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-c-32.cpp: builtins-c.c
+objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
 	@echo Creating C++ source from builtins definition file $<
-	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@
+	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | ./bitcode2cpp.py $< > $@
 
-objs/builtins-c-32.o: objs/builtins-c-32.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-c-64.cpp: builtins-c.c
+objs/builtins-c-32.cpp: builtins/builtins.c
 	@echo Creating C++ source from builtins definition file $<
-	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
+	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-32 > $@
 
-objs/builtins-c-64.o: objs/builtins-c-64.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+objs/builtins-c-64.cpp: builtins/builtins.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-64 > $@
 
-objs/stdlib_ispc.cpp: stdlib.ispc
-	@echo Creating C++ source from $<
-	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
+objs/stdlib_generic_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for generic
+	@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		./stdlib2cpp.py generic > $@
 
-objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll
-objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll
-objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll
-objs/builtins-sse4-x2.cpp: builtins.m4
builtins-sse4-common.ll builtins-sse4-x2.ll -objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll -objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll +objs/stdlib_x86_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for x86 + @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ + ./stdlib2cpp.py x86 > $@ diff --git a/bitcode2cpp.py b/bitcode2cpp.py index fa7d4782..a1a5d2bf 100755 --- a/bitcode2cpp.py +++ b/bitcode2cpp.py @@ -11,7 +11,8 @@ length=0 src=str(sys.argv[1]) -target = re.sub(".*builtins-", "", src) +target = re.sub("builtins/target-", "", src) +target = re.sub("builtins/", "", target) target = re.sub("\.ll$", "", target) target = re.sub("\.c$", "", target) target = re.sub("-", "_", target) diff --git a/builtins.cpp b/builtins.cpp index 5358e789..9bd41e8f 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -99,6 +99,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying + if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && + t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; else if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) @@ -194,7 +197,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) { // symbol creation code below assumes that any LLVM vector of i32s is a // varying int32. Here, we need that to be interpreted as a varying // bool, so just have a one-off override for that one... - if (name == "__sext_varying_bool") { + if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") { const Type *returnType = AtomicType::VaryingInt32; std::vector argTypes; argTypes.push_back(AtomicType::VaryingBool); @@ -556,7 +559,7 @@ lSetInternalFunctions(llvm::Module *module) { int count = sizeof(names) / sizeof(names[0]); for (int i = 0; i < count; ++i) { llvm::Function *f = module->getFunction(names[i]); - if (f != NULL) + if (f != NULL && f->empty() == false) f->setLinkage(llvm::GlobalValue::InternalLinkage); } } @@ -744,6 +747,33 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod FATAL("logic error in DefineStdlib"); } break; + case Target::GENERIC: + switch (g->target.vectorWidth) { + case 4: + extern unsigned char builtins_bitcode_generic_4[]; + extern int builtins_bitcode_generic_4_length; + AddBitcodeToModule(builtins_bitcode_generic_4, + builtins_bitcode_generic_4_length, + module, symbolTable); + break; + case 8: + extern unsigned char builtins_bitcode_generic_8[]; + extern int builtins_bitcode_generic_8_length; + AddBitcodeToModule(builtins_bitcode_generic_8, + builtins_bitcode_generic_8_length, + module, symbolTable); + break; + case 16: + extern unsigned char builtins_bitcode_generic_16[]; + extern int builtins_bitcode_generic_16_length; + AddBitcodeToModule(builtins_bitcode_generic_16, + builtins_bitcode_generic_16_length, + module, symbolTable); + break; + default: + FATAL("logic error in DefineStdlib"); + } + break; default: FATAL("logic error"); } @@ -771,11 +801,16 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod if (includeStdlibISPC) { // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its - // definitions added. 
Disable emission of performance warnings for - // now, since the user doesn't care about any of that in the stdlib - // implementation... - extern char stdlib_code[]; - yy_scan_string(stdlib_code); - yyparse(); + // definitions added. + if (g->target.isa == Target::GENERIC) { + extern char stdlib_generic_code[]; + yy_scan_string(stdlib_generic_code); + yyparse(); + } + else { + extern char stdlib_x86_code[]; + yy_scan_string(stdlib_x86_code); + yyparse(); + } } } diff --git a/builtins-c.c b/builtins/builtins.c similarity index 100% rename from builtins-c.c rename to builtins/builtins.c diff --git a/builtins-dispatch.ll b/builtins/dispatch.ll similarity index 100% rename from builtins-dispatch.ll rename to builtins/dispatch.ll diff --git a/builtins-avx-common.ll b/builtins/target-avx-common.ll similarity index 99% rename from builtins-avx-common.ll rename to builtins/target-avx-common.ll index 6b08466d..07fb12b4 100644 --- a/builtins-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -32,6 +32,9 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; AVX target implementation. +ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-avx-x2.ll b/builtins/target-avx-x2.ll similarity index 99% rename from builtins-avx-x2.ll rename to builtins/target-avx-x2.ll index 6254c405..90e2680c 100644 --- a/builtins-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -32,12 +32,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Basic 16-wide definitions -stdlib_core(16) -packed_load_and_store(16) -scans(16) -int64minmax(16) +define(`WIDTH',`16') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-avx-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-avx.ll b/builtins/target-avx.ll similarity index 99% rename from builtins-avx.ll rename to builtins/target-avx.ll index a00a527e..dc7339bd 100644 --- a/builtins-avx.ll +++ b/builtins/target-avx.ll @@ -32,12 +32,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Basic 8-wide definitions -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-avx-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll new file mode 100644 index 00000000..807fd242 --- /dev/null +++ b/builtins/target-generic-16.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll new file mode 100644 index 00000000..7eb1f300 --- /dev/null +++ b/builtins/target-generic-4.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`4') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll new file mode 100644 index 00000000..bd9261ff --- /dev/null +++ b/builtins/target-generic-8.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. 
+;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll new file mode 100644 index 00000000..b59e8d53 --- /dev/null +++ b/builtins/target-generic-common.ll @@ -0,0 +1,277 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +define(`MASK',`i1') +include(`util.m4') + +stdlib_core() + +scans() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +declare void @__fastmath() nounwind + +;; round/floor/ceil + +declare float @__round_uniform_float(float) nounwind readnone +declare float @__floor_uniform_float(float) nounwind readnone +declare float @__ceil_uniform_float(float) nounwind readnone + +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +declare @__round_varying_float() nounwind readnone +declare @__floor_varying_float() nounwind readnone +declare @__ceil_varying_float() nounwind readnone +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;; min/max + +declare float @__max_uniform_float(float, float) nounwind readnone +declare float @__min_uniform_float(float, float) nounwind readnone +declare i32 @__min_uniform_int32(i32, i32) nounwind readnone +declare i32 @__max_uniform_int32(i32, i32) nounwind readnone +declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone +declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone +declare i64 @__min_uniform_int64(i64, i64) nounwind readnone +declare i64 @__max_uniform_int64(i64, i64) nounwind readnone +declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone +declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone +declare double @__min_uniform_double(double, double) nounwind readnone +declare double @__max_uniform_double(double, double) nounwind readnone + +declare @__max_varying_float(, + ) nounwind readnone +declare @__min_varying_float(, + ) nounwind readnone +declare @__min_varying_int32(, ) nounwind readnone +declare @__max_varying_int32(, ) nounwind readnone +declare @__min_varying_uint32(, ) nounwind readnone +declare @__max_varying_uint32(, ) nounwind readnone +declare @__min_varying_int64(, ) nounwind readnone +declare @__max_varying_int64(, ) nounwind readnone +declare @__min_varying_uint64(, ) nounwind readnone +declare @__max_varying_uint64(, ) nounwind readnone +declare @__min_varying_double(, + ) nounwind readnone +declare @__max_varying_double(, + ) nounwind readnone + +;; sqrt/rsqrt/rcp + +declare float @__rsqrt_uniform_float(float) nounwind readnone +declare float @__rcp_uniform_float(float) nounwind readnone +declare float @__sqrt_uniform_float(float) nounwind readnone +declare @__rcp_varying_float() nounwind readnone +declare @__rsqrt_varying_float( %v) nounwind readnone +declare @__sqrt_varying_float() nounwind readnone + +declare double @__sqrt_uniform_double(double) nounwind readnone +declare @__sqrt_varying_double() nounwind readnone + +;; bit ops + +declare i32 @__popcnt_int32(i32) nounwind readnone +declare i64 @__popcnt_int64(i64) nounwind readnone + +declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone +declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone +declare i32 @__count_leading_zeros_i32(i32) nounwind readnone +declare i64 @__count_leading_zeros_i64(i64) nounwind readnone + +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... 
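;; Illustrative sketch, not part of the patch: the declarations in this file are
;; written against the m4 WIDTH macro (with MASK set to i1 above), so the same
;; source serves the 4-, 8-, and 16-wide generic targets. Assuming WIDTH has been
;; set to 4, a varying declaration such as __round_varying_float comes out of the
;; m4 expansion along these lines:

declare <4 x float> @__round_varying_float(<4 x float>) nounwind readnone
declare <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readnone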
+ +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; reductions + +declare i32 @__movmsk() nounwind readnone + +declare float @__reduce_add_float() nounwind readnone +declare float @__reduce_min_float() nounwind readnone +declare float @__reduce_max_float() nounwind readnone + +declare i32 @__reduce_add_int32() nounwind readnone +declare i32 @__reduce_min_int32() nounwind readnone +declare i32 @__reduce_max_int32() nounwind readnone + +declare i32 @__reduce_add_uint32( %v) nounwind readnone +declare i32 @__reduce_min_uint32() nounwind readnone +declare i32 @__reduce_max_uint32() nounwind readnone + +declare double @__reduce_add_double() nounwind readnone +declare double @__reduce_min_double() nounwind readnone +declare double @__reduce_max_double() nounwind readnone + +declare i64 @__reduce_add_int64() nounwind readnone +declare i64 @__reduce_min_int64() nounwind readnone +declare i64 @__reduce_max_int64() nounwind readnone + +declare i64 @__reduce_add_uint64( %v) nounwind readnone +declare i64 @__reduce_min_uint64() nounwind readnone +declare i64 @__reduce_max_uint64() nounwind readnone + +declare i1 @__reduce_equal_int32( %v, i32 * nocapture %samevalue, + %mask) nounwind +declare i1 @__reduce_equal_float( %v, float * nocapture %samevalue, + %mask) nounwind +declare i1 @__reduce_equal_int64( %v, i64 * nocapture %samevalue, + %mask) nounwind +declare i1 @__reduce_equal_double( %v, double * nocapture %samevalue, + %mask) nounwind + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +load_and_broadcast(WIDTH, i8, 8) +load_and_broadcast(WIDTH, i16, 16) +load_and_broadcast(WIDTH, i32, 32) +load_and_broadcast(WIDTH, i64, 64) + +declare @__load_masked_8(i8 * nocapture, %mask) nounwind readonly +declare @__load_masked_16(i8 * nocapture, %mask) nounwind readonly +declare @__load_masked_32(i8 * nocapture, %mask) nounwind readonly +declare @__load_masked_64(i8 * nocapture, %mask) nounwind readonly + +declare void @__masked_store_8(* nocapture, , + ) nounwind +declare void @__masked_store_16(* nocapture, , + ) nounwind +declare void @__masked_store_32(* nocapture, , + ) nounwind +declare void @__masked_store_64(* nocapture, , + %mask) nounwind + +ifelse(LLVM_VERSION,LLVM_3_1svn,` +define void @__masked_store_blend_8(* nocapture, , + ) nounwind { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_16(* nocapture, , + ) nounwind { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_32(* nocapture, , + ) nounwind { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_64(* nocapture, + , ) nounwind { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} +',` +declare void @__masked_store_blend_8(* nocapture, , + ) nounwind +declare void @__masked_store_blend_16(* nocapture, , + ) nounwind +declare void @__masked_store_blend_32(* nocapture, , + ) nounwind +declare void @__masked_store_blend_64(* nocapture %ptr, + %new, + %mask) nounwind +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +define(`gather_scatter', ` +declare 
@__gather_base_offsets32_$1(i8 * nocapture %ptr, %offsets, + i32 %offset_scale, %vecmask) nounwind readonly +declare @__gather_base_offsets64_$1(i8 * nocapture %ptr, %offsets, + i32 %offset_scale, %vecmask) nounwind readonly +declare @__gather32_$1( %ptrs, + %vecmask) nounwind readonly +declare @__gather64_$1( %ptrs, + %vecmask) nounwind readonly + +declare void @__scatter_base_offsets32_$1(i8* nocapture %base, %offsets, + i32 %offset_scale, %values, %mask) nounwind +declare void @__scatter_base_offsets64_$1(i8* nocapture %base, %offsets, + i32 %offset_scale, %values, %mask) nounwind +declare void @__scatter32_$1( %ptrs, %values, + %mask) nounwind +declare void @__scatter64_$1( %ptrs, %values, + %mask) nounwind +') + +gather_scatter(i8) +gather_scatter(i16) +gather_scatter(i32) +gather_scatter(i64) + +declare i32 @__packed_load_active(i32 * nocapture %startptr, * nocapture %val_ptr, + %full_mask) nounwind +declare i32 @__packed_store_active(i32 * %startptr, %vals, + %full_mask) nounwind + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +declare void @__prefetch_read_uniform_1(i8 *) nounwind readnone +declare void @__prefetch_read_uniform_2(i8 *) nounwind readnone +declare void @__prefetch_read_uniform_3(i8 *) nounwind readnone +declare void @__prefetch_read_uniform_nt(i8 *) nounwind readnone + diff --git a/builtins-sse2-common.ll b/builtins/target-sse2-common.ll similarity index 99% rename from builtins-sse2-common.ll rename to builtins/target-sse2-common.ll index 659bdda7..80c34afb 100644 --- a/builtins-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -29,6 +29,9 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse2-x2.ll b/builtins/target-sse2-x2.ll similarity index 99% rename from builtins-sse2-x2.ll rename to builtins/target-sse2-x2.ll index b5eaa889..a9d71ea9 100644 --- a/builtins-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -36,12 +36,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; standard 8-wide definitions from m4 macros -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse2-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse2.ll b/builtins/target-sse2.ll similarity index 99% rename from builtins-sse2.ll rename to builtins/target-sse2.ll index c49d6b2c..1a297199 100644 --- a/builtins-sse2.ll +++ b/builtins/target-sse2.ll @@ -33,12 +33,16 @@ ;; Define the standard library builtins for the SSE2 target ; Define some basics for a 4-wide target -stdlib_core(4) -packed_load_and_store(4) -scans(4) -int64minmax(4) +define(`WIDTH',`4') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse2-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding diff --git a/builtins-sse4-common.ll b/builtins/target-sse4-common.ll similarity index 99% rename from builtins-sse4-common.ll rename to builtins/target-sse4-common.ll index f1ee95dc..19d31ce4 100644 --- a/builtins-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -29,6 +29,9 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
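;; Illustrative sketch, not part of the patch: every per-target file now fixes
;; the vector width and mask type up front and then includes the shared macro
;; file, instead of passing the width to each macro call. For an 8-wide target
;; with a 32-bit mask this amounts to
;;
;;   define(`WIDTH',`8')
;;   define(`MASK',`i32')
;;   include(`util.m4')
;;   stdlib_core()            ;; was stdlib_core(8)
;;
;; so a type written as <WIDTH x MASK> inside util.m4 expands to <8 x i32>,
;; while the generic targets set MASK to i1 instead.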
+ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins-sse4-x2.ll b/builtins/target-sse4-x2.ll similarity index 99% rename from builtins-sse4-x2.ll rename to builtins/target-sse4-x2.ll index fd399884..764f8613 100644 --- a/builtins-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -36,12 +36,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; standard 8-wide definitions from m4 macros -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse4-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse4.ll b/builtins/target-sse4.ll similarity index 99% rename from builtins-sse4.ll rename to builtins/target-sse4.ll index 68c44a0e..7eadde4b 100644 --- a/builtins-sse4.ll +++ b/builtins/target-sse4.ll @@ -33,12 +33,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Define common 4-wide stuff -stdlib_core(4) -packed_load_and_store(4) -scans(4) -int64minmax(4) +define(`WIDTH',`4') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse4-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins.m4 b/builtins/util.m4 similarity index 82% rename from builtins.m4 rename to builtins/util.m4 index f83bdbff..8853e81c 100644 --- a/builtins.m4 +++ b/builtins/util.m4 @@ -550,103 +550,103 @@ divert`'dnl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stdlib_core ;; -;; This macro defines a bunch of helper routines that only depend on the -;; target's vector width, which it takes as its first parameter. 
+;; This macro defines a bunch of helper routines that depend on the +;; target's vector width ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`shuffles', ` -define <$1 x $2> @__broadcast_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { - %v = extractelement <$1 x $2> %0, i32 %1 - %r_0 = insertelement <$1 x $2> undef, $2 %v, i32 0 -forloop(i, 1, eval($1-1), ` %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2 %v, i32 i +define @__broadcast_$2(, i32) nounwind readnone alwaysinline { + %v = extractelement %0, i32 %1 + %r_0 = insertelement undef, $1 %v, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %r_`'i = insertelement %r_`'eval(i-1), $1 %v, i32 i ') - ret <$1 x $2> %r_`'eval($1-1) + ret %r_`'eval(WIDTH-1) } -define <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { +define @__rotate_$2(, i32) nounwind readnone alwaysinline { %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1) br i1 %isc, label %is_const, label %not_const is_const: ; though verbose, this turms into tight code if %1 is a constant -forloop(i, 0, eval($1-1), ` +forloop(i, 0, eval(WIDTH-1), ` %delta_`'i = add i32 %1, i - %delta_clamped_`'i = and i32 %delta_`'i, eval($1-1) - %v_`'i = extractelement <$1 x $2> %0, i32 %delta_clamped_`'i') + %delta_clamped_`'i = and i32 %delta_`'i, eval(WIDTH-1) + %v_`'i = extractelement %0, i32 %delta_clamped_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) not_const: ; store two instances of the vector into memory - %ptr = alloca <$1 x $2>, i32 2 - %ptr0 = getelementptr <$1 x $2> * %ptr, i32 0 - store <$1 x $2> %0, <$1 x $2> * %ptr0 - %ptr1 = getelementptr <$1 x $2> * %ptr, i32 1 - store <$1 x $2> %0, <$1 x $2> * %ptr1 + %ptr = alloca , i32 2 + %ptr0 = getelementptr * %ptr, i32 0 + store %0, * %ptr0 + %ptr1 = getelementptr * %ptr, i32 1 + store %0, * %ptr1 ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector - %offset = and i32 %1, eval($1-1) - %ptr_as_elt_array = bitcast <$1 x $2> * %ptr to [eval(2*$1) x $2] * - %load_ptr = getelementptr [eval(2*$1) x $2] * %ptr_as_elt_array, i32 0, i32 %offset - %load_ptr_vec = bitcast $2 * %load_ptr to <$1 x $2> * - %result = load <$1 x $2> * %load_ptr_vec, align $4 - ret <$1 x $2> %result + %offset = and i32 %1, eval(WIDTH-1) + %ptr_as_elt_array = bitcast * %ptr to [eval(2*WIDTH) x $1] * + %load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset + %load_ptr_vec = bitcast $1 * %load_ptr to * + %result = load * %load_ptr_vec, align $3 + ret %result } -define <$1 x $2> @__shuffle_$3(<$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline { -forloop(i, 0, eval($1-1), ` - %index_`'i = extractelement <$1 x i32> %1, i32 i') -forloop(i, 0, eval($1-1), ` - %v_`'i = extractelement <$1 x $2> %0, i32 %index_`'i') +define @__shuffle_$2(, ) nounwind readnone alwaysinline { +forloop(i, 0, eval(WIDTH-1), ` + %index_`'i = extractelement %1, i32 i') +forloop(i, 0, eval(WIDTH-1), ` + %v_`'i = extractelement %0, i32 %index_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, 
eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) } -define <$1 x $2> @__shuffle2_$3(<$1 x $2>, <$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline { - %v2 = shufflevector <$1 x $2> %0, <$1 x $2> %1, < - forloop(i, 0, eval(2*$1-2), `i32 i, ') i32 eval(2*$1-1) +define @__shuffle2_$2(, , ) nounwind readnone alwaysinline { + %v2 = shufflevector %0, %1, < + forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1) > -forloop(i, 0, eval($1-1), ` - %index_`'i = extractelement <$1 x i32> %2, i32 i') +forloop(i, 0, eval(WIDTH-1), ` + %index_`'i = extractelement %2, i32 i') - %isc = call i1 @__is_compile_time_constant_varying_int32(<$1 x i32> %2) + %isc = call i1 @__is_compile_time_constant_varying_int32( %2) br i1 %isc, label %is_const, label %not_const is_const: ; extract from the requested lanes and insert into the result; LLVM turns ; this into good code in the end -forloop(i, 0, eval($1-1), ` - %v_`'i = extractelement %v2, i32 %index_`'i') +forloop(i, 0, eval(WIDTH-1), ` + %v_`'i = extractelement %v2, i32 %index_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) not_const: ; otherwise store the two vectors onto the stack and then use the given ; permutation vector to get indices into that array... - %ptr = alloca - store %v2, * %ptr - %baseptr = bitcast * %ptr to $2 * + %ptr = alloca + store %v2, * %ptr + %baseptr = bitcast * %ptr to $1 * - %ptr_0 = getelementptr $2 * %baseptr, i32 %index_0 - %val_0 = load $2 * %ptr_0 - %result_0 = insertelement <$1 x $2> undef, $2 %val_0, i32 0 + %ptr_0 = getelementptr $1 * %baseptr, i32 %index_0 + %val_0 = load $1 * %ptr_0 + %result_0 = insertelement undef, $1 %val_0, i32 0 -forloop(i, 1, eval($1-1), ` - %ptr_`'i = getelementptr $2 * %baseptr, i32 %index_`'i - %val_`'i = load $2 * %ptr_`'i - %result_`'i = insertelement <$1 x $2> %result_`'eval(i-1), $2 %val_`'i, i32 i +forloop(i, 1, eval(WIDTH-1), ` + %ptr_`'i = getelementptr $1 * %baseptr, i32 %index_`'i + %val_`'i = load $1 * %ptr_`'i + %result_`'i = insertelement %result_`'eval(i-1), $1 %val_`'i, i32 i ') - ret <$1 x $2> %result_`'eval($1-1) + ret %result_`'eval(WIDTH-1) } ') @@ -676,18 +676,20 @@ forloop(i, 1, eval($1-1), ` define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, - <$1 x i32> %m) nounwind alwaysinline { + <$1 x MASK> %m) nounwind alwaysinline { ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. ; for the bit tricks below, we need the mask to be sign extended to be ; the size of the element type. - ifelse($3, `i64', `%mask = sext <$1 x i32> %m to <$1 x i64>') - ifelse($3, `i32', ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' + ifelse( + MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', + $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', + $3,i32, ` + ; silly workaround to do %mask = %m, which is not possible directly.. 
+ %maskmem = alloca <$1 x i32> + store <$1 x i32> %m, <$1 x i32> * %maskmem + %mask = load <$1 x i32> * %maskmem' ) ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -751,13 +753,13 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta) define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val) ret $3 %r } ', ` define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst ret $3 %r } @@ -778,11 +780,11 @@ declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)') define(`global_swap', ` define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %rptr = alloca <$1 x $2> %rptr32 = bitcast <$1 x $2> * %rptr to $2 * - per_lane($1, <$1 x i32> %mask, ` + per_lane($1, <$1 x MASK> %mask, ` %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE ifelse(LLVM_VERSION, `LLVM_2_9',` %r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', ` @@ -795,7 +797,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` } define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { ifelse(LLVM_VERSION, `LLVM_2_9',` %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', ` %r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst') @@ -816,11 +818,11 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` declare $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)') define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp, - <$1 x $2> %val, <$1 x i32> %mask) nounwind alwaysinline { + <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline { %rptr = alloca <$1 x $2> %rptr32 = bitcast <$1 x $2> * %rptr to $2 * - per_lane($1, <$1 x i32> %mask, ` + per_lane($1, <$1 x MASK> %mask, ` %cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE ifelse(LLVM_VERSION, `LLVM_2_9',` @@ -835,7 +837,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` } define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp, - $2 %val, <$1 x i32> %mask) nounwind alwaysinline { + $2 %val, <$1 x MASK> %mask) nounwind alwaysinline { ifelse(LLVM_VERSION, `LLVM_2_9',` %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', ` %r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst') @@ -844,6 +846,85 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; count trailing zeros + +define(`ctlztz', ` +define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.cttz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.cttz.i64(i64 %0) + ret i64 %c +} + +define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.ctlz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.ctlz.i64(i64 %0) + ret i64 %c +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetching + +define(`define_prefetches', ` +ifelse(LLVM_VERSION, 
`LLVM_2_9', +` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality) + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 2) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0) + ret void +} +', ` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, + i32 %cachetype) ; cachetype == 1 is dcache + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) + ret void +} +') +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + define(`stdlib_core', ` @@ -854,8 +935,8 @@ declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind -declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask) -declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>) +declare i1 @__is_compile_time_constant_mask( %mask) +declare i1 @__is_compile_time_constant_varying_int32() ; This function declares placeholder masked store functions for the ; front-end to use. @@ -869,10 +950,10 @@ declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>) ; stores (if the mask is all on) by the MaskedStoreOptPass optimization ; pass. -declare void @__pseudo_masked_store_8(<$1 x i8> * nocapture, <$1 x i8>, <$1 x i32>) -declare void @__pseudo_masked_store_16(<$1 x i16> * nocapture, <$1 x i16>, <$1 x i32>) -declare void @__pseudo_masked_store_32(<$1 x i32> * nocapture, <$1 x i32>, <$1 x i32>) -declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x i32>) +declare void @__pseudo_masked_store_8( * nocapture, , ) +declare void @__pseudo_masked_store_16( * nocapture, , ) +declare void @__pseudo_masked_store_32( * nocapture, , ) +declare void @__pseudo_masked_store_64( * nocapture, , ) ; Declare the pseudo-gather functions. When the ispc front-end needs ; to perform a gather, it generates a call to one of these functions, @@ -904,33 +985,33 @@ declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x ; converts them to native gather functions or converts them to vector ; loads, if equivalent. 
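;; Illustrative sketch, not part of the patch: with WIDTH set to 4 and MASK set
;; to i32, the parameterized pseudo-gather declarations below expand to the same
;; signatures the old <$1 x ...> forms produced for a 4-wide target, e.g.:

declare <4 x i32> @__pseudo_gather32_32(<4 x i32>, <4 x i32>) nounwind readonly
declare <4 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <4 x i32>, i32,
                                                     <4 x i32>) nounwind readonly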
-declare <$1 x i8> @__pseudo_gather32_8(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather32_16(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather32_32(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather32_64(<$1 x i32>, <$1 x i32>) nounwind readonly +declare @__pseudo_gather32_8(, ) nounwind readonly +declare @__pseudo_gather32_16(, ) nounwind readonly +declare @__pseudo_gather32_32(, ) nounwind readonly +declare @__pseudo_gather32_64(, ) nounwind readonly -declare <$1 x i8> @__pseudo_gather64_8(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather64_16(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather64_32(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather64_64(<$1 x i64>, <$1 x i32>) nounwind readonly +declare @__pseudo_gather64_8(, ) nounwind readonly +declare @__pseudo_gather64_16(, ) nounwind readonly +declare @__pseudo_gather64_32(, ) nounwind readonly +declare @__pseudo_gather64_64(, ) nounwind readonly -declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly +declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, + ) nounwind readonly -declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly +declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, + ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: @@ -955,94 +1036,94 @@ declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32, ; And the GSImprovementsPass in turn converts these to actual native ; scatters or masked stores. 
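;; Illustrative sketch, not part of the patch: the corresponding pseudo-scatter
;; declarations, again assuming WIDTH=4 and MASK=i32, expand to e.g.:

declare void @__pseudo_scatter32_32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <4 x i32>, i32,
                                                 <4 x i32>, <4 x i32>) nounwind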
-declare void @__pseudo_scatter32_8(<$1 x i32>, <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_16(<$1 x i32>, <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_32(<$1 x i32>, <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_64(<$1 x i32>, <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter32_8(, , ) nounwind +declare void @__pseudo_scatter32_16(, , ) nounwind +declare void @__pseudo_scatter32_32(, , ) nounwind +declare void @__pseudo_scatter32_64(, , ) nounwind -declare void @__pseudo_scatter64_8(<$1 x i64>, <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_16(<$1 x i64>, <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_32(<$1 x i64>, <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_64(<$1 x i64>, <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter64_8(, , ) nounwind +declare void @__pseudo_scatter64_16(, , ) nounwind +declare void @__pseudo_scatter64_32(, , ) nounwind +declare void @__pseudo_scatter64_64(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>, i32, - <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>, i32, - <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>, i32, - <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>, i32, - <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, + , ) nounwind -declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>, i32, - <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>, i32, - <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>, i32, - <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>, i32, - <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, + , ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops -define i8 @__extract_int8(<$1 x i8>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i8> %0, i32 %1 +define i8 @__extract_int8(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i8 %extract } -define <$1 x i8> @__insert_int8(<$1 x i8>, i32, +define @__insert_int8(, i32, i8) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i8> %0, i8 %2, i32 %1 - ret <$1 x i8> %insert + %insert = insertelement %0, i8 %2, i32 %1 + ret %insert } -define i16 @__extract_int16(<$1 x i16>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i16> %0, i32 %1 +define i16 @__extract_int16(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i16 %extract 
} -define <$1 x i16> @__insert_int16(<$1 x i16>, i32, +define @__insert_int16(, i32, i16) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i16> %0, i16 %2, i32 %1 - ret <$1 x i16> %insert + %insert = insertelement %0, i16 %2, i32 %1 + ret %insert } -define i32 @__extract_int32(<$1 x i32>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i32> %0, i32 %1 +define i32 @__extract_int32(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i32 %extract } -define <$1 x i32> @__insert_int32(<$1 x i32>, i32, +define @__insert_int32(, i32, i32) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i32> %0, i32 %2, i32 %1 - ret <$1 x i32> %insert + %insert = insertelement %0, i32 %2, i32 %1 + ret %insert } -define i64 @__extract_int64(<$1 x i64>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i64> %0, i32 %1 +define i64 @__extract_int64(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i64 %extract } -define <$1 x i64> @__insert_int64(<$1 x i64>, i32, +define @__insert_int64(, i32, i64) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i64> %0, i64 %2, i32 %1 - ret <$1 x i64> %insert + %insert = insertelement %0, i64 %2, i32 %1 + ret %insert } -shuffles($1, i8, int8, 1) -shuffles($1, i16, int16, 2) -shuffles($1, float, float, 4) -shuffles($1, i32, int32, 4) -shuffles($1, double, double, 8) -shuffles($1, i64, int64, 8) +shuffles(i8, int8, 1) +shuffles(i16, int16, 2) +shuffles(float, float, 4) +shuffles(i32, int32, 4) +shuffles(double, double, 8) +shuffles(i64, int64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; various bitcasts from one type to another -define <$1 x i32> @__intbits_varying_float(<$1 x float>) nounwind readnone alwaysinline { - %float_to_int_bitcast = bitcast <$1 x float> %0 to <$1 x i32> - ret <$1 x i32> %float_to_int_bitcast +define @__intbits_varying_float() nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast %0 to + ret %float_to_int_bitcast } define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { @@ -1050,9 +1131,9 @@ define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { ret i32 %float_to_int_bitcast } -define <$1 x i64> @__intbits_varying_double(<$1 x double>) nounwind readnone alwaysinline { - %double_to_int_bitcast = bitcast <$1 x double> %0 to <$1 x i64> - ret <$1 x i64> %double_to_int_bitcast +define @__intbits_varying_double() nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast %0 to + ret %double_to_int_bitcast } define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { @@ -1060,9 +1141,9 @@ define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { ret i64 %double_to_int_bitcast } -define <$1 x float> @__floatbits_varying_int32(<$1 x i32>) nounwind readnone alwaysinline { - %int_to_float_bitcast = bitcast <$1 x i32> %0 to <$1 x float> - ret <$1 x float> %int_to_float_bitcast +define @__floatbits_varying_int32() nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast %0 to + ret %int_to_float_bitcast } define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { @@ -1070,9 +1151,9 @@ define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { ret float %int_to_float_bitcast } -define <$1 x double> @__doublebits_varying_int64(<$1 x i64>) nounwind readnone alwaysinline { - %int_to_double_bitcast = bitcast <$1 x i64> %0 to <$1 x 
double> - ret <$1 x double> %int_to_double_bitcast +define @__doublebits_varying_int64() nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast %0 to + ret %int_to_double_bitcast } define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { @@ -1080,8 +1161,8 @@ define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { ret double %int_to_double_bitcast } -define <$1 x float> @__undef_varying() nounwind readnone alwaysinline { - ret <$1 x float> undef +define @__undef_varying() nounwind readnone alwaysinline { + ret undef } define float @__undef_uniform() nounwind readnone alwaysinline { @@ -1096,31 +1177,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { ret i32 %r } -define <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone alwaysinline { - ret <$1 x i32> %0 -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; count trailing zeros - -define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { - %c = call i32 @llvm.cttz.i32(i32 %0) - ret i32 %c -} - -define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { - %c = call i64 @llvm.cttz.i64(i64 %0) - ret i64 %c -} - -define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { - %c = call i32 @llvm.ctlz.i32(i32 %0) - ret i32 %c -} - -define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { - %c = call i64 @llvm.ctlz.i64(i64 %0) - ret i64 %c +define @__sext_varying_bool() nounwind readnone alwaysinline { + ifelse(MASK,i1, ` + %se = sext %0 to + ret %se + ', ` + ret %0') } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1670,184 +1732,133 @@ define void define void @__aos_to_soa4_float(float * noalias %p, - <$1 x float> * noalias %out0, <$1 x float> * noalias %out1, - <$1 x float> * noalias %out2, <$1 x float> * noalias %out3) + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) nounwind alwaysinline { - %p0 = bitcast float * %p to <$1 x float> * - %v0 = load <$1 x float> * %p0, align 4 - %p1 = getelementptr <$1 x float> * %p0, i32 1 - %v1 = load <$1 x float> * %p1, align 4 - %p2 = getelementptr <$1 x float> * %p0, i32 2 - %v2 = load <$1 x float> * %p2, align 4 - %p3 = getelementptr <$1 x float> * %p0, i32 3 - %v3 = load <$1 x float> * %p3, align 4 - call void @__aos_to_soa4_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, - <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) + %p0 = bitcast float * %p to * + %v0 = load * %p0, align 4 + %p1 = getelementptr * %p0, i32 1 + %v1 = load * %p1, align 4 + %p2 = getelementptr * %p0, i32 2 + %v2 = load * %p2, align 4 + %p3 = getelementptr * %p0, i32 3 + %v3 = load * %p3, align 4 + call void @__aos_to_soa4_float`'WIDTH ( %v0, %v1, + %v2, %v3, * %out0, + * %out1, * %out2, * %out3) ret void } define void @__aos_to_soa4_int32(i32 * noalias %ptr, - <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, - <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3) + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) nounwind alwaysinline { %fptr = bitcast i32 * %ptr to float * - %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * - %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * - %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> * - %fout3 = bitcast <$1 x i32> * %out3 to <$1 x float> * + %fout0 = bitcast * %out0 to * + %fout1 = bitcast * %out1 to * + %fout2 = bitcast * %out2 
to * + %fout3 = bitcast * %out3 to * call void @__aos_to_soa4_float(float * %fptr, - <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2, - <$1 x float> * %fout3) + * %fout0, * %fout1, * %fout2, + * %fout3) ret void } define void -@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, - <$1 x float> %v3, float * noalias %p) nounwind alwaysinline { - %out0 = bitcast float * %p to <$1 x float> * - %out1 = getelementptr <$1 x float> * %out0, i32 1 - %out2 = getelementptr <$1 x float> * %out0, i32 2 - %out3 = getelementptr <$1 x float> * %out0, i32 3 - call void @__soa_to_aos4_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, - <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) +@__soa_to_aos4_float( %v0, %v1, %v2, + %v3, float * noalias %p) nounwind alwaysinline { + %out0 = bitcast float * %p to * + %out1 = getelementptr * %out0, i32 1 + %out2 = getelementptr * %out0, i32 2 + %out3 = getelementptr * %out0, i32 3 + call void @__soa_to_aos4_float`'WIDTH ( %v0, %v1, + %v2, %v3, * %out0, + * %out1, * %out2, * %out3) ret void } define void -@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, - <$1 x i32> %v3, i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> - %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> - %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> - %fv3 = bitcast <$1 x i32> %v3 to <$1 x float> +@__soa_to_aos4_int32( %v0, %v1, %v2, + %v3, i32 * noalias %base) nounwind alwaysinline { + %fv0 = bitcast %v0 to + %fv1 = bitcast %v1 to + %fv2 = bitcast %v2 to + %fv3 = bitcast %v3 to %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos4_float(<$1 x float> %fv0, <$1 x float> %fv1, - <$1 x float> %fv2, <$1 x float> %fv3, float * %fbase) + call void @__soa_to_aos4_float( %fv0, %fv1, + %fv2, %fv3, float * %fbase) ret void } define void @__aos_to_soa3_float(float * noalias %p, - <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) nounwind alwaysinline { - %p0 = bitcast float * %p to <$1 x float> * - %v0 = load <$1 x float> * %p0, align 4 - %p1 = getelementptr <$1 x float> * %p0, i32 1 - %v1 = load <$1 x float> * %p1, align 4 - %p2 = getelementptr <$1 x float> * %p0, i32 2 - %v2 = load <$1 x float> * %p2, align 4 - call void @__aos_to_soa3_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) + * %out0, * %out1, + * %out2) nounwind alwaysinline { + %p0 = bitcast float * %p to * + %v0 = load * %p0, align 4 + %p1 = getelementptr * %p0, i32 1 + %v1 = load * %p1, align 4 + %p2 = getelementptr * %p0, i32 2 + %v2 = load * %p2, align 4 + call void @__aos_to_soa3_float`'WIDTH ( %v0, %v1, + %v2, * %out0, * %out1, + * %out2) ret void } define void @__aos_to_soa3_int32(i32 * noalias %base, - <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, - <$1 x i32> * noalias %out2) nounwind alwaysinline { + * noalias %out0, * noalias %out1, + * noalias %out2) nounwind alwaysinline { %fbase = bitcast i32 * %base to float * - %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * - %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * - %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> * + %fout0 = bitcast * %out0 to * + %fout1 = bitcast * %out1 to * + %fout2 = bitcast * %out2 to * call void @__aos_to_soa3_float(float * %fbase, - <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2) + * %fout0, * %fout1, * %fout2) ret void } define void 
-@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, +@__soa_to_aos3_float( %v0, %v1, %v2, float * noalias %p) nounwind alwaysinline { - %out0 = bitcast float * %p to <$1 x float> * - %out1 = getelementptr <$1 x float> * %out0, i32 1 - %out2 = getelementptr <$1 x float> * %out0, i32 2 - call void @__soa_to_aos3_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) + %out0 = bitcast float * %p to * + %out1 = getelementptr * %out0, i32 1 + %out2 = getelementptr * %out0, i32 2 + call void @__soa_to_aos3_float`'WIDTH ( %v0, %v1, + %v2, * %out0, * %out1, + * %out2) ret void } define void -@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, +@__soa_to_aos3_int32( %v0, %v1, %v2, i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> - %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> - %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> + %fv0 = bitcast %v0 to + %fv1 = bitcast %v1 to + %fv2 = bitcast %v2 to %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1, - <$1 x float> %fv2, float * %fbase) + call void @__soa_to_aos3_float( %fv0, %fv1, + %fv2, float * %fbase) ret void } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetching - -ifelse(LLVM_VERSION, `LLVM_2_9', -` -declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality) - -define void @__prefetch_read_uniform_1(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 3) - ret void -} - -define void @__prefetch_read_uniform_2(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 2) - ret void -} - -define void @__prefetch_read_uniform_3(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 1) - ret void -} - -define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 0) - ret void -} -', ` -declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, - i32 %cachetype) ; cachetype == 1 is dcache - -define void @__prefetch_read_uniform_1(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) - ret void -} - -define void @__prefetch_read_uniform_2(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) - ret void -} - -define void @__prefetch_read_uniform_3(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) - ret void -} - -define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) - ret void -} -') - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; assert declare i32 @printf(i8*, ...) 
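[Editor's note — illustrative sketch, not part of the patch. The `__aos_to_soa4_*` and `__soa_to_aos3_*` builtins above are rewritten against the m4 `WIDTH` symbol instead of taking the vector width as the `$1` macro argument; the data movement itself is unchanged. For readers who do not want to decode the IR, the scalar C++ reference below shows the layout transform the 4-wide AOS-to-SOA helper performs; the function name `aos_to_soa4_float_ref` and the explicit `width` parameter are hypothetical, and the real builtins use WIDTH-wide vector loads followed by a target-specific transpose.]

#include <cstddef>

// Scalar reference of the AOS -> SOA transform done by __aos_to_soa4_float.
// p holds width structs of 4 contiguous floats; out0..out3 receive one
// component each, densely packed.
static void aos_to_soa4_float_ref(const float *p, size_t width,
                                  float *out0, float *out1,
                                  float *out2, float *out3) {
    for (size_t i = 0; i < width; ++i) {
        out0[i] = p[4 * i + 0];   // x components
        out1[i] = p[4 * i + 1];   // y components
        out2[i] = p[4 * i + 2];   // z components
        out3[i] = p[4 * i + 3];   // w components
    }
}

[End of editor's note.]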
declare void @abort() noreturn -define void @__do_assert_uniform(i8 *%str, i1 %test, <$1 x i32> %mask) { +define void @__do_assert_uniform(i8 *%str, i1 %test, %mask) { br i1 %test, label %ok, label %fail fail: @@ -1860,12 +1871,12 @@ ok: } -define void @__do_assert_varying(i8 *%str, <$1 x i32> %test, - <$1 x i32> %mask) { - %nottest = xor <$1 x i32> %test, - < forloop(i, 1, eval($1-1), `i32 -1, ') i32 -1 > - %nottest_and_mask = and <$1 x i32> %nottest, %mask - %mm = call i32 @__movmsk(<$1 x i32> %nottest_and_mask) +define void @__do_assert_varying(i8 *%str, %test, + %mask) { + %nottest = xor %test, + < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 > + %nottest_and_mask = and %nottest, %mask + %mm = call i32 @__movmsk( %nottest_and_mask) %all_ok = icmp eq i32 %mm, 0 br i1 %all_ok, label %ok, label %fail @@ -2010,118 +2021,118 @@ define void @__memory_barrier() nounwind readnone alwaysinline { ret void } -global_atomic_associative($1, add, i32, int32, 0) -global_atomic_associative($1, sub, i32, int32, 0) -global_atomic_associative($1, and, i32, int32, -1) -global_atomic_associative($1, or, i32, int32, 0) -global_atomic_associative($1, xor, i32, int32, 0) -global_atomic_uniform($1, add, i32, int32) -global_atomic_uniform($1, sub, i32, int32) -global_atomic_uniform($1, and, i32, int32) -global_atomic_uniform($1, or, i32, int32) -global_atomic_uniform($1, xor, i32, int32) -global_atomic_uniform($1, min, i32, int32) -global_atomic_uniform($1, max, i32, int32) -global_atomic_uniform($1, umin, i32, uint32) -global_atomic_uniform($1, umax, i32, uint32) +global_atomic_associative(WIDTH, add, i32, int32, 0) +global_atomic_associative(WIDTH, sub, i32, int32, 0) +global_atomic_associative(WIDTH, and, i32, int32, -1) +global_atomic_associative(WIDTH, or, i32, int32, 0) +global_atomic_associative(WIDTH, xor, i32, int32, 0) +global_atomic_uniform(WIDTH, add, i32, int32) +global_atomic_uniform(WIDTH, sub, i32, int32) +global_atomic_uniform(WIDTH, and, i32, int32) +global_atomic_uniform(WIDTH, or, i32, int32) +global_atomic_uniform(WIDTH, xor, i32, int32) +global_atomic_uniform(WIDTH, min, i32, int32) +global_atomic_uniform(WIDTH, max, i32, int32) +global_atomic_uniform(WIDTH, umin, i32, uint32) +global_atomic_uniform(WIDTH, umax, i32, uint32) -global_atomic_associative($1, add, i64, int64, 0) -global_atomic_associative($1, sub, i64, int64, 0) -global_atomic_associative($1, and, i64, int64, -1) -global_atomic_associative($1, or, i64, int64, 0) -global_atomic_associative($1, xor, i64, int64, 0) -global_atomic_uniform($1, add, i64, int64) -global_atomic_uniform($1, sub, i64, int64) -global_atomic_uniform($1, and, i64, int64) -global_atomic_uniform($1, or, i64, int64) -global_atomic_uniform($1, xor, i64, int64) -global_atomic_uniform($1, min, i64, int64) -global_atomic_uniform($1, max, i64, int64) -global_atomic_uniform($1, umin, i64, uint64) -global_atomic_uniform($1, umax, i64, uint64) +global_atomic_associative(WIDTH, add, i64, int64, 0) +global_atomic_associative(WIDTH, sub, i64, int64, 0) +global_atomic_associative(WIDTH, and, i64, int64, -1) +global_atomic_associative(WIDTH, or, i64, int64, 0) +global_atomic_associative(WIDTH, xor, i64, int64, 0) +global_atomic_uniform(WIDTH, add, i64, int64) +global_atomic_uniform(WIDTH, sub, i64, int64) +global_atomic_uniform(WIDTH, and, i64, int64) +global_atomic_uniform(WIDTH, or, i64, int64) +global_atomic_uniform(WIDTH, xor, i64, int64) +global_atomic_uniform(WIDTH, min, i64, int64) +global_atomic_uniform(WIDTH, max, i64, int64) +global_atomic_uniform(WIDTH, 
umin, i64, uint64) +global_atomic_uniform(WIDTH, umax, i64, uint64) -global_swap($1, i32, int32) -global_swap($1, i64, int64) +global_swap(WIDTH, i32, int32) +global_swap(WIDTH, i64, int64) -define <$1 x float> @__atomic_swap_float_global(float * %ptr, <$1 x float> %val, - <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_swap_float_global(float * %ptr, %val, + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * - %ival = bitcast <$1 x float> %val to <$1 x i32> - %iret = call <$1 x i32> @__atomic_swap_int32_global(i32 * %iptr, <$1 x i32> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i32> %iret to <$1 x float> - ret <$1 x float> %ret + %ival = bitcast %val to + %iret = call @__atomic_swap_int32_global(i32 * %iptr, %ival, %mask) + %ret = bitcast %iret to + ret %ret } -define <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x double> %val, - <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_swap_double_global(double * %ptr, %val, + %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * - %ival = bitcast <$1 x double> %val to <$1 x i64> - %iret = call <$1 x i64> @__atomic_swap_int64_global(i64 * %iptr, <$1 x i64> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i64> %iret to <$1 x double> - ret <$1 x double> %ret + %ival = bitcast %val to + %iret = call @__atomic_swap_int64_global(i64 * %iptr, %ival, %mask) + %ret = bitcast %iret to + ret %ret } define float @__atomic_swap_uniform_float_global(float * %ptr, float %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * %ival = bitcast float %val to i32 - %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask) + %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, %mask) %ret = bitcast i32 %iret to float ret float %ret } define double @__atomic_swap_uniform_double_global(double * %ptr, double %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * %ival = bitcast double %val to i64 - %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask) + %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, %mask) %ret = bitcast i64 %iret to double ret double %ret } -global_atomic_exchange($1, i32, int32) -global_atomic_exchange($1, i64, int64) +global_atomic_exchange(WIDTH, i32, int32) +global_atomic_exchange(WIDTH, i64, int64) -define <$1 x float> @__atomic_compare_exchange_float_global(float * %ptr, - <$1 x float> %cmp, <$1 x float> %val, <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_compare_exchange_float_global(float * %ptr, + %cmp, %val, %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * - %icmp = bitcast <$1 x float> %cmp to <$1 x i32> - %ival = bitcast <$1 x float> %val to <$1 x i32> - %iret = call <$1 x i32> @__atomic_compare_exchange_int32_global(i32 * %iptr, <$1 x i32> %icmp, - <$1 x i32> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i32> %iret to <$1 x float> - ret <$1 x float> %ret + %icmp = bitcast %cmp to + %ival = bitcast %val to + %iret = call @__atomic_compare_exchange_int32_global(i32 * %iptr, %icmp, + %ival, %mask) + %ret = bitcast %iret to + ret %ret } -define <$1 x double> @__atomic_compare_exchange_double_global(double * %ptr, - <$1 x double> %cmp, <$1 x double> %val, <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_compare_exchange_double_global(double * %ptr, + %cmp, %val, 
%mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * - %icmp = bitcast <$1 x double> %cmp to <$1 x i64> - %ival = bitcast <$1 x double> %val to <$1 x i64> - %iret = call <$1 x i64> @__atomic_compare_exchange_int64_global(i64 * %iptr, <$1 x i64> %icmp, - <$1 x i64> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i64> %iret to <$1 x double> - ret <$1 x double> %ret + %icmp = bitcast %cmp to + %ival = bitcast %val to + %iret = call @__atomic_compare_exchange_int64_global(i64 * %iptr, %icmp, + %ival, %mask) + %ret = bitcast %iret to + ret %ret } define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * %icmp = bitcast float %cmp to i32 %ival = bitcast float %val to i32 %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp, - i32 %ival, <$1 x i32> %mask) + i32 %ival, %mask) %ret = bitcast i32 %iret to float ret float %ret } define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp, - double %val, <$1 x i32> %mask) nounwind alwaysinline { + double %val, %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * %icmp = bitcast double %cmp to i64 %ival = bitcast double %val to i64 %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, - i64 %ival, <$1 x i32> %mask) + i64 %ival, %mask) %ret = bitcast i64 %iret to double ret double %ret } @@ -2168,10 +2179,10 @@ define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline ;; vector width as a parameter define(`int64minmax', ` -i64minmax($1,min,int64,slt) -i64minmax($1,max,int64,sgt) -i64minmax($1,min,uint64,ult) -i64minmax($1,max,uint64,ugt) +i64minmax(WIDTH,min,int64,slt) +i64minmax(WIDTH,max,int64,sgt) +i64minmax(WIDTH,min,uint64,ult) +i64minmax(WIDTH,max,uint64,ugt) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2410,24 +2421,24 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` -define i32 @__packed_load_active(i32 * %startptr, <$1 x i32> * %val_ptr, - <$1 x i32> %full_mask) nounwind alwaysinline { +define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk(<$1 x i32> %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask) + %mask = call i32 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << $1) -1) + %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) br i1 %allon, label %all_on, label %unknown_mask all_on: ;; everyone wants to load, so just load an entire vector width in a single ;; vector load - %vecptr = bitcast i32 *%startptr to <$1 x i32> * - %vec_load = load <$1 x i32> *%vecptr, align 4 - store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4 - ret i32 $1 + %vecptr = bitcast i32 *%startptr to * + %vec_load = load *%vecptr, align 4 + store %vec_load, * %val_ptr, align 4 + ret i32 WIDTH unknown_mask: br label %loop @@ -2445,7 +2456,7 @@ loop: load: %loadptr = getelementptr i32 *%startptr, i32 %offset %loadval = load i32 *%loadptr - %val_ptr_i32 = bitcast <$1 x i32> * %val_ptr to i32 * + %val_ptr_i32 = bitcast * %val_ptr to i32 * %storeptr = getelementptr i32 *%val_ptr_i32, i32 %lane 
store i32 %loadval, i32 *%storeptr %offset1 = add i32 %offset, 1 @@ -2457,28 +2468,28 @@ loopend: %nextlanemask = mul i32 %lanemask, 2 ; are we done yet? - %test = icmp ne i32 %nextlane, $1 + %test = icmp ne i32 %nextlane, WIDTH br i1 %test, label %loop, label %done done: ret i32 %nextoffset } -define i32 @__packed_store_active(i32 * %startptr, <$1 x i32> %vals, - <$1 x i32> %full_mask) nounwind alwaysinline { +define i32 @__packed_store_active(i32 * %startptr, %vals, + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk(<$1 x i32> %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask) + %mask = call i32 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << $1) -1) + %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) br i1 %allon, label %all_on, label %unknown_mask all_on: - %vecptr = bitcast i32 *%startptr to <$1 x i32> * - store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4 - ret i32 $1 + %vecptr = bitcast i32 *%startptr to * + store %vals, * %vecptr, align 4 + ret i32 WIDTH unknown_mask: br label %loop @@ -2494,7 +2505,7 @@ loop: br i1 %do_store, label %store, label %loopend store: - %storeval = extractelement <$1 x i32> %vals, i32 %lane + %storeval = extractelement %vals, i32 %lane %storeptr = getelementptr i32 *%startptr, i32 %offset store i32 %storeval, i32 *%storeptr %offset1 = add i32 %offset, 1 @@ -2506,7 +2517,7 @@ loopend: %nextlanemask = mul i32 %lanemask, 2 ; are we done yet? - %test = icmp ne i32 %nextlane, $1 + %test = icmp ne i32 %nextlane, WIDTH br i1 %test, label %loop, label %done done: @@ -2613,7 +2624,7 @@ reduce_equal_aux($1, double, double, i64, fcmp, 64) define(`exclusive_scan', ` define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { ; first, set the value of any off lanes to the identity value %ptr = alloca <$1 x $2> %idvec1 = bitcast $2 $5 to <1 x $2> @@ -2623,7 +2634,7 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, %ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> * %vi = bitcast <$1 x $2> %v to <$1 x i`'$3> call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi, - <$1 x i32> %mask) + <$1 x MASK> %mask) %v_id = load <$1 x $2> * %ptr ; extract elements of the vector to use in computing the scan @@ -2649,16 +2660,16 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, ') define(`scans', ` -exclusive_scan($1, i32, 32, add, 0, add_i32) -exclusive_scan($1, float, 32, fadd, zeroinitializer, add_float) -exclusive_scan($1, i64, 64, add, 0, add_i64) -exclusive_scan($1, double, 64, fadd, zeroinitializer, add_double) +exclusive_scan(WIDTH, i32, 32, add, 0, add_i32) +exclusive_scan(WIDTH, float, 32, fadd, zeroinitializer, add_float) +exclusive_scan(WIDTH, i64, 64, add, 0, add_i64) +exclusive_scan(WIDTH, double, 64, fadd, zeroinitializer, add_double) -exclusive_scan($1, i32, 32, and, -1, and_i32) -exclusive_scan($1, i64, 64, and, -1, and_i64) +exclusive_scan(WIDTH, i32, 32, and, -1, and_i32) +exclusive_scan(WIDTH, i64, 64, and, -1, and_i64) -exclusive_scan($1, i32, 32, or, 0, or_i32) -exclusive_scan($1, i64, 64, or, 0, or_i64) +exclusive_scan(WIDTH, i32, 32, or, 0, or_i32) +exclusive_scan(WIDTH, i64, 64, or, 0, or_i64) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/ctx.cpp b/ctx.cpp index 043f7acc..694a3b1d 
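[Editor's note — hedged sketch, not part of the patch. Once parameterized by `WIDTH`, `__packed_load_active` and `__packed_store_active` above keep the same two-path shape: if the mask is known at compile time to be all-on, a single full-width vector load/store is issued and `WIDTH` is returned; otherwise the code walks the lanes, moving one i32 per active lane and counting how many were transferred. The C++ below is only a reference for the store path's semantics; `packed_store_active_ref` and the `bool *mask` representation are illustrative, not the emitted code.]

#include <cstdint>

// Reference semantics for __packed_store_active with a width-lane mask:
// store vals[lane] for each active lane, densely packed at startptr,
// and return how many values were written (the builtin's %nextoffset).
static int packed_store_active_ref(int32_t *startptr, const int32_t *vals,
                                   const bool *mask, int width) {
    int offset = 0;
    for (int lane = 0; lane < width; ++lane)
        if (mask[lane])
            startptr[offset++] = vals[lane];
    return offset;
}

// The real builtin additionally calls __is_compile_time_constant_mask and,
// when the mask is provably all-on, replaces the loop with one vector store.

[End of editor's note.]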
100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -875,8 +875,11 @@ FunctionEmitContext::LaneMask(llvm::Value *v) { // into an i32 value std::vector mm; m->symbolTable->LookupFunction("__movmsk", &mm); - // There should be one with signed int signature, one unsigned int. - Assert(mm.size() == 2); + if (g->target.maskBitCount == 1) + Assert(mm.size() == 1); + else + // There should be one with signed int signature, one unsigned int. + Assert(mm.size() == 2); // We can actually call either one, since both are i32s as far as // LLVM's type system is concerned... llvm::Function *fmm = mm[0]->function; @@ -929,6 +932,9 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { return NULL; } + if (g->target.maskBitCount == 1) + return b; + LLVM_TYPE_CONST llvm::ArrayType *at = llvm::dyn_cast(b->getType()); if (at) { diff --git a/func.cpp b/func.cpp index 61dfb784..4c8d2222 100644 --- a/func.cpp +++ b/func.cpp @@ -288,7 +288,10 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, bool checkMask = (type->isTask == true) || ((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) && costEstimate > CHECK_MASK_AT_FUNCTION_START_COST); - if (checkMask && g->opt.disableCoherentControlFlow == false) { + checkMask &= (g->target.maskingIsFree == false); + checkMask &= (g->opt.disableCoherentControlFlow == false); + + if (checkMask) { llvm::Value *mask = ctx->GetFunctionMask(); llvm::Value *allOn = ctx->All(mask); llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on"); diff --git a/ispc.cpp b/ispc.cpp index 8bfc9a9d..8cc618c3 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -129,24 +129,60 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->nativeVectorWidth = 4; t->vectorWidth = 4; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse2-x2")) { t->isa = Target::SSE2; t->nativeVectorWidth = 4; t->vectorWidth = 8; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse4")) { t->isa = Target::SSE4; t->nativeVectorWidth = 4; t->vectorWidth = 4; t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { t->isa = Target::SSE4; t->nativeVectorWidth = 4; t->vectorWidth = 8; t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; + } + else if (!strcasecmp(isa, "generic-4")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 4; + t->vectorWidth = 4; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; + } + else if (!strcasecmp(isa, "generic-8")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 8; + t->vectorWidth = 8; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; + } + else if (!strcasecmp(isa, "generic-16")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 16; + t->vectorWidth = 16; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; } #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) else if (!strcasecmp(isa, "avx")) { @@ -154,12 +190,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, 
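[Editor's note — condensed sketch, not the literal patch. Each ISA entry in Target::GetTarget now records three mask-related properties alongside the vector widths: maskingIsFree, allOffMaskIsSafe, and maskBitCount (32 for the SSE/AVX targets, 1 for the new generic-4/8/16 targets). Later hunks use maskBitCount to decide whether the LLVM mask vector is built from i32 or i1 elements. The struct and helper below (MaskPolicy, policyFor) are hypothetical names used only to summarize that decision.]

// How the new per-target fields drive mask handling; values shown are the
// ones the patch assigns in ispc.cpp.
struct MaskPolicy {
    bool maskingIsFree;     // true -> no blend cost, skip "all on?" fast paths
    bool allOffMaskIsSafe;  // true -> memory ops may run with no lanes active
    int  maskBitCount;      // 32 -> <N x i32> mask, 1 -> <N x i1> mask
};

static MaskPolicy policyFor(bool isGenericTarget) {
    // SSE/AVX-style targets keep the 32-bit-per-lane mask; the generic
    // targets assume native masking support and use one bit per lane.
    return isGenericTarget ? MaskPolicy{true, true, 1}
                           : MaskPolicy{false, false, 32};
}

[End of editor's note.]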
t->nativeVectorWidth = 8; t->vectorWidth = 8; t->attributes = "+avx,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "avx-x2")) { t->isa = Target::AVX; t->nativeVectorWidth = 8; t->vectorWidth = 16; t->attributes = "+avx,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } #endif // LLVM 3.0+ #if defined(LLVM_3_1svn) @@ -168,12 +210,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->nativeVectorWidth = 8; t->vectorWidth = 8; t->attributes = "+avx2,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "avx2-x2")) { t->isa = Target::AVX2; t->nativeVectorWidth = 16; t->vectorWidth = 16; t->attributes = "+avx2,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } #endif // LLVM 3.1 else { @@ -221,7 +269,7 @@ Target::SupportedTargetISAs() { #ifdef LLVM_3_1svn ", avx2, avx2-x2" #endif // LLVM_3_1svn - ; + ", generic-4, generic-8, generic-16"; } @@ -300,6 +348,8 @@ Target::GetISAString() const { return "avx"; case Target::AVX2: return "avx2"; + case Target::GENERIC: + return "generic"; default: FATAL("Unhandled target in GetISAString()"); } diff --git a/ispc.h b/ispc.h index 6eb2cdd9..254c8311 100644 --- a/ispc.h +++ b/ispc.h @@ -193,7 +193,7 @@ struct Target { flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { SSE2, SSE4, AVX, AVX2, NUM_ISAS }; + enum ISA { SSE2, SSE4, AVX, AVX2, GENERIC, NUM_ISAS }; /** Instruction set being compiled to. */ ISA isa; @@ -222,6 +222,23 @@ struct Target { /** Indicates whether position independent code should be generated. */ bool generatePIC; + + /** Is there overhead associated with masking on the target + architecture; e.g. there is on SSE, due to extra blends and the + like, but there isn't with an ISA that supports masking + natively. */ + bool maskingIsFree; + + /** Is it safe to run code with the mask all if: e.g. on SSE, the fast + gather trick assumes that at least one program instance is running + (so that it can safely assume that the array base pointer is + valid). */ + bool allOffMaskIsSafe; + + /** How many bits are used to store each element of the mask: e.g. this + is 32 on SSE/AVX, since that matches the HW better, but it's 1 for + the generic target. 
*/ + int maskBitCount; }; diff --git a/ispc.vcxproj b/ispc.vcxproj index fb56b96c..96a6855d 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -22,11 +22,15 @@ + + + - + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -40,15 +44,15 @@ 4146;4800;4996;4355;4624;4005;4065 4146;4800;4996;4355;4624;4005;4065 - - %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp; -%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp - clang builtins-c.c - %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp; -%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp - clang builtins-c.c - gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp - gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp + + %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 > gen-bitcode-c-32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 > gen-bitcode-c-64.cpp + Building builtins.c + %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 > gen-bitcode-c-32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 > gen-bitcode-c-64.cpp + Building builtins.c + gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp + gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp @@ -75,105 +79,148 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp - gen-stdlib.cpp - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp - gen-stdlib.cpp - Building gen-stdlib.cpp - Building gen-stdlib.cpp + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > gen-stdlib-x86.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > gen-stdlib-generic.cpp; + + gen-stdlib-generic.cpp;gen-stdlib-x86.cpp + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > gen-stdlib-x86.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > gen-stdlib-generic.cpp; + + gen-stdlib-generic.cpp;gen-stdlib-x86.cpp + Building gen-stdlib-{generic,x86}.cpp + Building gen-stdlib-{generic,x86}.cpp - + Document - m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp - gen-bitcode-sse4.cpp - builtins.m4;builtins-sse4-common.ll - m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp - gen-bitcode-sse4.cpp - builtins.m4;builtins-sse4-common.ll - Building gen-bitcode-sse4.cpp - Building gen-bitcode-sse4.cpp - - - - - Document - m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp + m4 -Ibuiltins/ 
-DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll > gen-bitcode-dispatch.cpp gen-bitcode-dispatch.cpp - builtins.m4 - m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp + builtins\util.m4 + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll > gen-bitcode-dispatch.cpp gen-bitcode-dispatch.cpp - builtins.m4 + builtins\util.m4 Building gen-bitcode-dispatch.cpp Building gen-bitcode-dispatch.cpp - + Document - m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + builtins\util.m4;builtins\target-sse4-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4.cpp + Building gen-bitcode-sse4.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse4-common.ll - m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp + builtins\util.m4;builtins\target-sse4-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse4-common.ll + builtins\util.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2.cpp Building gen-bitcode-sse4-x2.cpp - + Document - m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse2-common.ll - m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp + builtins\util.m4;builtins\target-sse2-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse2-common.ll + builtins\util.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2.cpp Building gen-bitcode-sse2.cpp - + Document - m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll > gen-bitcode-sse2-x2.cpp gen-bitcode-sse2-x2.cpp - builtins.m4;builtins-sse2-common.ll - m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + builtins\util.m4;builtins\target-sse2-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll > gen-bitcode-sse2-x2.cpp gen-bitcode-sse2-x2.cpp - builtins.m4;builtins-sse2-common.ll + builtins\util.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2.cpp Building gen-bitcode-sse2-x2.cpp - + Document - m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > 
gen-bitcode-avx.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-avx-common.ll - m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp + builtins\util.m4;builtins\target-avx-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-avx-common.ll + builtins\util.m4;builtins\target-avx-common.ll Building gen-bitcode-avx.cpp Building gen-bitcode-avx.cpp - + Document - m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp gen-bitcode-avx-x2.cpp - builtins.m4;builtins-sse.ll - m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp + builtins\util.m4;builtins\target-avx-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp gen-bitcode-avx-x2.cpp - builtins.m4;builtins-sse.ll + builtins\util.m4;builtins\target-avx-common.ll Building gen-bitcode-avx-x2.cpp Building gen-bitcode-avx-x2.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll > gen-bitcode-generic-4.cpp + gen-bitcode-generic-4.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll > gen-bitcode-generic-4.cpp + gen-bitcode-generic-4.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-4.cpp + Building gen-bitcode-generic-4.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll > gen-bitcode-generic-8.cpp + gen-bitcode-generic-8.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll > gen-bitcode-generic-8.cpp + gen-bitcode-generic-8.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-8.cpp + Building gen-bitcode-generic-8.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll > gen-bitcode-generic-16.cpp + gen-bitcode-generic-16.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll > gen-bitcode-generic-16.cpp + gen-bitcode-generic-16.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-16.cpp + Building gen-bitcode-generic-16.cpp + + Document diff --git a/llvmutil.cpp b/llvmutil.cpp index 6c440a91..4a50e337 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -105,11 +105,14 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - // Note that both the mask and 
bool vectors are vector of int32s - // (not i1s). LLVM ends up generating much better SSE code with - // this representation. - LLVMTypes::MaskType = LLVMTypes::BoolVectorType = - llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + if (target.maskBitCount == 1) + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); + else { + assert(target.maskBitCount == 32); + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + } LLVMTypes::Int1VectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); @@ -141,7 +144,11 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, + if (target.maskBitCount == 1) + onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, + false /*unsigned*/); // 0x1 + else + onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff for (int i = 0; i < target.vectorWidth; ++i) @@ -150,8 +157,12 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, - true /*signed*/); + if (target.maskBitCount == 1) + offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, + true /*signed*/); + else + offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, + true /*signed*/); for (int i = 0; i < target.vectorWidth; ++i) maskZeros.push_back(offMask); diff --git a/module.cpp b/module.cpp index 9fade4b9..5dc9b160 100644 --- a/module.cpp +++ b/module.cpp @@ -1158,22 +1158,14 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre opts.addMacroDef("PI=3.1415926535"); // Add #define for current compilation target - switch (g->target.isa) { - case Target::SSE2: - opts.addMacroDef("ISPC_TARGET_SSE2"); - break; - case Target::SSE4: - opts.addMacroDef("ISPC_TARGET_SSE4"); - break; - case Target::AVX: - opts.addMacroDef("ISPC_TARGET_AVX"); - break; - case Target::AVX2: - opts.addMacroDef("ISPC_TARGET_AVX2"); - break; - default: - FATAL("Unhandled target ISA in preprocessor symbol definition"); + char targetMacro[128]; + sprintf(targetMacro, "ISPC_TARGET_%s", g->target.GetISAString()); + char *p = targetMacro; + while (*p) { + *p = toupper(*p); + ++p; } + opts.addMacroDef(targetMacro); if (g->target.is32Bit) opts.addMacroDef("ISPC_POINTER_SIZE=32"); diff --git a/opt.cpp b/opt.cpp index c77a76f7..17458a06 100644 --- a/opt.cpp +++ b/opt.cpp @@ -2444,7 +2444,7 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { int count = sizeof(names) / sizeof(names[0]); for (int i = 0; i < count; ++i) { llvm::Function *f = m->module->getFunction(names[i]); - if (f != NULL) { + if (f != NULL && f->empty() == false) { f->setLinkage(llvm::GlobalValue::InternalLinkage); modifiedAny = true; } diff --git a/parse.yy b/parse.yy index 8510244a..70cb2b3f 100644 --- a/parse.yy +++ b/parse.yy @@ -1605,7 +1605,8 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = AtomicType::VaryingConstUInt32; + const Type *t = g->target.isa == Target::GENERIC ? 
+ AtomicType::VaryingConstBool : AtomicType::VaryingConstUInt32; Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); } diff --git a/stdlib.ispc b/stdlib.ispc index 1a804733..c3b02fa7 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,6 +38,14 @@ ispc code */ +#ifdef ISPC_TARGET_GENERIC +#define IntMaskType bool +#define UIntMaskType bool +#else +#define IntMaskType int32 +#define UIntMaskType unsigned int32 +#endif + /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -274,13 +282,21 @@ static inline int32 sign_extend(bool v) { static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. +#ifdef ISPC_TARGET_GENERIC + return __movmsk(v & __mask) != 0; +#else return __movmsk(__sext_varying_bool(v) & __mask) != 0; +#endif } static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes +#ifdef ISPC_TARGET_GENERIC + bool match = ((v & __mask) == __mask); +#else int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask); +#endif return __movmsk(match) == (1 << programCount) - 1; } @@ -308,7 +324,11 @@ static inline int popcnt(int64 v) { static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes +#ifdef ISPC_TARGET_GENERIC + return __popcnt_int32(__movmsk(v & __mask)); +#else return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask)); +#endif } static inline uniform int lanemask() { @@ -672,19 +692,19 @@ static inline uniform bool reduce_equal(TYPE v, uniform TYPE * uniform value) { return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \ } -REDUCE_EQUAL(int32, int32, int32) -REDUCE_EQUAL(unsigned int32, int32, unsigned int32) -REDUCE_EQUAL(float, float, int32) -REDUCE_EQUAL(int64, int64, int32) -REDUCE_EQUAL(unsigned int64, int64, unsigned int32) -REDUCE_EQUAL(double, double, int32) +REDUCE_EQUAL(int32, int32, IntMaskType) +REDUCE_EQUAL(unsigned int32, int32, UIntMaskType) +REDUCE_EQUAL(float, float, IntMaskType) +REDUCE_EQUAL(int64, int64, IntMaskType) +REDUCE_EQUAL(unsigned int64, int64, UIntMaskType) +REDUCE_EQUAL(double, double, IntMaskType) static int32 exclusive_scan_add(int32 v) { - return __exclusive_scan_add_i32(v, (int32)__mask); + return __exclusive_scan_add_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_add(unsigned int32 v) { - return __exclusive_scan_add_i32(v, __mask); + return __exclusive_scan_add_i32((int32)v, (IntMaskType)__mask); } static float exclusive_scan_add(float v) { @@ -692,11 +712,11 @@ static float exclusive_scan_add(float v) { } static int64 exclusive_scan_add(int64 v) { - return __exclusive_scan_add_i64(v, (int32)__mask); + return __exclusive_scan_add_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_add(unsigned int64 v) { - return __exclusive_scan_add_i64(v, __mask); + return __exclusive_scan_add_i64(v, (UIntMaskType)__mask); } static double exclusive_scan_add(double v) { @@ -704,35 +724,35 @@ static double exclusive_scan_add(double v) { } static int32 exclusive_scan_and(int32 v) { - return __exclusive_scan_and_i32(v, (int32)__mask); + return __exclusive_scan_and_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_and(unsigned int32 v) { - return __exclusive_scan_and_i32(v, __mask); + return __exclusive_scan_and_i32(v, 
(UIntMaskType)__mask); } static int64 exclusive_scan_and(int64 v) { - return __exclusive_scan_and_i64(v, (int32)__mask); + return __exclusive_scan_and_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_and(unsigned int64 v) { - return __exclusive_scan_and_i64(v, __mask); + return __exclusive_scan_and_i64(v, (UIntMaskType)__mask); } static int32 exclusive_scan_or(int32 v) { - return __exclusive_scan_or_i32(v, (int32)__mask); + return __exclusive_scan_or_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_or(unsigned int32 v) { - return __exclusive_scan_or_i32(v, __mask); + return __exclusive_scan_or_i32(v, (UIntMaskType)__mask); } static int64 exclusive_scan_or(int64 v) { - return __exclusive_scan_or_i64(v, (int32)__mask); + return __exclusive_scan_or_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_or(unsigned int64 v) { - return __exclusive_scan_or_i64(v, __mask); + return __exclusive_scan_or_i64(v, (UIntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -741,23 +761,23 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) { static inline uniform int packed_load_active(uniform unsigned int * uniform a, unsigned int * uniform vals) { - return __packed_load_active(a, vals, (unsigned int32)__mask); + return __packed_load_active(a, vals, (UIntMaskType)__mask); } static inline uniform int packed_store_active(uniform unsigned int * uniform a, unsigned int vals) { - return __packed_store_active(a, vals, (unsigned int32)__mask); + return __packed_store_active(a, vals, (UIntMaskType)__mask); } static inline uniform int packed_load_active(uniform int * uniform a, int * uniform vals) { - return __packed_load_active(a, vals, (int32)__mask); + return __packed_load_active(a, vals, (IntMaskType)__mask); } static inline uniform int packed_store_active(uniform int * uniform a, int vals) { - return __packed_store_active(a, vals, (int32)__mask); + return __packed_store_active(a, vals, (IntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -848,49 +868,49 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ return ret; \ } -DEFINE_ATOMIC_OP(int32,int32,add,add,int32) -DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32) -DEFINE_ATOMIC_OP(int32,int32,and,and,int32) -DEFINE_ATOMIC_OP(int32,int32,or,or,int32) -DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32) -DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32) +DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32) +DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType) -DEFINE_ATOMIC_OP(float,float,swap,swap,int32) +DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,add,add,int32) -DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32) -DEFINE_ATOMIC_OP(int64,int64,and,and,int32) -DEFINE_ATOMIC_OP(int64,int64,or,or,int32) -DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32) -DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32) +DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32) +DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType) -DEFINE_ATOMIC_OP(double,double,swap,swap,int32) +DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType) #undef DEFINE_ATOMIC_OP @@ -913,12 +933,12 @@ static inline uniform TA atomic_compare_exchange_global( \ return ret; \ } -ATOMIC_DECL_CMPXCHG(int32, int32, int32) -ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32) -ATOMIC_DECL_CMPXCHG(float, float, int32) -ATOMIC_DECL_CMPXCHG(int64, int64, int32) -ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32) -ATOMIC_DECL_CMPXCHG(double, double, int32) +ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) +ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) +ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) +ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) #undef ATOMIC_DECL_CMPXCHG diff --git a/stdlib2cpp.py b/stdlib2cpp.py index 132f8257..6fa5fc2e 100755 --- a/stdlib2cpp.py +++ b/stdlib2cpp.py @@ -2,7 +2,9 @@ import sys -print "char stdlib_code[] = { " +t=str(sys.argv[1]) + +print "char stdlib_" + t + "_code[] = { " for line in sys.stdin: for c in line: diff --git a/stmt.cpp b/stmt.cpp index e799fc0b..95142abe 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -622,9 +622,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, /** Given an AST node, check to see if it's safe if we happen to run the code for that node with the execution mask all off. - - FIXME: this is actually a target-specific thing; for non SSE/AVX - targets with more complete masking support, some of this won't apply... */ static bool lCheckAllOffSafety(ASTNode *node, void *data) { @@ -648,6 +645,11 @@ lCheckAllOffSafety(ASTNode *node, void *data) { return false; } + if (g->target.allOffMaskIsSafe == true) + // Don't worry about memory accesses if we have a target that can + // safely run them with the mask all off + return true; + IndexExpr *ie; if ((ie = dynamic_cast(node)) != NULL && ie->baseExpr != NULL) { const Type *type = ie->baseExpr->GetType();
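[Editor's note — hedged sketch, not the literal function. The lCheckAllOffSafety() early-out above keys off the new allOffMaskIsSafe flag: on the generic targets it is safe to let masked memory operations execute with every lane off, so the walker can stop vetting index and pointer expressions; on SSE/AVX the conservative checks still apply, since (per the Target comment earlier in this patch) tricks like the fast gather dereference the base pointer unconditionally. The helper below condenses that decision; safeToRunWithMaskAllOff and canFaultWithAllLanesOff are illustrative names standing in for the real per-node checks.]

// Condensed view of the all-off-mask safety decision after this patch.
static bool safeToRunWithMaskAllOff(bool allOffMaskIsSafe,
                                    bool canFaultWithAllLanesOff) {
    if (allOffMaskIsSafe)
        return true;                  // generic targets: masking honored natively
    return !canFaultWithAllLanesOff;  // SSE/AVX: stay conservative
}

[End of editor's note.]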