From 1d9201fe3d3172778f2f53caaf2db58ac4ca1b99 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Mon, 19 Dec 2011 13:46:50 -0800
Subject: [PATCH] Add "generic" 4, 8, and 16-wide targets.

When used, these targets end up with calls to undefined functions for all
of the various special vector stuff ispc needs to compile ispc programs
(masked store, gather, min/max, sqrt, etc.).  These targets are not yet
useful for anything, but are a step toward having an option to compile to
C++ code with calls out to intrinsics.

Reorganized the directory structure a bit and put the LLVM bitcode used to
define target-specific stuff (as well as some generic built-ins stuff)
into a builtins/ directory.

Note that for building on Windows, it's now necessary to set an
LLVM_VERSION environment variable (with values like LLVM_2_9, LLVM_3_0,
LLVM_3_1svn, etc.)
---
 Makefile                                     |  64 +-
 bitcode2cpp.py                               |   3 +-
 builtins.cpp                                 |  51 +-
 builtins-c.c => builtins/builtins.c          |   0
 builtins-dispatch.ll => builtins/dispatch.ll |   0
 .../target-avx-common.ll                     |   3 +
 .../target-avx-x2.ll                         |  14 +-
 builtins-avx.ll => builtins/target-avx.ll    |  14 +-
 builtins/target-generic-16.ll                |  34 +
 builtins/target-generic-4.ll                 |  34 +
 builtins/target-generic-8.ll                 |  34 +
 builtins/target-generic-common.ll            | 277 ++++++
 .../target-sse2-common.ll                    |   3 +
 .../target-sse2-x2.ll                        |  14 +-
 builtins-sse2.ll => builtins/target-sse2.ll  |  14 +-
 .../target-sse4-common.ll                    |   3 +
 .../target-sse4-x2.ll                        |  14 +-
 builtins-sse4.ll => builtins/target-sse4.ll  |  14 +-
 builtins.m4 => builtins/util.m4              | 847 +++++++++---------
 ctx.cpp                                      |  10 +-
 func.cpp                                     |   5 +-
 ispc.cpp                                     |  52 +-
 ispc.h                                       |  19 +-
 ispc.vcxproj                                 | 165 ++--
 llvmutil.cpp                                 |  27 +-
 module.cpp                                   |  22 +-
 opt.cpp                                      |   2 +-
 parse.yy                                     |   3 +-
 stdlib.ispc                                  | 144 +--
 stdlib2cpp.py                                |   4 +-
 stmt.cpp                                     |   8 +-
 31 files changed, 1249 insertions(+), 649 deletions(-)
 rename builtins-c.c => builtins/builtins.c (100%)
 rename builtins-dispatch.ll => builtins/dispatch.ll (100%)
 rename builtins-avx-common.ll => builtins/target-avx-common.ll (99%)
 rename builtins-avx-x2.ll => builtins/target-avx-x2.ll (99%)
 rename builtins-avx.ll => builtins/target-avx.ll (99%)
 create mode 100644 builtins/target-generic-16.ll
 create mode 100644 builtins/target-generic-4.ll
 create mode 100644 builtins/target-generic-8.ll
 create mode 100644 builtins/target-generic-common.ll
 rename builtins-sse2-common.ll => builtins/target-sse2-common.ll (99%)
 rename builtins-sse2-x2.ll => builtins/target-sse2-x2.ll (99%)
 rename builtins-sse2.ll => builtins/target-sse2.ll (99%)
 rename builtins-sse4-common.ll => builtins/target-sse4-common.ll (99%)
 rename builtins-sse4-x2.ll => builtins/target-sse4-x2.ll (99%)
 rename builtins-sse4.ll => builtins/target-sse4.ll (99%)

diff --git a/Makefile b/Makefile
index 54734f39..f2e18543 100644
--- a/Makefile
+++ b/Makefile
@@ -62,14 +62,17 @@ CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
 	util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
-	builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
+TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
+BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
+	builtins/dispatch.ll
+BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
+	builtins-c-32.cpp builtins-c-64.cpp
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll

-OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o)
$(BUILTINS_SRC:.ll=.o) \ - builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \ - $(FLEX_SRC:.ll=.o)) +OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ + stdlib_generic_ispc.o stdlib_x86_ispc.o \ + $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -104,6 +107,10 @@ objs/%.o: %.cpp @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< +objs/%.o: objs/%.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + objs/parse.cc: parse.yy @echo Running bison on $< @$(YACC) -o $@ $< @@ -120,41 +127,24 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-%.cpp: builtins-%.ll - @echo Creating C++ source from builtin definitions file $< - @m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@ - -objs/builtins-%.o: objs/builtins-%.cpp - @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< - -objs/builtins-c-32.cpp: builtins-c.c +objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< - @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@ + @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | ./bitcode2cpp.py $< > $@ -objs/builtins-c-32.o: objs/builtins-c-32.cpp - @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< - -objs/builtins-c-64.cpp: builtins-c.c +objs/builtins-c-32.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< - @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@ + @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-32 > $@ -objs/builtins-c-64.o: objs/builtins-c-64.cpp - @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< +objs/builtins-c-64.cpp: builtins/builtins.c + @echo Creating C++ source from builtins definition file $< + @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-64 > $@ -objs/stdlib_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< - @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@ +objs/stdlib_generic_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for generic + @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + ./stdlib2cpp.py generic > $@ -objs/stdlib_ispc.o: objs/stdlib_ispc.cpp - @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< - -objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll -objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll -objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll -objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll -objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll -objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll +objs/stdlib_x86_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for x86 + @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ + ./stdlib2cpp.py x86 > $@ diff --git a/bitcode2cpp.py b/bitcode2cpp.py index fa7d4782..a1a5d2bf 100755 --- a/bitcode2cpp.py +++ b/bitcode2cpp.py @@ -11,7 +11,8 @@ length=0 src=str(sys.argv[1]) -target = re.sub(".*builtins-", "", src) +target = re.sub("builtins/target-", "", src) +target = re.sub("builtins/", "", target) target = re.sub("\.ll$", "", target) target = re.sub("\.c$", "", target) target = re.sub("-", "_", target) diff --git a/builtins.cpp b/builtins.cpp index 5358e789..9bd41e8f 100644 --- 
a/builtins.cpp +++ b/builtins.cpp @@ -99,6 +99,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying + if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && + t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; else if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) @@ -194,7 +197,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) { // symbol creation code below assumes that any LLVM vector of i32s is a // varying int32. Here, we need that to be interpreted as a varying // bool, so just have a one-off override for that one... - if (name == "__sext_varying_bool") { + if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") { const Type *returnType = AtomicType::VaryingInt32; std::vector argTypes; argTypes.push_back(AtomicType::VaryingBool); @@ -556,7 +559,7 @@ lSetInternalFunctions(llvm::Module *module) { int count = sizeof(names) / sizeof(names[0]); for (int i = 0; i < count; ++i) { llvm::Function *f = module->getFunction(names[i]); - if (f != NULL) + if (f != NULL && f->empty() == false) f->setLinkage(llvm::GlobalValue::InternalLinkage); } } @@ -744,6 +747,33 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod FATAL("logic error in DefineStdlib"); } break; + case Target::GENERIC: + switch (g->target.vectorWidth) { + case 4: + extern unsigned char builtins_bitcode_generic_4[]; + extern int builtins_bitcode_generic_4_length; + AddBitcodeToModule(builtins_bitcode_generic_4, + builtins_bitcode_generic_4_length, + module, symbolTable); + break; + case 8: + extern unsigned char builtins_bitcode_generic_8[]; + extern int builtins_bitcode_generic_8_length; + AddBitcodeToModule(builtins_bitcode_generic_8, + builtins_bitcode_generic_8_length, + module, symbolTable); + break; + case 16: + extern unsigned char builtins_bitcode_generic_16[]; + extern int builtins_bitcode_generic_16_length; + AddBitcodeToModule(builtins_bitcode_generic_16, + builtins_bitcode_generic_16_length, + module, symbolTable); + break; + default: + FATAL("logic error in DefineStdlib"); + } + break; default: FATAL("logic error"); } @@ -771,11 +801,16 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod if (includeStdlibISPC) { // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its - // definitions added. Disable emission of performance warnings for - // now, since the user doesn't care about any of that in the stdlib - // implementation... - extern char stdlib_code[]; - yy_scan_string(stdlib_code); - yyparse(); + // definitions added. 
+ if (g->target.isa == Target::GENERIC) { + extern char stdlib_generic_code[]; + yy_scan_string(stdlib_generic_code); + yyparse(); + } + else { + extern char stdlib_x86_code[]; + yy_scan_string(stdlib_x86_code); + yyparse(); + } } } diff --git a/builtins-c.c b/builtins/builtins.c similarity index 100% rename from builtins-c.c rename to builtins/builtins.c diff --git a/builtins-dispatch.ll b/builtins/dispatch.ll similarity index 100% rename from builtins-dispatch.ll rename to builtins/dispatch.ll diff --git a/builtins-avx-common.ll b/builtins/target-avx-common.ll similarity index 99% rename from builtins-avx-common.ll rename to builtins/target-avx-common.ll index 6b08466d..07fb12b4 100644 --- a/builtins-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -32,6 +32,9 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; AVX target implementation. +ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-avx-x2.ll b/builtins/target-avx-x2.ll similarity index 99% rename from builtins-avx-x2.ll rename to builtins/target-avx-x2.ll index 6254c405..90e2680c 100644 --- a/builtins-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -32,12 +32,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Basic 16-wide definitions -stdlib_core(16) -packed_load_and_store(16) -scans(16) -int64minmax(16) +define(`WIDTH',`16') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-avx-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-avx.ll b/builtins/target-avx.ll similarity index 99% rename from builtins-avx.ll rename to builtins/target-avx.ll index a00a527e..dc7339bd 100644 --- a/builtins-avx.ll +++ b/builtins/target-avx.ll @@ -32,12 +32,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Basic 8-wide definitions -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-avx-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll new file mode 100644 index 00000000..807fd242 --- /dev/null +++ b/builtins/target-generic-16.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. 
+;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll new file mode 100644 index 00000000..7eb1f300 --- /dev/null +++ b/builtins/target-generic-4.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`4') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll new file mode 100644 index 00000000..bd9261ff --- /dev/null +++ b/builtins/target-generic-8.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll new file mode 100644 index 00000000..b59e8d53 --- /dev/null +++ b/builtins/target-generic-common.ll @@ -0,0 +1,277 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+define(`MASK',`i1')
+include(`util.m4')
+
+stdlib_core()
+
+scans()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; math
+
+declare void @__fastmath() nounwind
+
+;; round/floor/ceil
+
+declare float @__round_uniform_float(float) nounwind readnone
+declare float @__floor_uniform_float(float) nounwind readnone
+declare float @__ceil_uniform_float(float) nounwind readnone
+
+declare double @__round_uniform_double(double) nounwind readnone
+declare double @__floor_uniform_double(double) nounwind readnone
+declare double @__ceil_uniform_double(double) nounwind readnone
+
+declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
+declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
+declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
+declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
+declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
+declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
+
+;; min/max
+
+declare float @__max_uniform_float(float, float) nounwind readnone
+declare float @__min_uniform_float(float, float) nounwind readnone
+declare i32 @__min_uniform_int32(i32, i32) nounwind readnone
+declare i32 @__max_uniform_int32(i32, i32) nounwind readnone
+declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone
+declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone
+declare i64 @__min_uniform_int64(i64, i64) nounwind readnone
+declare i64 @__max_uniform_int64(i64, i64) nounwind readnone
+declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone
+declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone
+declare double @__min_uniform_double(double, double) nounwind readnone
+declare double @__max_uniform_double(double, double) nounwind readnone
+
+declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
+                                             <WIDTH x float>) nounwind readnone
+declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
+                                             <WIDTH x float>) nounwind readnone
+declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
+declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
+declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
+declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
+declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
+                                               <WIDTH x double>) nounwind readnone
+declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
+                                               <WIDTH x double>) nounwind readnone
+
+;; sqrt/rsqrt/rcp
+
+declare float @__rsqrt_uniform_float(float) nounwind readnone
+declare float @__rcp_uniform_float(float) nounwind readnone
+declare float @__sqrt_uniform_float(float) nounwind readnone
+declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
+declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %v) nounwind readnone
+declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
+
+declare double @__sqrt_uniform_double(double) nounwind readnone
+declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone
+
+;; bit ops
+
+declare i32 @__popcnt_int32(i32) nounwind readnone
+declare i64 @__popcnt_int64(i64) nounwind readnone
+
+declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
+declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
+declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
+declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
+
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones twice with our 8-wide
+; vectors...
+
+declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
+declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
+declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
+declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
+declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
+declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
+declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
+declare <WIDTH x float> @__svml_log(<WIDTH x float>)
+declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; reductions
+
+declare i32 @__movmsk(<WIDTH x MASK>) nounwind readnone
+
+declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
+declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
+declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
+
+declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
+declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
+declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
+
+declare i32 @__reduce_add_uint32(<WIDTH x i32> %v) nounwind readnone
+declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
+declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
+
+declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone
+declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone
+declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
+
+declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
+declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
+declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
+
+declare i64 @__reduce_add_uint64(<WIDTH x i64> %v) nounwind readnone
+declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
+declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
+
+declare i1 @__reduce_equal_int32(<WIDTH x i32> %v, i32 * nocapture %samevalue,
+                                 <WIDTH x MASK> %mask) nounwind
+declare i1 @__reduce_equal_float(<WIDTH x float> %v, float * nocapture %samevalue,
+                                 <WIDTH x MASK> %mask) nounwind
+declare i1 @__reduce_equal_int64(<WIDTH x i64> %v, i64 * nocapture %samevalue,
+                                 <WIDTH x MASK> %mask) nounwind
+declare i1 @__reduce_equal_double(<WIDTH x double> %v, double * nocapture %samevalue,
+                                  <WIDTH x MASK> %mask) nounwind
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+load_and_broadcast(WIDTH, i8, 8)
+load_and_broadcast(WIDTH, i16, 16)
+load_and_broadcast(WIDTH, i32, 32)
+load_and_broadcast(WIDTH, i64, 64)
+
+declare <WIDTH x i8> @__load_masked_8(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
+declare <WIDTH x i16> @__load_masked_16(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
+declare <WIDTH x i32> @__load_masked_32(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
+declare <WIDTH x i64> @__load_masked_64(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
+
+declare void @__masked_store_8(<WIDTH x i8> * nocapture, <WIDTH x i8>,
+                               <WIDTH x MASK>) nounwind
+declare void @__masked_store_16(<WIDTH x i16> * nocapture, <WIDTH x i16>,
+                                <WIDTH x MASK>) nounwind
+declare void @__masked_store_32(<WIDTH x i32> * nocapture, <WIDTH x i32>,
+                                <WIDTH x MASK>) nounwind
+declare void @__masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>,
+                                <WIDTH x MASK> %mask) nounwind
+
+ifelse(LLVM_VERSION,LLVM_3_1svn,`
+define void @__masked_store_blend_8(<WIDTH x i8> * nocapture, <WIDTH x i8>,
+                                    <WIDTH x MASK>) nounwind {
+  %v = load <WIDTH x i8> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
+  store <WIDTH x i8> %v1, <WIDTH x i8> * %0
+  ret void
+}
+
+define void @__masked_store_blend_16(<WIDTH x i16> * nocapture, <WIDTH x i16>,
+                                     <WIDTH x MASK>) nounwind {
+  %v = load <WIDTH x i16> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
+  store <WIDTH x i16> %v1, <WIDTH x i16> * %0
+  ret void
+}
+
+define void @__masked_store_blend_32(<WIDTH x i32> * nocapture, <WIDTH x i32>,
+                                     <WIDTH x MASK>) nounwind {
+  %v = load <WIDTH x i32> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
+  store <WIDTH x i32> %v1, <WIDTH x i32> * %0
+  ret void
+}
+
+define void @__masked_store_blend_64(<WIDTH x i64> * nocapture,
+                                     <WIDTH x i64>, <WIDTH x MASK>) nounwind {
+  %v = load <WIDTH x i64> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
+  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
+  ret void
+}
+',`
+declare void @__masked_store_blend_8(<WIDTH x i8> * nocapture, <WIDTH x i8>,
+                                     <WIDTH x MASK>) nounwind
+declare void @__masked_store_blend_16(<WIDTH x i16> * nocapture, <WIDTH x i16>,
+                                      <WIDTH x MASK>) nounwind
+declare void @__masked_store_blend_32(<WIDTH x i32> * nocapture, <WIDTH x i32>,
+                                      <WIDTH x MASK>) nounwind
+declare void @__masked_store_blend_64(<WIDTH x i64> * nocapture %ptr,
+                                      <WIDTH x i64> %new,
+                                      <WIDTH x MASK> %mask) nounwind
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+define(`gather_scatter', `
+declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture %ptr, <WIDTH x i32> %offsets,
+                             i32 %offset_scale, <WIDTH x MASK> %vecmask) nounwind readonly
+declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture %ptr, <WIDTH x i64> %offsets,
+                             i32 %offset_scale, <WIDTH x MASK> %vecmask) nounwind readonly
+declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
+                                    <WIDTH x MASK> %vecmask) nounwind readonly
+declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
+                                    <WIDTH x MASK> %vecmask) nounwind readonly
+
+declare void @__scatter_base_offsets32_$1(i8* nocapture %base, <WIDTH x i32> %offsets,
+                   i32 %offset_scale, <WIDTH x $1> %values, <WIDTH x MASK> %mask) nounwind
+declare void @__scatter_base_offsets64_$1(i8* nocapture %base, <WIDTH x i64> %offsets,
+                   i32 %offset_scale, <WIDTH x $1> %values, <WIDTH x MASK> %mask) nounwind
+declare void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,
+                             <WIDTH x MASK> %mask) nounwind
+declare void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,
+                             <WIDTH x MASK> %mask) nounwind
+')
+
+gather_scatter(i8)
+gather_scatter(i16)
+gather_scatter(i32)
+gather_scatter(i64)
+
+declare i32 @__packed_load_active(i32 * nocapture %startptr, <WIDTH x i32> * nocapture %val_ptr,
+                                  <WIDTH x MASK> %full_mask) nounwind
+declare i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
+                                   <WIDTH x MASK> %full_mask) nounwind
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefetch
+
+declare void @__prefetch_read_uniform_1(i8 *) nounwind readnone
+declare void @__prefetch_read_uniform_2(i8 *) nounwind readnone
+declare void @__prefetch_read_uniform_3(i8 *) nounwind readnone
+declare void @__prefetch_read_uniform_nt(i8 *) nounwind readnone
+
diff --git a/builtins-sse2-common.ll b/builtins/target-sse2-common.ll
similarity index 99%
rename from builtins-sse2-common.ll
rename to builtins/target-sse2-common.ll
index 659bdda7..80c34afb 100644
--- a/builtins-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -29,6 +29,9 @@
 ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse2-x2.ll b/builtins/target-sse2-x2.ll similarity index 99% rename from builtins-sse2-x2.ll rename to builtins/target-sse2-x2.ll index b5eaa889..a9d71ea9 100644 --- a/builtins-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -36,12 +36,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; standard 8-wide definitions from m4 macros -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse2-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse2.ll b/builtins/target-sse2.ll similarity index 99% rename from builtins-sse2.ll rename to builtins/target-sse2.ll index c49d6b2c..1a297199 100644 --- a/builtins-sse2.ll +++ b/builtins/target-sse2.ll @@ -33,12 +33,16 @@ ;; Define the standard library builtins for the SSE2 target ; Define some basics for a 4-wide target -stdlib_core(4) -packed_load_and_store(4) -scans(4) -int64minmax(4) +define(`WIDTH',`4') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse2-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding diff --git a/builtins-sse4-common.ll b/builtins/target-sse4-common.ll similarity index 99% rename from builtins-sse4-common.ll rename to builtins/target-sse4-common.ll index f1ee95dc..19d31ce4 100644 --- a/builtins-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -29,6 +29,9 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins-sse4-x2.ll b/builtins/target-sse4-x2.ll similarity index 99% rename from builtins-sse4-x2.ll rename to builtins/target-sse4-x2.ll index fd399884..764f8613 100644 --- a/builtins-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -36,12 +36,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; standard 8-wide definitions from m4 macros -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse4-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse4.ll b/builtins/target-sse4.ll similarity index 99% rename from builtins-sse4.ll rename to builtins/target-sse4.ll index 68c44a0e..7eadde4b 100644 --- a/builtins-sse4.ll +++ b/builtins/target-sse4.ll @@ -33,12 +33,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Define common 4-wide stuff -stdlib_core(4) -packed_load_and_store(4) -scans(4) -int64minmax(4) +define(`WIDTH',`4') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse4-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins.m4 b/builtins/util.m4 similarity index 82% rename from builtins.m4 rename to builtins/util.m4 index f83bdbff..8853e81c 100644 --- a/builtins.m4 +++ b/builtins/util.m4 @@ -550,103 +550,103 @@ divert`'dnl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stdlib_core ;; -;; This macro defines a bunch of helper routines that only depend on the -;; target's vector width, which it takes as its first parameter. 
+;; This macro defines a bunch of helper routines that depend on the +;; target's vector width ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`shuffles', ` -define <$1 x $2> @__broadcast_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { - %v = extractelement <$1 x $2> %0, i32 %1 - %r_0 = insertelement <$1 x $2> undef, $2 %v, i32 0 -forloop(i, 1, eval($1-1), ` %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2 %v, i32 i +define @__broadcast_$2(, i32) nounwind readnone alwaysinline { + %v = extractelement %0, i32 %1 + %r_0 = insertelement undef, $1 %v, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %r_`'i = insertelement %r_`'eval(i-1), $1 %v, i32 i ') - ret <$1 x $2> %r_`'eval($1-1) + ret %r_`'eval(WIDTH-1) } -define <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { +define @__rotate_$2(, i32) nounwind readnone alwaysinline { %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1) br i1 %isc, label %is_const, label %not_const is_const: ; though verbose, this turms into tight code if %1 is a constant -forloop(i, 0, eval($1-1), ` +forloop(i, 0, eval(WIDTH-1), ` %delta_`'i = add i32 %1, i - %delta_clamped_`'i = and i32 %delta_`'i, eval($1-1) - %v_`'i = extractelement <$1 x $2> %0, i32 %delta_clamped_`'i') + %delta_clamped_`'i = and i32 %delta_`'i, eval(WIDTH-1) + %v_`'i = extractelement %0, i32 %delta_clamped_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) not_const: ; store two instances of the vector into memory - %ptr = alloca <$1 x $2>, i32 2 - %ptr0 = getelementptr <$1 x $2> * %ptr, i32 0 - store <$1 x $2> %0, <$1 x $2> * %ptr0 - %ptr1 = getelementptr <$1 x $2> * %ptr, i32 1 - store <$1 x $2> %0, <$1 x $2> * %ptr1 + %ptr = alloca , i32 2 + %ptr0 = getelementptr * %ptr, i32 0 + store %0, * %ptr0 + %ptr1 = getelementptr * %ptr, i32 1 + store %0, * %ptr1 ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector - %offset = and i32 %1, eval($1-1) - %ptr_as_elt_array = bitcast <$1 x $2> * %ptr to [eval(2*$1) x $2] * - %load_ptr = getelementptr [eval(2*$1) x $2] * %ptr_as_elt_array, i32 0, i32 %offset - %load_ptr_vec = bitcast $2 * %load_ptr to <$1 x $2> * - %result = load <$1 x $2> * %load_ptr_vec, align $4 - ret <$1 x $2> %result + %offset = and i32 %1, eval(WIDTH-1) + %ptr_as_elt_array = bitcast * %ptr to [eval(2*WIDTH) x $1] * + %load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset + %load_ptr_vec = bitcast $1 * %load_ptr to * + %result = load * %load_ptr_vec, align $3 + ret %result } -define <$1 x $2> @__shuffle_$3(<$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline { -forloop(i, 0, eval($1-1), ` - %index_`'i = extractelement <$1 x i32> %1, i32 i') -forloop(i, 0, eval($1-1), ` - %v_`'i = extractelement <$1 x $2> %0, i32 %index_`'i') +define @__shuffle_$2(, ) nounwind readnone alwaysinline { +forloop(i, 0, eval(WIDTH-1), ` + %index_`'i = extractelement %1, i32 i') +forloop(i, 0, eval(WIDTH-1), ` + %v_`'i = extractelement %0, i32 %index_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, 
eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) } -define <$1 x $2> @__shuffle2_$3(<$1 x $2>, <$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline { - %v2 = shufflevector <$1 x $2> %0, <$1 x $2> %1, < - forloop(i, 0, eval(2*$1-2), `i32 i, ') i32 eval(2*$1-1) +define @__shuffle2_$2(, , ) nounwind readnone alwaysinline { + %v2 = shufflevector %0, %1, < + forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1) > -forloop(i, 0, eval($1-1), ` - %index_`'i = extractelement <$1 x i32> %2, i32 i') +forloop(i, 0, eval(WIDTH-1), ` + %index_`'i = extractelement %2, i32 i') - %isc = call i1 @__is_compile_time_constant_varying_int32(<$1 x i32> %2) + %isc = call i1 @__is_compile_time_constant_varying_int32( %2) br i1 %isc, label %is_const, label %not_const is_const: ; extract from the requested lanes and insert into the result; LLVM turns ; this into good code in the end -forloop(i, 0, eval($1-1), ` - %v_`'i = extractelement %v2, i32 %index_`'i') +forloop(i, 0, eval(WIDTH-1), ` + %v_`'i = extractelement %v2, i32 %index_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) not_const: ; otherwise store the two vectors onto the stack and then use the given ; permutation vector to get indices into that array... - %ptr = alloca - store %v2, * %ptr - %baseptr = bitcast * %ptr to $2 * + %ptr = alloca + store %v2, * %ptr + %baseptr = bitcast * %ptr to $1 * - %ptr_0 = getelementptr $2 * %baseptr, i32 %index_0 - %val_0 = load $2 * %ptr_0 - %result_0 = insertelement <$1 x $2> undef, $2 %val_0, i32 0 + %ptr_0 = getelementptr $1 * %baseptr, i32 %index_0 + %val_0 = load $1 * %ptr_0 + %result_0 = insertelement undef, $1 %val_0, i32 0 -forloop(i, 1, eval($1-1), ` - %ptr_`'i = getelementptr $2 * %baseptr, i32 %index_`'i - %val_`'i = load $2 * %ptr_`'i - %result_`'i = insertelement <$1 x $2> %result_`'eval(i-1), $2 %val_`'i, i32 i +forloop(i, 1, eval(WIDTH-1), ` + %ptr_`'i = getelementptr $1 * %baseptr, i32 %index_`'i + %val_`'i = load $1 * %ptr_`'i + %result_`'i = insertelement %result_`'eval(i-1), $1 %val_`'i, i32 i ') - ret <$1 x $2> %result_`'eval($1-1) + ret %result_`'eval(WIDTH-1) } ') @@ -676,18 +676,20 @@ forloop(i, 1, eval($1-1), ` define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, - <$1 x i32> %m) nounwind alwaysinline { + <$1 x MASK> %m) nounwind alwaysinline { ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. ; for the bit tricks below, we need the mask to be sign extended to be ; the size of the element type. - ifelse($3, `i64', `%mask = sext <$1 x i32> %m to <$1 x i64>') - ifelse($3, `i32', ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' + ifelse( + MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', + $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', + $3,i32, ` + ; silly workaround to do %mask = %m, which is not possible directly.. 
+ %maskmem = alloca <$1 x i32> + store <$1 x i32> %m, <$1 x i32> * %maskmem + %mask = load <$1 x i32> * %maskmem' ) ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -751,13 +753,13 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta) define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val) ret $3 %r } ', ` define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst ret $3 %r } @@ -778,11 +780,11 @@ declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)') define(`global_swap', ` define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %rptr = alloca <$1 x $2> %rptr32 = bitcast <$1 x $2> * %rptr to $2 * - per_lane($1, <$1 x i32> %mask, ` + per_lane($1, <$1 x MASK> %mask, ` %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE ifelse(LLVM_VERSION, `LLVM_2_9',` %r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', ` @@ -795,7 +797,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` } define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { ifelse(LLVM_VERSION, `LLVM_2_9',` %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', ` %r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst') @@ -816,11 +818,11 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` declare $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)') define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp, - <$1 x $2> %val, <$1 x i32> %mask) nounwind alwaysinline { + <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline { %rptr = alloca <$1 x $2> %rptr32 = bitcast <$1 x $2> * %rptr to $2 * - per_lane($1, <$1 x i32> %mask, ` + per_lane($1, <$1 x MASK> %mask, ` %cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE ifelse(LLVM_VERSION, `LLVM_2_9',` @@ -835,7 +837,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` } define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp, - $2 %val, <$1 x i32> %mask) nounwind alwaysinline { + $2 %val, <$1 x MASK> %mask) nounwind alwaysinline { ifelse(LLVM_VERSION, `LLVM_2_9',` %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', ` %r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst') @@ -844,6 +846,85 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; count trailing zeros + +define(`ctlztz', ` +define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.cttz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.cttz.i64(i64 %0) + ret i64 %c +} + +define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.ctlz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.ctlz.i64(i64 %0) + ret i64 %c +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetching + +define(`define_prefetches', ` +ifelse(LLVM_VERSION, 
`LLVM_2_9', +` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality) + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 2) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0) + ret void +} +', ` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, + i32 %cachetype) ; cachetype == 1 is dcache + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) + ret void +} +') +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + define(`stdlib_core', ` @@ -854,8 +935,8 @@ declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind -declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask) -declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>) +declare i1 @__is_compile_time_constant_mask( %mask) +declare i1 @__is_compile_time_constant_varying_int32() ; This function declares placeholder masked store functions for the ; front-end to use. @@ -869,10 +950,10 @@ declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>) ; stores (if the mask is all on) by the MaskedStoreOptPass optimization ; pass. -declare void @__pseudo_masked_store_8(<$1 x i8> * nocapture, <$1 x i8>, <$1 x i32>) -declare void @__pseudo_masked_store_16(<$1 x i16> * nocapture, <$1 x i16>, <$1 x i32>) -declare void @__pseudo_masked_store_32(<$1 x i32> * nocapture, <$1 x i32>, <$1 x i32>) -declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x i32>) +declare void @__pseudo_masked_store_8( * nocapture, , ) +declare void @__pseudo_masked_store_16( * nocapture, , ) +declare void @__pseudo_masked_store_32( * nocapture, , ) +declare void @__pseudo_masked_store_64( * nocapture, , ) ; Declare the pseudo-gather functions. When the ispc front-end needs ; to perform a gather, it generates a call to one of these functions, @@ -904,33 +985,33 @@ declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x ; converts them to native gather functions or converts them to vector ; loads, if equivalent. 
-declare <$1 x i8> @__pseudo_gather32_8(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather32_16(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather32_32(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather32_64(<$1 x i32>, <$1 x i32>) nounwind readonly +declare @__pseudo_gather32_8(, ) nounwind readonly +declare @__pseudo_gather32_16(, ) nounwind readonly +declare @__pseudo_gather32_32(, ) nounwind readonly +declare @__pseudo_gather32_64(, ) nounwind readonly -declare <$1 x i8> @__pseudo_gather64_8(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather64_16(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather64_32(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather64_64(<$1 x i64>, <$1 x i32>) nounwind readonly +declare @__pseudo_gather64_8(, ) nounwind readonly +declare @__pseudo_gather64_16(, ) nounwind readonly +declare @__pseudo_gather64_32(, ) nounwind readonly +declare @__pseudo_gather64_64(, ) nounwind readonly -declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly +declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, + ) nounwind readonly -declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly +declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, + ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: @@ -955,94 +1036,94 @@ declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32, ; And the GSImprovementsPass in turn converts these to actual native ; scatters or masked stores. 
-declare void @__pseudo_scatter32_8(<$1 x i32>, <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_16(<$1 x i32>, <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_32(<$1 x i32>, <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_64(<$1 x i32>, <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter32_8(, , ) nounwind +declare void @__pseudo_scatter32_16(, , ) nounwind +declare void @__pseudo_scatter32_32(, , ) nounwind +declare void @__pseudo_scatter32_64(, , ) nounwind -declare void @__pseudo_scatter64_8(<$1 x i64>, <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_16(<$1 x i64>, <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_32(<$1 x i64>, <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_64(<$1 x i64>, <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter64_8(, , ) nounwind +declare void @__pseudo_scatter64_16(, , ) nounwind +declare void @__pseudo_scatter64_32(, , ) nounwind +declare void @__pseudo_scatter64_64(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>, i32, - <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>, i32, - <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>, i32, - <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>, i32, - <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, + , ) nounwind -declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>, i32, - <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>, i32, - <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>, i32, - <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>, i32, - <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, + , ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops -define i8 @__extract_int8(<$1 x i8>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i8> %0, i32 %1 +define i8 @__extract_int8(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i8 %extract } -define <$1 x i8> @__insert_int8(<$1 x i8>, i32, +define @__insert_int8(, i32, i8) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i8> %0, i8 %2, i32 %1 - ret <$1 x i8> %insert + %insert = insertelement %0, i8 %2, i32 %1 + ret %insert } -define i16 @__extract_int16(<$1 x i16>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i16> %0, i32 %1 +define i16 @__extract_int16(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i16 %extract 
} -define <$1 x i16> @__insert_int16(<$1 x i16>, i32, +define @__insert_int16(, i32, i16) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i16> %0, i16 %2, i32 %1 - ret <$1 x i16> %insert + %insert = insertelement %0, i16 %2, i32 %1 + ret %insert } -define i32 @__extract_int32(<$1 x i32>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i32> %0, i32 %1 +define i32 @__extract_int32(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i32 %extract } -define <$1 x i32> @__insert_int32(<$1 x i32>, i32, +define @__insert_int32(, i32, i32) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i32> %0, i32 %2, i32 %1 - ret <$1 x i32> %insert + %insert = insertelement %0, i32 %2, i32 %1 + ret %insert } -define i64 @__extract_int64(<$1 x i64>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i64> %0, i32 %1 +define i64 @__extract_int64(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i64 %extract } -define <$1 x i64> @__insert_int64(<$1 x i64>, i32, +define @__insert_int64(, i32, i64) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i64> %0, i64 %2, i32 %1 - ret <$1 x i64> %insert + %insert = insertelement %0, i64 %2, i32 %1 + ret %insert } -shuffles($1, i8, int8, 1) -shuffles($1, i16, int16, 2) -shuffles($1, float, float, 4) -shuffles($1, i32, int32, 4) -shuffles($1, double, double, 8) -shuffles($1, i64, int64, 8) +shuffles(i8, int8, 1) +shuffles(i16, int16, 2) +shuffles(float, float, 4) +shuffles(i32, int32, 4) +shuffles(double, double, 8) +shuffles(i64, int64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; various bitcasts from one type to another -define <$1 x i32> @__intbits_varying_float(<$1 x float>) nounwind readnone alwaysinline { - %float_to_int_bitcast = bitcast <$1 x float> %0 to <$1 x i32> - ret <$1 x i32> %float_to_int_bitcast +define @__intbits_varying_float() nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast %0 to + ret %float_to_int_bitcast } define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { @@ -1050,9 +1131,9 @@ define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { ret i32 %float_to_int_bitcast } -define <$1 x i64> @__intbits_varying_double(<$1 x double>) nounwind readnone alwaysinline { - %double_to_int_bitcast = bitcast <$1 x double> %0 to <$1 x i64> - ret <$1 x i64> %double_to_int_bitcast +define @__intbits_varying_double() nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast %0 to + ret %double_to_int_bitcast } define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { @@ -1060,9 +1141,9 @@ define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { ret i64 %double_to_int_bitcast } -define <$1 x float> @__floatbits_varying_int32(<$1 x i32>) nounwind readnone alwaysinline { - %int_to_float_bitcast = bitcast <$1 x i32> %0 to <$1 x float> - ret <$1 x float> %int_to_float_bitcast +define @__floatbits_varying_int32() nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast %0 to + ret %int_to_float_bitcast } define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { @@ -1070,9 +1151,9 @@ define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { ret float %int_to_float_bitcast } -define <$1 x double> @__doublebits_varying_int64(<$1 x i64>) nounwind readnone alwaysinline { - %int_to_double_bitcast = bitcast <$1 x i64> %0 to <$1 x 
double> - ret <$1 x double> %int_to_double_bitcast +define @__doublebits_varying_int64() nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast %0 to + ret %int_to_double_bitcast } define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { @@ -1080,8 +1161,8 @@ define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { ret double %int_to_double_bitcast } -define <$1 x float> @__undef_varying() nounwind readnone alwaysinline { - ret <$1 x float> undef +define @__undef_varying() nounwind readnone alwaysinline { + ret undef } define float @__undef_uniform() nounwind readnone alwaysinline { @@ -1096,31 +1177,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { ret i32 %r } -define <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone alwaysinline { - ret <$1 x i32> %0 -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; count trailing zeros - -define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { - %c = call i32 @llvm.cttz.i32(i32 %0) - ret i32 %c -} - -define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { - %c = call i64 @llvm.cttz.i64(i64 %0) - ret i64 %c -} - -define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { - %c = call i32 @llvm.ctlz.i32(i32 %0) - ret i32 %c -} - -define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { - %c = call i64 @llvm.ctlz.i64(i64 %0) - ret i64 %c +define @__sext_varying_bool() nounwind readnone alwaysinline { + ifelse(MASK,i1, ` + %se = sext %0 to + ret %se + ', ` + ret %0') } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1670,184 +1732,133 @@ define void define void @__aos_to_soa4_float(float * noalias %p, - <$1 x float> * noalias %out0, <$1 x float> * noalias %out1, - <$1 x float> * noalias %out2, <$1 x float> * noalias %out3) + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) nounwind alwaysinline { - %p0 = bitcast float * %p to <$1 x float> * - %v0 = load <$1 x float> * %p0, align 4 - %p1 = getelementptr <$1 x float> * %p0, i32 1 - %v1 = load <$1 x float> * %p1, align 4 - %p2 = getelementptr <$1 x float> * %p0, i32 2 - %v2 = load <$1 x float> * %p2, align 4 - %p3 = getelementptr <$1 x float> * %p0, i32 3 - %v3 = load <$1 x float> * %p3, align 4 - call void @__aos_to_soa4_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, - <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) + %p0 = bitcast float * %p to * + %v0 = load * %p0, align 4 + %p1 = getelementptr * %p0, i32 1 + %v1 = load * %p1, align 4 + %p2 = getelementptr * %p0, i32 2 + %v2 = load * %p2, align 4 + %p3 = getelementptr * %p0, i32 3 + %v3 = load * %p3, align 4 + call void @__aos_to_soa4_float`'WIDTH ( %v0, %v1, + %v2, %v3, * %out0, + * %out1, * %out2, * %out3) ret void } define void @__aos_to_soa4_int32(i32 * noalias %ptr, - <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, - <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3) + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) nounwind alwaysinline { %fptr = bitcast i32 * %ptr to float * - %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * - %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * - %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> * - %fout3 = bitcast <$1 x i32> * %out3 to <$1 x float> * + %fout0 = bitcast * %out0 to * + %fout1 = bitcast * %out1 to * + %fout2 = bitcast * %out2 
to * + %fout3 = bitcast * %out3 to * call void @__aos_to_soa4_float(float * %fptr, - <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2, - <$1 x float> * %fout3) + * %fout0, * %fout1, * %fout2, + * %fout3) ret void } define void -@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, - <$1 x float> %v3, float * noalias %p) nounwind alwaysinline { - %out0 = bitcast float * %p to <$1 x float> * - %out1 = getelementptr <$1 x float> * %out0, i32 1 - %out2 = getelementptr <$1 x float> * %out0, i32 2 - %out3 = getelementptr <$1 x float> * %out0, i32 3 - call void @__soa_to_aos4_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, - <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) +@__soa_to_aos4_float( %v0, %v1, %v2, + %v3, float * noalias %p) nounwind alwaysinline { + %out0 = bitcast float * %p to * + %out1 = getelementptr * %out0, i32 1 + %out2 = getelementptr * %out0, i32 2 + %out3 = getelementptr * %out0, i32 3 + call void @__soa_to_aos4_float`'WIDTH ( %v0, %v1, + %v2, %v3, * %out0, + * %out1, * %out2, * %out3) ret void } define void -@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, - <$1 x i32> %v3, i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> - %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> - %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> - %fv3 = bitcast <$1 x i32> %v3 to <$1 x float> +@__soa_to_aos4_int32( %v0, %v1, %v2, + %v3, i32 * noalias %base) nounwind alwaysinline { + %fv0 = bitcast %v0 to + %fv1 = bitcast %v1 to + %fv2 = bitcast %v2 to + %fv3 = bitcast %v3 to %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos4_float(<$1 x float> %fv0, <$1 x float> %fv1, - <$1 x float> %fv2, <$1 x float> %fv3, float * %fbase) + call void @__soa_to_aos4_float( %fv0, %fv1, + %fv2, %fv3, float * %fbase) ret void } define void @__aos_to_soa3_float(float * noalias %p, - <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) nounwind alwaysinline { - %p0 = bitcast float * %p to <$1 x float> * - %v0 = load <$1 x float> * %p0, align 4 - %p1 = getelementptr <$1 x float> * %p0, i32 1 - %v1 = load <$1 x float> * %p1, align 4 - %p2 = getelementptr <$1 x float> * %p0, i32 2 - %v2 = load <$1 x float> * %p2, align 4 - call void @__aos_to_soa3_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) + * %out0, * %out1, + * %out2) nounwind alwaysinline { + %p0 = bitcast float * %p to * + %v0 = load * %p0, align 4 + %p1 = getelementptr * %p0, i32 1 + %v1 = load * %p1, align 4 + %p2 = getelementptr * %p0, i32 2 + %v2 = load * %p2, align 4 + call void @__aos_to_soa3_float`'WIDTH ( %v0, %v1, + %v2, * %out0, * %out1, + * %out2) ret void } define void @__aos_to_soa3_int32(i32 * noalias %base, - <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, - <$1 x i32> * noalias %out2) nounwind alwaysinline { + * noalias %out0, * noalias %out1, + * noalias %out2) nounwind alwaysinline { %fbase = bitcast i32 * %base to float * - %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * - %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * - %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> * + %fout0 = bitcast * %out0 to * + %fout1 = bitcast * %out1 to * + %fout2 = bitcast * %out2 to * call void @__aos_to_soa3_float(float * %fbase, - <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2) + * %fout0, * %fout1, * %fout2) ret void } define void 
-@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, +@__soa_to_aos3_float( %v0, %v1, %v2, float * noalias %p) nounwind alwaysinline { - %out0 = bitcast float * %p to <$1 x float> * - %out1 = getelementptr <$1 x float> * %out0, i32 1 - %out2 = getelementptr <$1 x float> * %out0, i32 2 - call void @__soa_to_aos3_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) + %out0 = bitcast float * %p to * + %out1 = getelementptr * %out0, i32 1 + %out2 = getelementptr * %out0, i32 2 + call void @__soa_to_aos3_float`'WIDTH ( %v0, %v1, + %v2, * %out0, * %out1, + * %out2) ret void } define void -@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, +@__soa_to_aos3_int32( %v0, %v1, %v2, i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> - %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> - %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> + %fv0 = bitcast %v0 to + %fv1 = bitcast %v1 to + %fv2 = bitcast %v2 to %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1, - <$1 x float> %fv2, float * %fbase) + call void @__soa_to_aos3_float( %fv0, %fv1, + %fv2, float * %fbase) ret void } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetching - -ifelse(LLVM_VERSION, `LLVM_2_9', -` -declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality) - -define void @__prefetch_read_uniform_1(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 3) - ret void -} - -define void @__prefetch_read_uniform_2(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 2) - ret void -} - -define void @__prefetch_read_uniform_3(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 1) - ret void -} - -define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 0) - ret void -} -', ` -declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, - i32 %cachetype) ; cachetype == 1 is dcache - -define void @__prefetch_read_uniform_1(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) - ret void -} - -define void @__prefetch_read_uniform_2(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) - ret void -} - -define void @__prefetch_read_uniform_3(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) - ret void -} - -define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) - ret void -} -') - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; assert declare i32 @printf(i8*, ...) 
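For reference, the __aos_to_soa3_float builtin above deinterleaves WIDTH packed (x, y, z) triples into three WIDTH-wide outputs, and __soa_to_aos3_float is its inverse. A scalar C++ sketch of the same data movement, assuming a 4-wide target; the function name and the fixed width are illustrative only, not part of ispc:

// Reference-only model of the shuffle __aos_to_soa3_float performs when WIDTH == 4.
// p holds 4 packed triples: x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3.
static void aos_to_soa3_float_ref(const float *p,
                                  float out0[4], float out1[4], float out2[4]) {
    for (int i = 0; i < 4; ++i) {
        out0[i] = p[3 * i + 0];   // all x components
        out1[i] = p[3 * i + 1];   // all y components
        out2[i] = p[3 * i + 2];   // all z components
    }
}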
declare void @abort() noreturn -define void @__do_assert_uniform(i8 *%str, i1 %test, <$1 x i32> %mask) { +define void @__do_assert_uniform(i8 *%str, i1 %test, %mask) { br i1 %test, label %ok, label %fail fail: @@ -1860,12 +1871,12 @@ ok: } -define void @__do_assert_varying(i8 *%str, <$1 x i32> %test, - <$1 x i32> %mask) { - %nottest = xor <$1 x i32> %test, - < forloop(i, 1, eval($1-1), `i32 -1, ') i32 -1 > - %nottest_and_mask = and <$1 x i32> %nottest, %mask - %mm = call i32 @__movmsk(<$1 x i32> %nottest_and_mask) +define void @__do_assert_varying(i8 *%str, %test, + %mask) { + %nottest = xor %test, + < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 > + %nottest_and_mask = and %nottest, %mask + %mm = call i32 @__movmsk( %nottest_and_mask) %all_ok = icmp eq i32 %mm, 0 br i1 %all_ok, label %ok, label %fail @@ -2010,118 +2021,118 @@ define void @__memory_barrier() nounwind readnone alwaysinline { ret void } -global_atomic_associative($1, add, i32, int32, 0) -global_atomic_associative($1, sub, i32, int32, 0) -global_atomic_associative($1, and, i32, int32, -1) -global_atomic_associative($1, or, i32, int32, 0) -global_atomic_associative($1, xor, i32, int32, 0) -global_atomic_uniform($1, add, i32, int32) -global_atomic_uniform($1, sub, i32, int32) -global_atomic_uniform($1, and, i32, int32) -global_atomic_uniform($1, or, i32, int32) -global_atomic_uniform($1, xor, i32, int32) -global_atomic_uniform($1, min, i32, int32) -global_atomic_uniform($1, max, i32, int32) -global_atomic_uniform($1, umin, i32, uint32) -global_atomic_uniform($1, umax, i32, uint32) +global_atomic_associative(WIDTH, add, i32, int32, 0) +global_atomic_associative(WIDTH, sub, i32, int32, 0) +global_atomic_associative(WIDTH, and, i32, int32, -1) +global_atomic_associative(WIDTH, or, i32, int32, 0) +global_atomic_associative(WIDTH, xor, i32, int32, 0) +global_atomic_uniform(WIDTH, add, i32, int32) +global_atomic_uniform(WIDTH, sub, i32, int32) +global_atomic_uniform(WIDTH, and, i32, int32) +global_atomic_uniform(WIDTH, or, i32, int32) +global_atomic_uniform(WIDTH, xor, i32, int32) +global_atomic_uniform(WIDTH, min, i32, int32) +global_atomic_uniform(WIDTH, max, i32, int32) +global_atomic_uniform(WIDTH, umin, i32, uint32) +global_atomic_uniform(WIDTH, umax, i32, uint32) -global_atomic_associative($1, add, i64, int64, 0) -global_atomic_associative($1, sub, i64, int64, 0) -global_atomic_associative($1, and, i64, int64, -1) -global_atomic_associative($1, or, i64, int64, 0) -global_atomic_associative($1, xor, i64, int64, 0) -global_atomic_uniform($1, add, i64, int64) -global_atomic_uniform($1, sub, i64, int64) -global_atomic_uniform($1, and, i64, int64) -global_atomic_uniform($1, or, i64, int64) -global_atomic_uniform($1, xor, i64, int64) -global_atomic_uniform($1, min, i64, int64) -global_atomic_uniform($1, max, i64, int64) -global_atomic_uniform($1, umin, i64, uint64) -global_atomic_uniform($1, umax, i64, uint64) +global_atomic_associative(WIDTH, add, i64, int64, 0) +global_atomic_associative(WIDTH, sub, i64, int64, 0) +global_atomic_associative(WIDTH, and, i64, int64, -1) +global_atomic_associative(WIDTH, or, i64, int64, 0) +global_atomic_associative(WIDTH, xor, i64, int64, 0) +global_atomic_uniform(WIDTH, add, i64, int64) +global_atomic_uniform(WIDTH, sub, i64, int64) +global_atomic_uniform(WIDTH, and, i64, int64) +global_atomic_uniform(WIDTH, or, i64, int64) +global_atomic_uniform(WIDTH, xor, i64, int64) +global_atomic_uniform(WIDTH, min, i64, int64) +global_atomic_uniform(WIDTH, max, i64, int64) +global_atomic_uniform(WIDTH, 
umin, i64, uint64) +global_atomic_uniform(WIDTH, umax, i64, uint64) -global_swap($1, i32, int32) -global_swap($1, i64, int64) +global_swap(WIDTH, i32, int32) +global_swap(WIDTH, i64, int64) -define <$1 x float> @__atomic_swap_float_global(float * %ptr, <$1 x float> %val, - <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_swap_float_global(float * %ptr, %val, + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * - %ival = bitcast <$1 x float> %val to <$1 x i32> - %iret = call <$1 x i32> @__atomic_swap_int32_global(i32 * %iptr, <$1 x i32> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i32> %iret to <$1 x float> - ret <$1 x float> %ret + %ival = bitcast %val to + %iret = call @__atomic_swap_int32_global(i32 * %iptr, %ival, %mask) + %ret = bitcast %iret to + ret %ret } -define <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x double> %val, - <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_swap_double_global(double * %ptr, %val, + %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * - %ival = bitcast <$1 x double> %val to <$1 x i64> - %iret = call <$1 x i64> @__atomic_swap_int64_global(i64 * %iptr, <$1 x i64> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i64> %iret to <$1 x double> - ret <$1 x double> %ret + %ival = bitcast %val to + %iret = call @__atomic_swap_int64_global(i64 * %iptr, %ival, %mask) + %ret = bitcast %iret to + ret %ret } define float @__atomic_swap_uniform_float_global(float * %ptr, float %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * %ival = bitcast float %val to i32 - %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask) + %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, %mask) %ret = bitcast i32 %iret to float ret float %ret } define double @__atomic_swap_uniform_double_global(double * %ptr, double %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * %ival = bitcast double %val to i64 - %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask) + %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, %mask) %ret = bitcast i64 %iret to double ret double %ret } -global_atomic_exchange($1, i32, int32) -global_atomic_exchange($1, i64, int64) +global_atomic_exchange(WIDTH, i32, int32) +global_atomic_exchange(WIDTH, i64, int64) -define <$1 x float> @__atomic_compare_exchange_float_global(float * %ptr, - <$1 x float> %cmp, <$1 x float> %val, <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_compare_exchange_float_global(float * %ptr, + %cmp, %val, %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * - %icmp = bitcast <$1 x float> %cmp to <$1 x i32> - %ival = bitcast <$1 x float> %val to <$1 x i32> - %iret = call <$1 x i32> @__atomic_compare_exchange_int32_global(i32 * %iptr, <$1 x i32> %icmp, - <$1 x i32> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i32> %iret to <$1 x float> - ret <$1 x float> %ret + %icmp = bitcast %cmp to + %ival = bitcast %val to + %iret = call @__atomic_compare_exchange_int32_global(i32 * %iptr, %icmp, + %ival, %mask) + %ret = bitcast %iret to + ret %ret } -define <$1 x double> @__atomic_compare_exchange_double_global(double * %ptr, - <$1 x double> %cmp, <$1 x double> %val, <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_compare_exchange_double_global(double * %ptr, + %cmp, %val, 
%mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * - %icmp = bitcast <$1 x double> %cmp to <$1 x i64> - %ival = bitcast <$1 x double> %val to <$1 x i64> - %iret = call <$1 x i64> @__atomic_compare_exchange_int64_global(i64 * %iptr, <$1 x i64> %icmp, - <$1 x i64> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i64> %iret to <$1 x double> - ret <$1 x double> %ret + %icmp = bitcast %cmp to + %ival = bitcast %val to + %iret = call @__atomic_compare_exchange_int64_global(i64 * %iptr, %icmp, + %ival, %mask) + %ret = bitcast %iret to + ret %ret } define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * %icmp = bitcast float %cmp to i32 %ival = bitcast float %val to i32 %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp, - i32 %ival, <$1 x i32> %mask) + i32 %ival, %mask) %ret = bitcast i32 %iret to float ret float %ret } define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp, - double %val, <$1 x i32> %mask) nounwind alwaysinline { + double %val, %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * %icmp = bitcast double %cmp to i64 %ival = bitcast double %val to i64 %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, - i64 %ival, <$1 x i32> %mask) + i64 %ival, %mask) %ret = bitcast i64 %iret to double ret double %ret } @@ -2168,10 +2179,10 @@ define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline ;; vector width as a parameter define(`int64minmax', ` -i64minmax($1,min,int64,slt) -i64minmax($1,max,int64,sgt) -i64minmax($1,min,uint64,ult) -i64minmax($1,max,uint64,ugt) +i64minmax(WIDTH,min,int64,slt) +i64minmax(WIDTH,max,int64,sgt) +i64minmax(WIDTH,min,uint64,ult) +i64minmax(WIDTH,max,uint64,ugt) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2410,24 +2421,24 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` -define i32 @__packed_load_active(i32 * %startptr, <$1 x i32> * %val_ptr, - <$1 x i32> %full_mask) nounwind alwaysinline { +define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk(<$1 x i32> %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask) + %mask = call i32 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << $1) -1) + %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) br i1 %allon, label %all_on, label %unknown_mask all_on: ;; everyone wants to load, so just load an entire vector width in a single ;; vector load - %vecptr = bitcast i32 *%startptr to <$1 x i32> * - %vec_load = load <$1 x i32> *%vecptr, align 4 - store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4 - ret i32 $1 + %vecptr = bitcast i32 *%startptr to * + %vec_load = load *%vecptr, align 4 + store %vec_load, * %val_ptr, align 4 + ret i32 WIDTH unknown_mask: br label %loop @@ -2445,7 +2456,7 @@ loop: load: %loadptr = getelementptr i32 *%startptr, i32 %offset %loadval = load i32 *%loadptr - %val_ptr_i32 = bitcast <$1 x i32> * %val_ptr to i32 * + %val_ptr_i32 = bitcast * %val_ptr to i32 * %storeptr = getelementptr i32 *%val_ptr_i32, i32 %lane 
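The __packed_load_active builtin in this hunk reads consecutive values from %startptr into the lanes whose mask bit is set and returns how many values it consumed; when the mask is provably all-on it degenerates to a single full-width vector load that returns WIDTH. A scalar C++ sketch of that contract, assuming a 4-wide target; the name and width are illustrative only:

#include <cstdint>

// Reference-only model of __packed_load_active for WIDTH == 4.
static int packed_load_active_ref(const int32_t *startptr, int32_t vals[4],
                                  const bool mask[4]) {
    int offset = 0;
    for (int lane = 0; lane < 4; ++lane)
        if (mask[lane])
            vals[lane] = startptr[offset++];  // next unread element goes to this active lane
    return offset;                            // number of values consumed
}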
store i32 %loadval, i32 *%storeptr %offset1 = add i32 %offset, 1 @@ -2457,28 +2468,28 @@ loopend: %nextlanemask = mul i32 %lanemask, 2 ; are we done yet? - %test = icmp ne i32 %nextlane, $1 + %test = icmp ne i32 %nextlane, WIDTH br i1 %test, label %loop, label %done done: ret i32 %nextoffset } -define i32 @__packed_store_active(i32 * %startptr, <$1 x i32> %vals, - <$1 x i32> %full_mask) nounwind alwaysinline { +define i32 @__packed_store_active(i32 * %startptr, %vals, + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk(<$1 x i32> %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask) + %mask = call i32 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << $1) -1) + %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) br i1 %allon, label %all_on, label %unknown_mask all_on: - %vecptr = bitcast i32 *%startptr to <$1 x i32> * - store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4 - ret i32 $1 + %vecptr = bitcast i32 *%startptr to * + store %vals, * %vecptr, align 4 + ret i32 WIDTH unknown_mask: br label %loop @@ -2494,7 +2505,7 @@ loop: br i1 %do_store, label %store, label %loopend store: - %storeval = extractelement <$1 x i32> %vals, i32 %lane + %storeval = extractelement %vals, i32 %lane %storeptr = getelementptr i32 *%startptr, i32 %offset store i32 %storeval, i32 *%storeptr %offset1 = add i32 %offset, 1 @@ -2506,7 +2517,7 @@ loopend: %nextlanemask = mul i32 %lanemask, 2 ; are we done yet? - %test = icmp ne i32 %nextlane, $1 + %test = icmp ne i32 %nextlane, WIDTH br i1 %test, label %loop, label %done done: @@ -2613,7 +2624,7 @@ reduce_equal_aux($1, double, double, i64, fcmp, 64) define(`exclusive_scan', ` define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { ; first, set the value of any off lanes to the identity value %ptr = alloca <$1 x $2> %idvec1 = bitcast $2 $5 to <1 x $2> @@ -2623,7 +2634,7 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, %ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> * %vi = bitcast <$1 x $2> %v to <$1 x i`'$3> call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi, - <$1 x i32> %mask) + <$1 x MASK> %mask) %v_id = load <$1 x $2> * %ptr ; extract elements of the vector to use in computing the scan @@ -2649,16 +2660,16 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, ') define(`scans', ` -exclusive_scan($1, i32, 32, add, 0, add_i32) -exclusive_scan($1, float, 32, fadd, zeroinitializer, add_float) -exclusive_scan($1, i64, 64, add, 0, add_i64) -exclusive_scan($1, double, 64, fadd, zeroinitializer, add_double) +exclusive_scan(WIDTH, i32, 32, add, 0, add_i32) +exclusive_scan(WIDTH, float, 32, fadd, zeroinitializer, add_float) +exclusive_scan(WIDTH, i64, 64, add, 0, add_i64) +exclusive_scan(WIDTH, double, 64, fadd, zeroinitializer, add_double) -exclusive_scan($1, i32, 32, and, -1, and_i32) -exclusive_scan($1, i64, 64, and, -1, and_i64) +exclusive_scan(WIDTH, i32, 32, and, -1, and_i32) +exclusive_scan(WIDTH, i64, 64, and, -1, and_i64) -exclusive_scan($1, i32, 32, or, 0, or_i32) -exclusive_scan($1, i64, 64, or, 0, or_i64) +exclusive_scan(WIDTH, i32, 32, or, 0, or_i32) +exclusive_scan(WIDTH, i64, 64, or, 0, or_i64) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/ctx.cpp b/ctx.cpp index 043f7acc..694a3b1d 
100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -875,8 +875,11 @@ FunctionEmitContext::LaneMask(llvm::Value *v) { // into an i32 value std::vector mm; m->symbolTable->LookupFunction("__movmsk", &mm); - // There should be one with signed int signature, one unsigned int. - Assert(mm.size() == 2); + if (g->target.maskBitCount == 1) + Assert(mm.size() == 1); + else + // There should be one with signed int signature, one unsigned int. + Assert(mm.size() == 2); // We can actually call either one, since both are i32s as far as // LLVM's type system is concerned... llvm::Function *fmm = mm[0]->function; @@ -929,6 +932,9 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { return NULL; } + if (g->target.maskBitCount == 1) + return b; + LLVM_TYPE_CONST llvm::ArrayType *at = llvm::dyn_cast(b->getType()); if (at) { diff --git a/func.cpp b/func.cpp index 61dfb784..4c8d2222 100644 --- a/func.cpp +++ b/func.cpp @@ -288,7 +288,10 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, bool checkMask = (type->isTask == true) || ((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) && costEstimate > CHECK_MASK_AT_FUNCTION_START_COST); - if (checkMask && g->opt.disableCoherentControlFlow == false) { + checkMask &= (g->target.maskingIsFree == false); + checkMask &= (g->opt.disableCoherentControlFlow == false); + + if (checkMask) { llvm::Value *mask = ctx->GetFunctionMask(); llvm::Value *allOn = ctx->All(mask); llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on"); diff --git a/ispc.cpp b/ispc.cpp index 8bfc9a9d..8cc618c3 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -129,24 +129,60 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->nativeVectorWidth = 4; t->vectorWidth = 4; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse2-x2")) { t->isa = Target::SSE2; t->nativeVectorWidth = 4; t->vectorWidth = 8; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse4")) { t->isa = Target::SSE4; t->nativeVectorWidth = 4; t->vectorWidth = 4; t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { t->isa = Target::SSE4; t->nativeVectorWidth = 4; t->vectorWidth = 8; t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; + } + else if (!strcasecmp(isa, "generic-4")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 4; + t->vectorWidth = 4; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; + } + else if (!strcasecmp(isa, "generic-8")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 8; + t->vectorWidth = 8; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; + } + else if (!strcasecmp(isa, "generic-16")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 16; + t->vectorWidth = 16; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; } #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) else if (!strcasecmp(isa, "avx")) { @@ -154,12 +190,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, 
t->nativeVectorWidth = 8; t->vectorWidth = 8; t->attributes = "+avx,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "avx-x2")) { t->isa = Target::AVX; t->nativeVectorWidth = 8; t->vectorWidth = 16; t->attributes = "+avx,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } #endif // LLVM 3.0+ #if defined(LLVM_3_1svn) @@ -168,12 +210,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->nativeVectorWidth = 8; t->vectorWidth = 8; t->attributes = "+avx2,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "avx2-x2")) { t->isa = Target::AVX2; t->nativeVectorWidth = 16; t->vectorWidth = 16; t->attributes = "+avx2,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } #endif // LLVM 3.1 else { @@ -221,7 +269,7 @@ Target::SupportedTargetISAs() { #ifdef LLVM_3_1svn ", avx2, avx2-x2" #endif // LLVM_3_1svn - ; + ", generic-4, generic-8, generic-16"; } @@ -300,6 +348,8 @@ Target::GetISAString() const { return "avx"; case Target::AVX2: return "avx2"; + case Target::GENERIC: + return "generic"; default: FATAL("Unhandled target in GetISAString()"); } diff --git a/ispc.h b/ispc.h index 6eb2cdd9..254c8311 100644 --- a/ispc.h +++ b/ispc.h @@ -193,7 +193,7 @@ struct Target { flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { SSE2, SSE4, AVX, AVX2, NUM_ISAS }; + enum ISA { SSE2, SSE4, AVX, AVX2, GENERIC, NUM_ISAS }; /** Instruction set being compiled to. */ ISA isa; @@ -222,6 +222,23 @@ struct Target { /** Indicates whether position independent code should be generated. */ bool generatePIC; + + /** Is there overhead associated with masking on the target + architecture; e.g. there is on SSE, due to extra blends and the + like, but there isn't with an ISA that supports masking + natively. */ + bool maskingIsFree; + + /** Is it safe to run code with the mask all if: e.g. on SSE, the fast + gather trick assumes that at least one program instance is running + (so that it can safely assume that the array base pointer is + valid). */ + bool allOffMaskIsSafe; + + /** How many bits are used to store each element of the mask: e.g. this + is 32 on SSE/AVX, since that matches the HW better, but it's 1 for + the generic target. 
*/ + int maskBitCount; }; diff --git a/ispc.vcxproj b/ispc.vcxproj index fb56b96c..96a6855d 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -22,11 +22,15 @@ + + + - + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -40,15 +44,15 @@ 4146;4800;4996;4355;4624;4005;4065 4146;4800;4996;4355;4624;4005;4065 - - %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp; -%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp - clang builtins-c.c - %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp; -%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp - clang builtins-c.c - gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp - gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp + + %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 > gen-bitcode-c-32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 > gen-bitcode-c-64.cpp + Building builtins.c + %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 > gen-bitcode-c-32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 > gen-bitcode-c-64.cpp + Building builtins.c + gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp + gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp @@ -75,105 +79,148 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp - gen-stdlib.cpp - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp - gen-stdlib.cpp - Building gen-stdlib.cpp - Building gen-stdlib.cpp + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > gen-stdlib-x86.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > gen-stdlib-generic.cpp; + + gen-stdlib-generic.cpp;gen-stdlib-x86.cpp + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > gen-stdlib-x86.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > gen-stdlib-generic.cpp; + + gen-stdlib-generic.cpp;gen-stdlib-x86.cpp + Building gen-stdlib-{generic,x86}.cpp + Building gen-stdlib-{generic,x86}.cpp - + Document - m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp - gen-bitcode-sse4.cpp - builtins.m4;builtins-sse4-common.ll - m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp - gen-bitcode-sse4.cpp - builtins.m4;builtins-sse4-common.ll - Building gen-bitcode-sse4.cpp - Building gen-bitcode-sse4.cpp - - - - - Document - m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp + m4 -Ibuiltins/ 
-DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll > gen-bitcode-dispatch.cpp gen-bitcode-dispatch.cpp - builtins.m4 - m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp + builtins\util.m4 + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll > gen-bitcode-dispatch.cpp gen-bitcode-dispatch.cpp - builtins.m4 + builtins\util.m4 Building gen-bitcode-dispatch.cpp Building gen-bitcode-dispatch.cpp - + Document - m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + builtins\util.m4;builtins\target-sse4-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4.cpp + Building gen-bitcode-sse4.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse4-common.ll - m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp + builtins\util.m4;builtins\target-sse4-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse4-common.ll + builtins\util.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2.cpp Building gen-bitcode-sse4-x2.cpp - + Document - m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse2-common.ll - m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp + builtins\util.m4;builtins\target-sse2-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse2-common.ll + builtins\util.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2.cpp Building gen-bitcode-sse2.cpp - + Document - m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll > gen-bitcode-sse2-x2.cpp gen-bitcode-sse2-x2.cpp - builtins.m4;builtins-sse2-common.ll - m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + builtins\util.m4;builtins\target-sse2-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll > gen-bitcode-sse2-x2.cpp gen-bitcode-sse2-x2.cpp - builtins.m4;builtins-sse2-common.ll + builtins\util.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2.cpp Building gen-bitcode-sse2-x2.cpp - + Document - m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > 
gen-bitcode-avx.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-avx-common.ll - m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp + builtins\util.m4;builtins\target-avx-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-avx-common.ll + builtins\util.m4;builtins\target-avx-common.ll Building gen-bitcode-avx.cpp Building gen-bitcode-avx.cpp - + Document - m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp gen-bitcode-avx-x2.cpp - builtins.m4;builtins-sse.ll - m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp + builtins\util.m4;builtins\target-avx-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp gen-bitcode-avx-x2.cpp - builtins.m4;builtins-sse.ll + builtins\util.m4;builtins\target-avx-common.ll Building gen-bitcode-avx-x2.cpp Building gen-bitcode-avx-x2.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll > gen-bitcode-generic-4.cpp + gen-bitcode-generic-4.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll > gen-bitcode-generic-4.cpp + gen-bitcode-generic-4.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-4.cpp + Building gen-bitcode-generic-4.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll > gen-bitcode-generic-8.cpp + gen-bitcode-generic-8.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll > gen-bitcode-generic-8.cpp + gen-bitcode-generic-8.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-8.cpp + Building gen-bitcode-generic-8.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll > gen-bitcode-generic-16.cpp + gen-bitcode-generic-16.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll > gen-bitcode-generic-16.cpp + gen-bitcode-generic-16.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-16.cpp + Building gen-bitcode-generic-16.cpp + + Document diff --git a/llvmutil.cpp b/llvmutil.cpp index 6c440a91..4a50e337 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -105,11 +105,14 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - // Note that both the mask and 
bool vectors are vector of int32s - // (not i1s). LLVM ends up generating much better SSE code with - // this representation. - LLVMTypes::MaskType = LLVMTypes::BoolVectorType = - llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + if (target.maskBitCount == 1) + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); + else { + assert(target.maskBitCount == 32); + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + } LLVMTypes::Int1VectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); @@ -141,7 +144,11 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, + if (target.maskBitCount == 1) + onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, + false /*unsigned*/); // 0x1 + else + onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff for (int i = 0; i < target.vectorWidth; ++i) @@ -150,8 +157,12 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, - true /*signed*/); + if (target.maskBitCount == 1) + offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, + true /*signed*/); + else + offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, + true /*signed*/); for (int i = 0; i < target.vectorWidth; ++i) maskZeros.push_back(offMask); diff --git a/module.cpp b/module.cpp index 9fade4b9..5dc9b160 100644 --- a/module.cpp +++ b/module.cpp @@ -1158,22 +1158,14 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre opts.addMacroDef("PI=3.1415926535"); // Add #define for current compilation target - switch (g->target.isa) { - case Target::SSE2: - opts.addMacroDef("ISPC_TARGET_SSE2"); - break; - case Target::SSE4: - opts.addMacroDef("ISPC_TARGET_SSE4"); - break; - case Target::AVX: - opts.addMacroDef("ISPC_TARGET_AVX"); - break; - case Target::AVX2: - opts.addMacroDef("ISPC_TARGET_AVX2"); - break; - default: - FATAL("Unhandled target ISA in preprocessor symbol definition"); + char targetMacro[128]; + sprintf(targetMacro, "ISPC_TARGET_%s", g->target.GetISAString()); + char *p = targetMacro; + while (*p) { + *p = toupper(*p); + ++p; } + opts.addMacroDef(targetMacro); if (g->target.is32Bit) opts.addMacroDef("ISPC_POINTER_SIZE=32"); diff --git a/opt.cpp b/opt.cpp index c77a76f7..17458a06 100644 --- a/opt.cpp +++ b/opt.cpp @@ -2444,7 +2444,7 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { int count = sizeof(names) / sizeof(names[0]); for (int i = 0; i < count; ++i) { llvm::Function *f = m->module->getFunction(names[i]); - if (f != NULL) { + if (f != NULL && f->empty() == false) { f->setLinkage(llvm::GlobalValue::InternalLinkage); modifiedAny = true; } diff --git a/parse.yy b/parse.yy index 8510244a..70cb2b3f 100644 --- a/parse.yy +++ b/parse.yy @@ -1605,7 +1605,8 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = AtomicType::VaryingConstUInt32; + const Type *t = g->target.isa == Target::GENERIC ? 
+ AtomicType::VaryingConstBool : AtomicType::VaryingConstUInt32; Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); } diff --git a/stdlib.ispc b/stdlib.ispc index 1a804733..c3b02fa7 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,6 +38,14 @@ ispc code */ +#ifdef ISPC_TARGET_GENERIC +#define IntMaskType bool +#define UIntMaskType bool +#else +#define IntMaskType int32 +#define UIntMaskType unsigned int32 +#endif + /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -274,13 +282,21 @@ static inline int32 sign_extend(bool v) { static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. +#ifdef ISPC_TARGET_GENERIC + return __movmsk(v & __mask) != 0; +#else return __movmsk(__sext_varying_bool(v) & __mask) != 0; +#endif } static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes +#ifdef ISPC_TARGET_GENERIC + bool match = ((v & __mask) == __mask); +#else int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask); +#endif return __movmsk(match) == (1 << programCount) - 1; } @@ -308,7 +324,11 @@ static inline int popcnt(int64 v) { static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes +#ifdef ISPC_TARGET_GENERIC + return __popcnt_int32(__movmsk(v & __mask)); +#else return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask)); +#endif } static inline uniform int lanemask() { @@ -672,19 +692,19 @@ static inline uniform bool reduce_equal(TYPE v, uniform TYPE * uniform value) { return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \ } -REDUCE_EQUAL(int32, int32, int32) -REDUCE_EQUAL(unsigned int32, int32, unsigned int32) -REDUCE_EQUAL(float, float, int32) -REDUCE_EQUAL(int64, int64, int32) -REDUCE_EQUAL(unsigned int64, int64, unsigned int32) -REDUCE_EQUAL(double, double, int32) +REDUCE_EQUAL(int32, int32, IntMaskType) +REDUCE_EQUAL(unsigned int32, int32, UIntMaskType) +REDUCE_EQUAL(float, float, IntMaskType) +REDUCE_EQUAL(int64, int64, IntMaskType) +REDUCE_EQUAL(unsigned int64, int64, UIntMaskType) +REDUCE_EQUAL(double, double, IntMaskType) static int32 exclusive_scan_add(int32 v) { - return __exclusive_scan_add_i32(v, (int32)__mask); + return __exclusive_scan_add_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_add(unsigned int32 v) { - return __exclusive_scan_add_i32(v, __mask); + return __exclusive_scan_add_i32((int32)v, (IntMaskType)__mask); } static float exclusive_scan_add(float v) { @@ -692,11 +712,11 @@ static float exclusive_scan_add(float v) { } static int64 exclusive_scan_add(int64 v) { - return __exclusive_scan_add_i64(v, (int32)__mask); + return __exclusive_scan_add_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_add(unsigned int64 v) { - return __exclusive_scan_add_i64(v, __mask); + return __exclusive_scan_add_i64(v, (UIntMaskType)__mask); } static double exclusive_scan_add(double v) { @@ -704,35 +724,35 @@ static double exclusive_scan_add(double v) { } static int32 exclusive_scan_and(int32 v) { - return __exclusive_scan_and_i32(v, (int32)__mask); + return __exclusive_scan_and_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_and(unsigned int32 v) { - return __exclusive_scan_and_i32(v, __mask); + return __exclusive_scan_and_i32(v, 
(UIntMaskType)__mask); } static int64 exclusive_scan_and(int64 v) { - return __exclusive_scan_and_i64(v, (int32)__mask); + return __exclusive_scan_and_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_and(unsigned int64 v) { - return __exclusive_scan_and_i64(v, __mask); + return __exclusive_scan_and_i64(v, (UIntMaskType)__mask); } static int32 exclusive_scan_or(int32 v) { - return __exclusive_scan_or_i32(v, (int32)__mask); + return __exclusive_scan_or_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_or(unsigned int32 v) { - return __exclusive_scan_or_i32(v, __mask); + return __exclusive_scan_or_i32(v, (UIntMaskType)__mask); } static int64 exclusive_scan_or(int64 v) { - return __exclusive_scan_or_i64(v, (int32)__mask); + return __exclusive_scan_or_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_or(unsigned int64 v) { - return __exclusive_scan_or_i64(v, __mask); + return __exclusive_scan_or_i64(v, (UIntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -741,23 +761,23 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) { static inline uniform int packed_load_active(uniform unsigned int * uniform a, unsigned int * uniform vals) { - return __packed_load_active(a, vals, (unsigned int32)__mask); + return __packed_load_active(a, vals, (UIntMaskType)__mask); } static inline uniform int packed_store_active(uniform unsigned int * uniform a, unsigned int vals) { - return __packed_store_active(a, vals, (unsigned int32)__mask); + return __packed_store_active(a, vals, (UIntMaskType)__mask); } static inline uniform int packed_load_active(uniform int * uniform a, int * uniform vals) { - return __packed_load_active(a, vals, (int32)__mask); + return __packed_load_active(a, vals, (IntMaskType)__mask); } static inline uniform int packed_store_active(uniform int * uniform a, int vals) { - return __packed_store_active(a, vals, (int32)__mask); + return __packed_store_active(a, vals, (IntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -848,49 +868,49 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ return ret; \ } -DEFINE_ATOMIC_OP(int32,int32,add,add,int32) -DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32) -DEFINE_ATOMIC_OP(int32,int32,and,and,int32) -DEFINE_ATOMIC_OP(int32,int32,or,or,int32) -DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32) -DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32) +DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
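The comment above holds because two's-complement add, subtract, and, or, xor, swap, and compare-exchange produce identical bit patterns regardless of signedness; only the min/max comparisons differ, which is why the separate umin/umax builtins exist. A small C++ illustration of that distinction:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
    int32_t  sa = -1,          sb = 1;    // same bit patterns as ua, ub below
    uint32_t ua = 0xFFFFFFFFu, ub = 1u;
    assert(uint32_t(sa + sb) == ua + ub); // add matches bit-for-bit (both 0)
    assert(std::min(sa, sb) == -1);       // signed min picks the 0xFFFFFFFF lane value
    assert(std::min(ua, ub) == 1u);       // unsigned min picks 0x00000001 instead
    return 0;
}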
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32) +DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType) -DEFINE_ATOMIC_OP(float,float,swap,swap,int32) +DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,add,add,int32) -DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32) -DEFINE_ATOMIC_OP(int64,int64,and,and,int32) -DEFINE_ATOMIC_OP(int64,int64,or,or,int32) -DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32) -DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32) +DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32) +DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType) -DEFINE_ATOMIC_OP(double,double,swap,swap,int32) +DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType) #undef DEFINE_ATOMIC_OP @@ -913,12 +933,12 @@ static inline uniform TA atomic_compare_exchange_global( \ return ret; \ } -ATOMIC_DECL_CMPXCHG(int32, int32, int32) -ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32) -ATOMIC_DECL_CMPXCHG(float, float, int32) -ATOMIC_DECL_CMPXCHG(int64, int64, int32) -ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32) -ATOMIC_DECL_CMPXCHG(double, double, int32) +ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) +ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) +ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) +ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) #undef ATOMIC_DECL_CMPXCHG diff --git a/stdlib2cpp.py b/stdlib2cpp.py index 132f8257..6fa5fc2e 100755 --- a/stdlib2cpp.py +++ b/stdlib2cpp.py @@ -2,7 +2,9 @@ import sys -print "char stdlib_code[] = { " +t=str(sys.argv[1]) + +print "char stdlib_" + t + "_code[] = { " for line in sys.stdin: for c in line: diff --git a/stmt.cpp b/stmt.cpp index e799fc0b..95142abe 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -622,9 +622,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, /** Given an AST node, check to see if it's safe if we happen to run the code for that node with the execution mask all off. - - FIXME: this is actually a target-specific thing; for non SSE/AVX - targets with more complete masking support, some of this won't apply... */ static bool lCheckAllOffSafety(ASTNode *node, void *data) { @@ -648,6 +645,11 @@ lCheckAllOffSafety(ASTNode *node, void *data) { return false; } + if (g->target.allOffMaskIsSafe == true) + // Don't worry about memory accesses if we have a target that can + // safely run them with the mask all off + return true; + IndexExpr *ie; if ((ie = dynamic_cast(node)) != NULL && ie->baseExpr != NULL) { const Type *type = ie->baseExpr->GetType();
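On the allOffMaskIsSafe early return added to lCheckAllOffSafety above: the hazard it sidesteps on the generic targets is that SSE/AVX-style code emulates a masked memory access with unconditional loads followed by a blend, so running a statement with the mask all off can still dereference a pointer that the program only guarantees for active instances; a target with real per-lane masking never issues the inactive accesses. A hedged C++ sketch of the emulated form; the function and parameter names are illustrative only:

// What a blend-emulated masked load boils down to on a target without native
// masking: the load happens whether or not the lane is active, and the result
// is only discarded afterwards. With an all-off mask this can still fault.
float emulated_masked_load(const float *base, int index, bool lane_active,
                           float old_value) {
    float loaded = base[index];              // executes even if !lane_active
    return lane_active ? loaded : old_value; // blend selects after the fact
}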