From 67b549a9373de16f5ff3fd65346b63f64e34a829 Mon Sep 17 00:00:00 2001
From: egaburov
Date: Sun, 28 Jul 2013 14:31:43 +0200
Subject: [PATCH] Add nvptx64 target. Things to do:
 1. write builtins/target-nvptx64.ll -- for now it is just a copy of
    target-generic-1.ll
 2. add __global__ & __device__ scope
 3. make the code work for a single CUDA thread
 4. use tasks to map onto the block grid, with programIndex as laneIdx
    and programCount as warpSize
 5. ... and more ...

---
 Makefile                   |  42 +-
 builtins.cpp               |  14 +-
 builtins/target-nvptx64.ll | 958 +++++++++++++++++++++++++++++++++++++
 ispc.cpp                   |  37 +-
 ispc.h                     |   2 +-
 main.cpp                   |   9 +
 6 files changed, 1035 insertions(+), 27 deletions(-)
 create mode 100644 builtins/target-nvptx64.ll

diff --git a/Makefile b/Makefile
index 835f8e15..fab66b58 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,7 @@
 # If you have your own special version of llvm and/or clang, change
 # these variables to match.
-LLVM_CONFIG=$(shell which llvm-config)
+LLVM_CONFIG=$(shell which /usr/local/llvm-3.3/bin/llvm-config)
 CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
 
 # Add llvm bin to the path so any scripts run will go to the right llvm-config
@@ -55,7 +55,7 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
 LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
 LLVM_VERSION_DEF=-D$(LLVM_VERSION)
 
-LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker arm
+LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker arm nvptx
 # Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step.
 # We check if it's available before adding it (to not break 3.2 and earlier).
 ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
@@ -122,7 +122,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
+TARGETS=nvptx64 neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
 	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 # These files need to be compiled in two versions - 32 and 64 bits.
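 # (With nvptx64 added to TARGETS above, the BUILTINS_SRC_TARGET line below
 # picks up builtins/target-nvptx64.ll, and the objs/builtins-%-32bit.cpp /
 # objs/builtins-%-64bit.cpp rules further down generate both runtime
 # variants from it -- even though only the 64-bit one is hooked up in
 # builtins.cpp for now.)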
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) @@ -147,13 +147,13 @@ default: ispc depend: llvm_check $(CXX_SRC) $(HEADERS) @echo Updating dependencies - @$(CXX) -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend + $(CXX) -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend -include depend dirs: @echo Creating objs/ directory - @/bin/mkdir -p objs + /bin/mkdir -p objs llvm_check: @llvm-config --version > /dev/null || \ @@ -176,7 +176,7 @@ doxygen: ispc: print_llvm_src dirs $(OBJS) @echo Creating ispc executable - @$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS) + $(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS) # Use clang as a default compiler, instead of gcc clang: ispc @@ -193,62 +193,62 @@ debug: OPT=-O0 -g objs/%.o: %.cpp @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< + $(CXX) $(CXXFLAGS) -o $@ -c $< objs/cbackend.o: cbackend.cpp @echo Compiling $< - @$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $< + $(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $< objs/opt.o: opt.cpp @echo Compiling $< - @$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $< + $(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $< objs/%.o: objs/%.cpp @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< + $(CXX) $(CXXFLAGS) -o $@ -c $< objs/parse.cc: parse.yy @echo Running bison on $< - @$(YACC) -o $@ $< + $(YACC) -o $@ $< objs/parse.o: objs/parse.cc $(HEADERS) @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< + $(CXX) $(CXXFLAGS) -o $@ -c $< objs/lex.cpp: lex.ll @echo Running flex on $< - @$(LEX) -o $@ $< + $(LEX) -o $@ $< objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< + $(CXX) $(CXXFLAGS) -o $@ -c $< objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< - @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ + m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) - @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ + m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) - @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ + m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ objs/builtins-c-32.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< - @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 32 > $@ + $(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 32 > $@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< - @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ + $(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ objs/stdlib_generic_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for generic - @$(CLANG) -E -x c 
-DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
 	python stdlib2cpp.py generic > $@
 
 objs/stdlib_x86_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for x86
-	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
 	python stdlib2cpp.py x86 > $@
 
diff --git a/builtins.cpp b/builtins.cpp
index 3e03de10..4b91ba30 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -656,7 +656,8 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
         // the values for an ARM target. This maybe won't cause problems
         // in the generated code, since bulitins.c doesn't do anything too
         // complex w.r.t. struct layouts, etc.
-        if (g->target->getISA() != Target::NEON)
+        if (g->target->getISA() != Target::NEON &&
+            g->target->getISA() != Target::NVPTX64)
 #endif // !__arm__
         {
             Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
@@ -819,6 +820,17 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     // Next, add the target's custom implementations of the various needed
     // builtin functions (e.g. __masked_store_32(), etc).
     switch (g->target->getISA()) {
+    case Target::NVPTX64:
+    {
+        if (runtime32) {
+            fprintf(stderr, " please add 32-bit builtins .. \n");
+            assert(0);
+        }
+        else {
+            EXPORT_MODULE(builtins_bitcode_nvptx64_64bit);
+        }
+        break;
+    };
     case Target::NEON: {
         if (runtime32) {
             EXPORT_MODULE(builtins_bitcode_neon_32bit);
diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll
new file mode 100644
index 00000000..c44c67f1
--- /dev/null
+++ b/builtins/target-nvptx64.ll
@@ -0,0 +1,958 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Define the standard library builtins for the nvptx64 target (for now,
+;; a 1-wide target copied from target-generic-1.ll)
+define(`MASK',`i32')
+define(`WIDTH',`1')
+
+include(`util.m4')
+
+; Define some basics for a 1-wide target
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+aossoa()
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+
+masked_load(i8,  1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; define these with the macros from stdlib.m4
+
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
+
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+
+define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8>,
+                              <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %mv = trunc <1 x i32> %mask to <1 x i8>
+;  %notmask = xor <1 x i8> %mv, <i8 -1>
+;  %cleared_old = and <1 x i8> %0, %notmask
+;  %masked_new = and <1 x i8> %1, %mv
+;  %new = or <1 x i8> %cleared_old, %masked_new
+;  ret <1 x i8> %new
+
+  ; not doing this the easy way because of problems with LLVM's scalarizer
+;  %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;  %sel = select <1 x i1> %cmp, <1 x i8> %0, <1 x i8> %1
+  ; scalarize: test the single mask lane, select the scalar value, and
+  ; rebuild a 1-wide vector
+  %m = extractelement <1 x i32> %mask, i32 0
+  %cmp = icmp eq i32 %m, 0
+  %d0 = extractelement <1 x i8> %0, i32 0
+  %d1 = extractelement <1 x i8> %1, i32 0
+ 
%sel = select i1 %cmp, i8 %d0, i8 %d1 + %r = insertelement <1 x i8> undef, i8 %sel, i32 0 + ret <1 x i8> %r +} + +define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> , + <1 x i32> %mask) nounwind readnone alwaysinline { +; %mv = trunc <1 x i32> %mask to <1 x i16> +; %notmask = xor <1 x i16> %mv, +; %cleared_old = and <1 x i16> %0, %notmask +; %masked_new = and <1 x i16> %1, %mv +; %new = or <1 x i16> %cleared_old, %masked_new +; ret <1 x i16> %new +; %cmp = icmp eq <1 x i32> %mask, +; %sel = select <1 x i1> %cmp, <1 x i16> %0, <1 x i16> %1 + %m = extractelement <1 x i32> %mask, i32 0 + %cmp = icmp eq i32 %m, 0 + %d0 = extractelement <1 x i16> %0, i32 0 + %d1 = extractelement <1 x i16> %1, i32 0 + %sel = select i1 %cmp, i16 %d0, i16 %d1 + %r = insertelement <1 x i16> undef, i16 %sel, i32 0 + ret <1 x i16> %r + +; ret <1 x i16> %sel +} + + +define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> , + <1 x i32> %mask) nounwind readnone alwaysinline { +; %notmask = xor <1 x i32> %mask, +; %cleared_old = and <1 x i32> %0, %notmask +; %masked_new = and <1 x i32> %1, %mask +; %new = or <1 x i32> %cleared_old, %masked_new +; ret <1 x i32> %new +; %cmp = icmp eq <1 x i32> %mask, +; %sel = select <1 x i1> %cmp, <1 x i32> %0, <1 x i32> %1 +; ret <1 x i32> %sel + %m = extractelement <1 x i32> %mask, i32 0 + %cmp = icmp eq i32 %m, 0 + %d0 = extractelement <1 x i32> %0, i32 0 + %d1 = extractelement <1 x i32> %1, i32 0 + %sel = select i1 %cmp, i32 %d0, i32 %d1 + %r = insertelement <1 x i32> undef, i32 %sel, i32 0 + ret <1 x i32> %r + +} + +define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> , + <1 x i32> %mask) nounwind readnone alwaysinline { +; %newmask = zext <1 x i32> %mask to <1 x i64> +; %notmask = xor <1 x i64> %newmask, +; %cleared_old = and <1 x i64> %0, %notmask +; %masked_new = and <1 x i64> %1, %newmask +; %new = or <1 x i64> %cleared_old, %masked_new +; ret <1 x i64> %new +; %cmp = icmp eq <1 x i32> %mask, +; %sel = select <1 x i1> %cmp, <1 x i64> %0, <1 x i64> %1 +; ret <1 x i64> %sel + %m = extractelement <1 x i32> %mask, i32 0 + %cmp = icmp eq i32 %m, 0 + %d0 = extractelement <1 x i64> %0, i32 0 + %d1 = extractelement <1 x i64> %1, i32 0 + %sel = select i1 %cmp, i64 %d0, i64 %d1 + %r = insertelement <1 x i64> undef, i64 %sel, i32 0 + ret <1 x i64> %r + +} + +define <1 x float> @__vselect_float(<1 x float>, <1 x float>, + <1 x i32> %mask) nounwind readnone alwaysinline { +; %v0 = bitcast <1 x float> %0 to <1 x i32> +; %v1 = bitcast <1 x float> %1 to <1 x i32> +; %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask) +; %rf = bitcast <1 x i32> %r to <1 x float> +; ret <1 x float> %rf +; %cmp = icmp eq <1 x i32> %mask, +; %sel = select <1 x i1> %cmp, <1 x float> %0, <1 x float> %1 +; ret <1 x float> %sel + %m = extractelement <1 x i32> %mask, i32 0 + %cmp = icmp eq i32 %m, 0 + %d0 = extractelement <1 x float> %0, i32 0 + %d1 = extractelement <1 x float> %1, i32 0 + %sel = select i1 %cmp, float %d0, float %d1 + %r = insertelement <1 x float> undef, float %sel, i32 0 + ret <1 x float> %r + +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>, + <1 x i32> %mask) nounwind alwaysinline { + %val = load <1 x i8> * %0, align 4 + %newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask) + store <1 x i8> %newval, <1 x i8> * %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>, + <1 x i32> 
%mask) nounwind alwaysinline {
+  %val = load <1 x i16> * %0, align 4
+  %newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
+  store <1 x i16> %newval, <1 x i16> * %0, align 4
+  ret void
+}
+
+define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i32> * %0, align 4
+  %newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
+  store <1 x i32> %newval, <1 x i32> * %0, align 4
+  ret void
+}
+
+define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i64> * %0, align 4
+  %newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
+  store <1 x i64> %newval, <1 x i64> * %0, align 4
+  ret void
+}
+
+masked_store_float_double()
+
+define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
+}
+
+define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %cmp = icmp eq i32 %v, 1
+  ret i1 %cmp
+}
+
+define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding
+;;
+;; There are no dedicated rounding instructions to rely on here, so we
+;; emulate the functionality with multiple instructions...
+
+; The code for __round_* is the result of compiling the following source
+; code.
+;
+; export float Round(float x) {
+;    unsigned int sign = signbits(x);
+;    unsigned int ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    x += 0x1.0p23f;
+;    x -= 0x1.0p23f;
+;    ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    return x;
+;}
+
+define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  %float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
+  %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
+  %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
+  %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
+  %binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
+  %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
+  %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
+  ret <1 x float> %int_to_float_bitcast.i.i.i
+}
+
+;; Similarly, for implementations of the __floor* functions below, we have the
+;; bitcode from compiling the following source code...
+
+;export float Floor(float x) {
+;    float y = Round(x);
+;    unsigned int cmp = y > x ? 0xffffffff : 0;
+;    float delta = -1.f;
+;    unsigned int idelta = intbits(delta);
+;    idelta &= cmp;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
+  %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
+  %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
+  %bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
+  ret <1 x float> %binop.i
+}
+
+;; And here is the code we compiled to get the __ceil* functions below
+;
+;export uniform float Ceil(uniform float x) {
+;    uniform float y = Round(x);
+;    uniform int yltx = y < x ? 0xffffffff : 0;
+;    uniform float delta = 1.f;
+;    uniform int idelta = intbits(delta);
+;    idelta &= yltx;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
+  %bincmp.i = fcmp olt <1 x float> %calltmp.i, %0
+  %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
+  %bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 1065353216>
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
+  ret <1 x float> %binop.i
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+; expecting math lib to provide this
+declare double @ceil (double) nounwind readnone
+declare double @floor (double) nounwind readnone
+declare double @round (double) nounwind readnone
+;declare float @llvm.sqrt.f32(float %Val)
+declare double @llvm.sqrt.f64(double %Val)
+declare float @llvm.sin.f32(float %Val)
+declare float @llvm.cos.f32(float %Val)
+declare float @llvm.sqrt.f32(float %Val)
+declare float @llvm.exp.f32(float %Val)
+declare float @llvm.log.f32(float %Val)
+declare float @llvm.pow.f32(float %f, float %e)
+
+
+
+
+;; stuff that could be in builtins ...
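+
+;; The m4 macro defined below scalarizes a unary operation on a 1-wide
+;; vector: extract the lane, call the scalar routine, reinsert the result.
+;; As a sketch (hand-expanded here, not generated output),
+;; unary1to1(double, @round) produces a body equivalent to:
+;;
+;;   %v_0 = extractelement <1 x double> %0, i32 0
+;;   %r_0 = call double @round(double %v_0)
+;;   %ret_0 = insertelement <1 x double> undef, double %r_0, i32 0
+;;   ret <1 x double> %ret_0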
+ +define(`unary1to1', ` + %v_0 = extractelement <1 x $1> %0, i32 0 + %r_0 = call $1 $2($1 %v_0) + %ret_0 = insertelement <1 x $1> undef, $1 %r_0, i32 0 + ret <1 x $1> %ret_0 +') + + + +;; dummy 1 wide vector ops +define void +@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, + <1 x float> %v3, <1 x float> * noalias %out0, + <1 x float> * noalias %out1, <1 x float> * noalias %out2, + <1 x float> * noalias %out3) nounwind alwaysinline { + + store <1 x float> %v0, <1 x float > * %out0 + store <1 x float> %v1, <1 x float > * %out1 + store <1 x float> %v2, <1 x float > * %out2 + store <1 x float> %v3, <1 x float > * %out3 + + ret void +} + +define void +@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, + <1 x float> %v3, <1 x float> * noalias %out0, + <1 x float> * noalias %out1, <1 x float> * noalias %out2, + <1 x float> * noalias %out3) nounwind alwaysinline { + call void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> %v3, <1 x float> * %out0, + <1 x float> * %out1, <1 x float> * %out2, <1 x float> * %out3) + ret void +} + +define void +@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1, + <1 x float> * %out2) { + store <1 x float> %v0, <1 x float > * %out0 + store <1 x float> %v1, <1 x float > * %out1 + store <1 x float> %v2, <1 x float > * %out2 + + ret void +} + +define void +@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1, + <1 x float> * %out2) { + call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1, + <1 x float> * %out2) + ret void +} + + +;; end builtins + + +define <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline { + unary1to1(double, @round) +} + +define <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline { + unary1to1(double, @floor) +} + + +define <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline { + unary1to1(double, @ceil) +} + +; To do vector integer min and max, we do the vector compare and then sign +; extend the i1 vector result to an i32 mask. The __vselect does the +; rest... + +define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline { + %c = icmp slt <1 x i32> %0, %1 + %mask = sext <1 x i1> %c to <1 x i32> + %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask) + ret <1 x i32> %v +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp slt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline { + %c = icmp sgt <1 x i32> %0, %1 + %mask = sext <1 x i1> %c to <1 x i32> + %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask) + ret <1 x i32> %v +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp sgt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +; The functions for unsigned ints are similar, just with unsigned +; comparison functions... 
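+; (i.e. icmp ult/ugt below in place of icmp slt/sgt)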
+ +define <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline { + %c = icmp ult <1 x i32> %0, %1 + %mask = sext <1 x i1> %c to <1 x i32> + %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask) + ret <1 x i32> %v +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + %c = icmp ult i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +define <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline { + %c = icmp ugt <1 x i32> %0, %1 + %mask = sext <1 x i1> %c to <1 x i32> + %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask) + ret <1 x i32> %v +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + %c = icmp ugt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { + %call = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %call +} + +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { + %call = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %call +} + + +define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x float> %v, i32 0 + ret float %r +} + +define float @__reduce_min_float(<1 x float>) nounwind readnone { + %r = extractelement <1 x float> %0, i32 0 + ret float %r +} + +define float @__reduce_max_float(<1 x float>) nounwind readnone { + %r = extractelement <1 x float> %0, i32 0 + ret float %r +} + +define i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone { + %r = extractelement <1 x i32> %v, i32 0 + ret i32 %r +} + +define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone { + %r = extractelement <1 x i32> %0, i32 0 + ret i32 %r +} + +define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone { + %r = extractelement <1 x i32> %0, i32 0 + ret i32 %r +} + +define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone { + %r = extractelement <1 x i32> %0, i32 0 + ret i32 %r +} + +define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone { + %r = extractelement <1 x i32> %0, i32 0 + ret i32 %r + } + + +define double @__reduce_add_double(<1 x double>) nounwind readnone { + %m = extractelement <1 x double> %0, i32 0 + ret double %m +} + +define double @__reduce_min_double(<1 x double>) nounwind readnone { + %m = extractelement <1 x double> %0, i32 0 + ret double %m +} + +define double @__reduce_max_double(<1 x double>) nounwind readnone { + %m = extractelement <1 x double> %0, i32 0 + ret double %m +} + +define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone { + %m = extractelement <1 x i64> %0, i32 0 + ret i64 %m +} + +define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone { + %m = extractelement <1 x i64> %0, i32 0 + ret i64 %m +} + +define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone { + %m = extractelement <1 x i64> %0, i32 0 + ret i64 %m +} + +define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone { + %m = extractelement <1 x i64> %0, i32 0 + ret i64 %m +} + +define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone { + %m = extractelement <1 x i64> %0, i32 0 + ret i64 %m +} + +define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue, + <1 x i32> %mask) nounwind alwaysinline { + %v=extractelement <1 x i32> %vv, i32 0 
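+  ; with a single program instance there is nothing to compare against:
+  ; the one lane is trivially the common value, so store it and return true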
+ store i32 %v, i32 * %samevalue + ret i1 true + +} + +define i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue, + <1 x i32> %mask) nounwind alwaysinline { + %v=extractelement <1 x float> %vv, i32 0 + store float %v, float * %samevalue + ret i1 true + +} + +define i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue, + <1 x i32> %mask) nounwind alwaysinline { + %v=extractelement <1 x i64> %vv, i32 0 + store i64 %v, i64 * %samevalue + ret i1 true + +} + +define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue, + <1 x i32> %mask) nounwind alwaysinline { + %v=extractelement <1 x double> %vv, i32 0 + store double %v, double * %samevalue + ret i1 true + +} + +; extracting/reinserting elements because I want to be able to remove vectors later on + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +define <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline { + ;%call = call <1 x float> @llvm.x86.sse.rcp.ps(<1 x float> %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + ;%v_iv = fmul <1 x float> %0, %call + ;%two_minus = fsub <1 x float> , %v_iv + ;%iv_mul = fmul <1 x float> %call, %two_minus + ;ret <1 x float> %iv_mul + %d = extractelement <1 x float> %0, i32 0 + %r = fdiv float 1.,%d + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret <1 x float> %rv +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline { + ;%call = call <1 x float> @llvm.x86.sse.sqrt.ps(<1 x float> %0) + ;ret <1 x float> %call + %d = extractelement <1 x float> %0, i32 0 + %r = call float @llvm.sqrt.f32(float %d) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret <1 x float> %rv +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + ;%is = call <1 x float> @llvm.x86.sse.rsqrt.ps(<1 x float> %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + ;%v_is = fmul <1 x float> %v, %is + ;%v_is_is = fmul <1 x float> %v_is, %is + ;%three_sub = fsub <1 x float> , %v_is_is + ;%is_mul = fmul <1 x float> %is, %three_sub + ;%half_scale = fmul <1 x float> , %is_mul + ;ret <1 x float> %half_scale + %s = call <1 x float> @__sqrt_varying_float(<1 x float> %v) + %r = call <1 x float> @__rcp_varying_float(<1 x float> %s) + ret <1 x float> %r + +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.sin.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float,@llvm.sin.f32) + +} + +define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.cos.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float, @llvm.cos.f32) + +} + +define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { +; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) +; store <1 x float> %s, <1 x float> * %1 +; ret void + %sin = call <1 x float> @__svml_sin (<1 x float> %0) + %cos = call <1 x float> @__svml_cos (<1 x float> %0) + store <1 x float> %sin, <1 x float> * %1 + store <1 x float> %cos, <1 x float> * %2 + ret void +} + +define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_tan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unasry1to1(float, @llvm.tan.f32) + ; UNSUPPORTED! + ret <1 x float > %0 +} + +define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) +; ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_atan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unsary1to1(float,@llvm.atan.f32) + ;UNSUPPORTED! + ret <1 x float > %0 + +} + +define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + ;%y = extractelement <1 x float> %0, i32 0 + ;%x = extractelement <1 x float> %1, i32 0 + ;%q = fdiv float %y, %x + ;%a = call float @llvm.atan.f32 (float %q) + ;%rv = insertelement <1 x float> undef, float %a, i32 0 + ;ret <1 x float> %rv + ; UNSUPPORTED! 
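+  ; FIXME: as with __svml_tan and __svml_atan above, this returns its
+  ; first operand unchanged instead of computing atan2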
+ ret <1 x float > %0 +} + +define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.exp.f32) +} + +define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.log.f32) +} + +define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + %r = extractelement <1 x float> %0, i32 0 + %e = extractelement <1 x float> %1, i32 0 + %s = call float @llvm.pow.f32(float %r,float %e) + %rv = insertelement <1 x float> undef, float %s, i32 0 + ret <1 x float> %rv + +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline { +; %call = call <1 x float> @llvm.x86.sse.max.ps(<1 x float> %0, <1 x float> %1) +; ret <1 x float> %call + %a = extractelement <1 x float> %0, i32 0 + %b = extractelement <1 x float> %1, i32 0 + %d = fcmp ogt float %a, %b + %r = select i1 %d, float %a, float %b + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret <1 x float> %rv +} + +define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline { +; %call = call <1 x float> @llvm.x86.sse.min.ps(<1 x float> %0, <1 x float> %1) +; ret <1 x float> %call + %a = extractelement <1 x float> %0, i32 0 + %b = extractelement <1 x float> %1, i32 0 + %d = fcmp olt float %a, %b + %r = select i1 %d, float %a, float %b + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret <1 x float> %rv + +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline { + ;unarya2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ;ret <1 x double> %ret + unary1to1(double, @llvm.sqrt.f64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone { + ;binarsy2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ;ret <1 x double> %ret + %a = extractelement <1 x double> %0, i32 0 + %b = extractelement <1 x double> %1, i32 0 + %d = fcmp olt double %a, %b + %r = select i1 %d, double %a, double %b + %rv = insertelement <1 x double> undef, double %r, i32 0 + ret <1 x double> %rv + +} + +define <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone { + ;binary2sto4(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ;ret <1 x double> %ret + %a = extractelement <1 x double> %0, i32 0 + %b = extractelement <1 x double> %1, i32 0 + %d = fcmp ogt double %a, %b + %r = select i1 %d, double %a, double %b + %rv = insertelement <1 x double> undef, double %r, i32 0 + ret <1 x double> %rv + +} + + +define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { +; uniform float iv = extract(__rcp_u(v), 0); +; return iv * (2. 
- v * iv);
+  %r = fdiv float 1., %0
+  ret float %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+define float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; (the SSE targets use the roundss intrinsic here; this target has no
+  ; scalar rounding instruction to lean on, so simply reuse the 1-wide
+  ; varying implementation)
+  %v = insertelement <1 x float> undef, float %0, i32 0
+  %rv = call <1 x float> @__round_varying_float(<1 x float> %v)
+  %r = extractelement <1 x float> %rv, i32 0
+  ret float %r
+}
+
+define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  %v = insertelement <1 x float> undef, float %0, i32 0
+  %rv = call <1 x float> @__floor_varying_float(<1 x float> %v)
+  %r = extractelement <1 x float> %rv, i32 0
+  ret float %r
+}
+
+define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  %v = insertelement <1 x float> undef, float %0, i32 0
+  %rv = call <1 x float> @__ceil_varying_float(<1 x float> %v)
+  %r = extractelement <1 x float> %rv, i32 0
+  ret float %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+
+define double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %rs = call double @round(double %0)
+  ret double %rs
+}
+
+define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  %rs = call double @floor(double %0)
+  ret double %rs
+}
+
+define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  %rs = call double @ceil(double %0)
+  ret double %rs
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+
+define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  %ret = call float @llvm.sqrt.f32(float %0)
+  ret float %ret
+}
+
+define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
+  %ret = call double @llvm.sqrt.f64(double %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+
+define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  %s = call float @__sqrt_uniform_float(float %0)
+  %r = call float @__rcp_uniform_float(float %s)
+  ret float %r
+}
+
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fastmath
+
+
+define void @__fastmath() nounwind alwaysinline {
+  ; no-op
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+
+define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  %d = fcmp ogt float %0, %1
+  %r = select i1 %d, float %0, float %1
+  ret float %r
+}
+
+define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  %d = fcmp olt float %0, %1
+  %r = select i1 %d, float %0, float %1
+  ret float %r
+}
+
+define double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
+  %d = fcmp ogt double %0, %1
+  %r = select i1 %d, double %0, double %1
+  ret double %r
+}
+
+define double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
+  %d = fcmp olt double %0, %1
+  %r = select i1 %d, double %0, double %1
+  ret double %r
+}
+
+define_shuffles()
+
+ctlztz()
+
+define_prefetches()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <1 x float> @__half_to_float_varying(<1 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <1 x i16> @__float_to_half_varying(<1 x float> %v) nounwind readnone
diff --git a/ispc.cpp b/ispc.cpp
index 887f6ca3..480ff99a 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -144,7 +144,7 @@ static const char *supportedCPUs[] = {
     // FIXME: LLVM supports a ton of different ARM CPU variants--not just
     // cortex-a9 and a15. We should be able to handle any of them that also
     // have NEON support.
-    "cortex-a9", "cortex-a15",
+    "sm_35", "cortex-a9", "cortex-a15",
     "atom", "penryn", "core2", "corei7", "corei7-avx"
 #if !defined(LLVM_3_1)
     , "core-avx-i", "core-avx2"
@@ -187,7 +187,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             isa = "avx2";
         else if (!strcmp(cpu, "cortex-a9") ||
                  !strcmp(cpu, "cortex-a15"))
-            isa = "neon";
+            isa = "neon";
+        else if (!strcmp(cpu, "sm_35"))
+            isa = "nvptx64";
         else if (!strcmp(cpu, "core-avx-i"))
             isa = "avx1.1";
         else if (!strcmp(cpu, "sandybridge") ||
@@ -218,6 +220,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         cpu = "cortex-a9";
 #endif
 
+    if (cpu == NULL && !strcmp(isa, "nvptx64"))
+        cpu = "sm_35";
+
     if (cpu == NULL) {
         std::string hostCPU = llvm::sys::getHostCPUName();
         if (hostCPU.size() > 0)
@@ -248,6 +253,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     if (arch == NULL) {
         if (!strcmp(isa, "neon"))
             arch = "arm";
+        else if (!strcmp(isa, "nvptx64"))
+            arch = "nvptx64";
         else
             arch = "x86-64";
     }
@@ -454,6 +461,22 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
     }
+    else if (!strcasecmp(isa, "nvptx64")) {
+        this->m_isa = Target::NVPTX64;
+        this->m_nativeVectorWidth = 1;
+        this->m_vectorWidth = 1;
+        this->m_attributes = "+sm_35";
+#if 0
+        this->m_hasHalf = false;
+        this->m_maskingIsFree = true;
+        this->m_maskBitCount = 1;
+        this->m_hasTranscendentals = true;
+        this->m_hasGather = this->m_hasScatter = true;
+#else
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 32;
+#endif
+    }
     else {
         fprintf(stderr, "Target ISA \"%s\" is unknown. 
Choices are: %s\n", isa, SupportedTargetISAs()); @@ -561,13 +584,13 @@ Target::SupportedTargetCPUs() { const char * Target::SupportedTargetArchs() { - return "arm, x86, x86-64"; + return "nvptx64, arm, x86, x86-64"; } const char * Target::SupportedTargetISAs() { - return "neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" + return "nvptx64, neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" ", avx1.1, avx1.1-x2, avx2, avx2-x2" ", generic-1, generic-4, generic-8, generic-16, generic-32"; } @@ -579,6 +602,10 @@ Target::GetTripleString() const { if (m_arch == "arm") { triple.setTriple("armv7-eabi"); } + else if (m_arch == "nvptx64") + { + triple.setTriple("nvptx64"); + } else { // Start with the host triple as the default triple.setTriple(llvm::sys::getDefaultTargetTriple()); @@ -604,6 +631,8 @@ Target::ISAToString(ISA isa) { switch (isa) { case Target::NEON: return "neon"; + case Target::NVPTX64: + return "nvptx64"; case Target::SSE2: return "sse2"; case Target::SSE4: diff --git a/ispc.h b/ispc.h index 7d10b908..de41a3e8 100644 --- a/ispc.h +++ b/ispc.h @@ -175,7 +175,7 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NEON, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + enum ISA { NVPTX64, NEON, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. Returns true if the diff --git a/main.cpp b/main.cpp index de2bb620..9ab0b793 100644 --- a/main.cpp +++ b/main.cpp @@ -262,6 +262,15 @@ int main(int Argc, char *Argv[]) { LLVMInitializeARMDisassembler(); LLVMInitializeARMTargetMC(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXAsmPrinter(); +#if 0 + LLVMInitializeNVPTXAsmParser(); + LLVMInitializeNVPTXDisassembler(); +#endif + LLVMInitializeNVPTXTargetMC(); + char *file = NULL; const char *headerFileName = NULL; const char *outFileName = NULL;
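
A possible direction for to-do items 3 and 4 in the commit message (a sketch
only; nothing below is part of this patch): once each program instance maps
to one CUDA thread, programIndex could be backed by the NVVM special-register
intrinsics that LLVM's NVPTX backend exposes. The helper name
__program_index is a placeholder, and a warp size of 32 is assumed:

declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone

define i32 @__program_index() nounwind readnone alwaysinline {
  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ; lane index within the warp; programCount would then map to warpSize
  %lane = and i32 %tid, 31
  ret i32 %lane
}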