diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll index d29d1c1c..e6385d10 100644 --- a/builtins/target-nvptx.ll +++ b/builtins/target-nvptx.ll @@ -124,23 +124,23 @@ define i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline ;; i32 define internal i32 @__shfl_i32_nvptx(i32, i32) nounwind readnone alwaysinline { - %shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline + %shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) ret i32 %shfl } define internal i32 @__shfl_xor_i32_nvptx(i32, i32) nounwind readnone alwaysinline { - %shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline + %shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) ret i32 %shfl } ;; float define internal float @__shfl_float_nvptx(float, i32) nounwind readnone alwaysinline { - %shfl = tail call float asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) nounwind readnone alwaysinline + %shfl = tail call float asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) ret float %shfl } define internal float @__shfl_xor_float_nvptx(float, i32) nounwind readnone alwaysinline { - %shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) nounwind readnone alwaysinline + %shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) ret float %shfl } @@ -148,12 +148,12 @@ define internal float @__shfl_xor_float_nvptx(float, i32) nounwind readnone alwa ;; float/double define internal float @__fminf_nvptx(float,float) nounwind readnone alwaysinline { - %min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline + %min = tail call float asm sideeffect "min.f32 $0, $1, 
$2;", "=f,f,f"(float %0, float %1) ret float %min } define internal float @__fmaxf_nvptx(float,float) nounwind readnone alwaysinline { - %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline + %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) ret float %max } @@ -245,12 +245,12 @@ define internal i32 @__ballot_nvptx(i1) nounwind readnone alwaysinline "{ .reg .pred %p1; setp.ne.u32 %p1, $1, 0; vote.ballot.b32 $0, %p1; - }", "=r,r"(i32 %conv) nounwind readnone alwaysinline + }", "=r,r"(i32 %conv) ret i32 %res } define internal i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline { - %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() nounwind readnone alwaysinline + %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() ret i32 %mask } @@ -418,7 +418,7 @@ define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline "{ .reg .f16 tmp; mov.b16 tmp, $1; cvt.f32.f16 $0, tmp; - }", "=f,h"(i16 %v) nounwind readnone alwaysinline + }", "=f,h"(i16 %v) ret float %res } define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline @@ -429,7 +429,7 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline "{ .reg .f16 tmp; cvt.rn.f16.f32 tmp, $1; mov.b16 $0, tmp; - }", "=h,f"(float %v) nounwind readnone alwaysinline + }", "=h,f"(float %v) ret i16 %half } define @__half_to_float_varying( %v) nounwind readnone alwaysinline @@ -473,7 +473,7 @@ define internal float @__round_uniform_float_ptx(float) nounwind readnone always cvt.rzi.f32.f32 f9, f4; BB2_2: mov.f32 $0, f9; - }", "=f,f"(float %0) nounwind readnone alwaysinline + }", "=f,f"(float %0) ret float %2 } define float @__round_uniform_float(float) nounwind readonly alwaysinline { @@ -490,12 +490,12 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline { } define float @__floor_uniform_float(float) nounwind 
readnone alwaysinline { - %2 = tail call float asm sideeffect "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone + %2 = tail call float asm sideeffect "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) ret float %2 } define float @__ceil_uniform_float(float) nounwind readnone alwaysinline { - %2 = tail call float asm sideeffect "cvt.rpi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone + %2 = tail call float asm sideeffect "cvt.rpi.f32.f32 $0, $1;", "=f,f"(float %0) ret float %2 } @@ -534,17 +534,17 @@ define double @__round_uniform_double(double) nounwind readnone alwaysinline BB5_2: mov.f64 $0, fd8; - }", "=d,d"(double %0) nounwind readnone alwaysinline + }", "=d,d"(double %0) ret double %2 } define double @__floor_uniform_double(double) nounwind readnone alwaysinline { - %2 = tail call double asm sideeffect "cvt.rmi.f64.f64 $0, $1;", "=f,f"(double %0) nounwind alwaysinline readnone + %2 = tail call double asm sideeffect "cvt.rmi.f64.f64 $0, $1;", "=f,f"(double %0) ret double %2 } define double @__ceil_uniform_double(double) nounwind readnone alwaysinline { - %2 = tail call double asm sideeffect "cvt.rpi.f64.f64 $0, $1;", "=f,f"(double %0) nounwind alwaysinline readnone + %2 = tail call double asm sideeffect "cvt.rpi.f64.f64 $0, $1;", "=f,f"(double %0) ret double %2 } @@ -763,7 +763,7 @@ declare i32 @llvm.ctpop.i32(i32) nounwind readnone define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { %call = call i32 @llvm.ctpop.i32(i32 %0) ret i32 %call -;; %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0) nounwind readnone alwaysinline +;; %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0) ;; ret i32 %res } @@ -1117,7 +1117,7 @@ define internal i32 @__shfl_reduce_and_step_i32_nvptx(i32, i32) nounwind readnon shfl.bfly.b32 r0|p, $1, $2, 0; @p and.b32 r0, r0, $3; mov.u32 $0, r0; - }", "=r,r,r,r"(i32 %0, i32 %1, i32 %0) nounwind readnone alwaysinline + }", "=r,r,r,r"(i32 %0, i32 %1, i32 %0) 
ret i32 %shfl } shfl64(__shfl_reduce_and_step, i64) @@ -1298,7 +1298,7 @@ define internal i32 @__shfl_scan_add_step_i32(i32 %partial, i32 %up_offset) noun shfl.up.b32 r0|p, $1, $2, 0; @p add.u32 r0, r0, $3; mov.u32 $0, r0; - }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) ret i32 %result; } define <1 x i32> @__exclusive_scan_add_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline @@ -1325,7 +1325,7 @@ define internal i32 @__shfl_scan_or_step_i32(i32 %partial, i32 %up_offset) nounw shfl.up.b32 r0|p, $1, $2, 0; @p or.b32 r0, r0, $3; mov.u32 $0, r0; - }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind alwaysinline + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) ret i32 %result; } define <1 x i32> @__exclusive_scan_or_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline @@ -1341,7 +1341,7 @@ define <1 x i32> @__exclusive_scan_or_i32(<1 x i32>, <1 x i1>) nounwind readnone shfl.up.b32 r0|p, $1, 1, 0; @!p mov.u32 r0, 0; mov.u32 $0, r0; - }","=r,r"(i32 %v1); alwaysinline nounwind + }","=r,r"(i32 %v1) %s1 = tail call i32 @__shfl_scan_or_step_i32(i32 %v, i32 1); %s2 = tail call i32 @__shfl_scan_or_step_i32(i32 %s1, i32 2); @@ -1360,7 +1360,7 @@ define internal i32 @__shfl_scan_and_step_i32(i32 %partial, i32 %up_offset) noun shfl.up.b32 r0|p, $1, $2, 0; @p and.b32 r0, r0, $3; mov.u32 $0, r0; - }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) alwaysinline + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) ret i32 %result; } define <1 x i32> @__exclusive_scan_and_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline @@ -1376,7 +1376,7 @@ define <1 x i32> @__exclusive_scan_and_i32(<1 x i32>, <1 x i1>) nounwind readnon shfl.up.b32 r0|p, $1, 1, 0; @!p mov.u32 r0, -1; mov.u32 $0, r0; - }","=r,r"(i32 %v1); alwaysinline + }","=r,r"(i32 %v1) %s1 = call i32 @__shfl_scan_and_step_i32(i32 %v, i32 1); %s2 = call i32 
@__shfl_scan_and_step_i32(i32 %s1, i32 2); @@ -1395,7 +1395,7 @@ define internal float @__shfl_scan_add_step_float(float %partial, i32 %up_offset shfl.up.b32 f0|p, $1, $2, 0; @p add.f32 f0, f0, $3; mov.f32 $0, f0; - }", "=f,f,r,f"(float %partial, i32 %up_offset, float %partial) nounwind readnone alwaysinline + }", "=f,f,r,f"(float %partial, i32 %up_offset, float %partial) ret float %result; } define <1 x float> @__exclusive_scan_add_float(<1 x float>, <1 x i1>) nounwind readnone alwaysinline @@ -1427,7 +1427,7 @@ define internal double @__shfl_scan_add_step_double(double %partial, i32 %up_off mov.b64 fd0, {r3,r4}; @p add.f64 fd0, fd0, $3; mov.f64 $0, fd0; - }", "=d,d,r,d"(double %partial, i32 %up_offset, double %partial) nounwind readnone alwaysinline + }", "=d,d,r,d"(double %partial, i32 %up_offset, double %partial) ret double %result; } define <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline @@ -1460,7 +1460,7 @@ define internal i64 @__shfl_scan_add_step_i64(i64 %partial, i32 %up_offset) noun mov.b64 rl0, {r3,r4}; @p add.s64 rl0, rl0, $3; mov.s64 $0, rl0; - }", "=l,l,r,l"(i64 %partial, i32 %up_offset, i64 %partial) nounwind readnone alwaysinline + }", "=l,l,r,l"(i64 %partial, i32 %up_offset, i64 %partial) ret i64 %result; } define <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline diff --git a/examples/portable/common_ptx.mk b/examples/portable/common_ptx.mk index ed4b11cf..cfaa0b02 100644 --- a/examples/portable/common_ptx.mk +++ b/examples/portable/common_ptx.mk @@ -21,14 +21,15 @@ ifdef PTXCC_REGMAX endif # -ISPC=ispc +ISPC=$(ISPC_HOME)/ispc ISPC_FLAGS+=-O3 --math-lib=fast --target=nvptx --opt=fast-math # # # ISPC_LLVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.o) ISPC_NVVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.o) -ISPC_BCS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.bc) +#ISPC_BCS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.bc) +ISPC_LLS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.ll) 
ISPC_LLVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.ptx) ISPC_NVVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.ptx) ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.h) @@ -42,13 +43,13 @@ CXX_OBJS+=objs_ptx/ispc_malloc_gcc.o PTXGEN = $(ISPC_HOME)/ptxtools/ptxgen PTXGEN += --use_fast_math -LLVM32=$(HOME)/usr/local/llvm/bin-3.2 -LLVM32DIS=$(LLVM32)/bin/llvm-dis +#LLVM32=$(HOME)/usr/local/llvm/bin-3.2 +#LLVM32DIS=$(LLVM32)/bin/llvm-dis -LLC=$(HOME)/usr/local/llvm/bin-trunk/bin/llc +LLC=$(LLVM_ROOT)/bin/llc LLC_FLAGS=-march=nvptx64 -mcpu=sm_35 -# .SUFFIXES: .bc .o .cu +# .SUFFIXES: .bc .o .cu .ll ifdef LLVM_GPU OBJSptx_llvm=$(ISPC_LLVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS) @@ -74,7 +75,7 @@ endif all: dirs \ $(PROGptx_nvvm) \ $(PROGptx_llvm) \ - $(PROGcu) $(ISPC_BC) $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX) + $(PROGcu) $(ISPC_BCS) $(ISPC_LLS) $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX) dirs: /bin/mkdir -p objs_ptx/ @@ -109,15 +110,21 @@ objs_ptx/%_nvcc.o: %.cu $(NVCC) $(NVCC_FLAGS) -o $@ -c $< # compile ISPC to LLVM BC -objs_ptx/%_ispc.h objs_ptx/%_ispc.bc: %.ispc - $(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.bc $< +#objs_ptx/%_ispc.h objs_ptx/%_ispc.bc: %.ispc +# $(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.bc $< +objs_ptx/%_ispc.h objs_ptx/%_ispc.ll: %.ispc + $(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.ll $< # generate PTX from LLVM BC -objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.bc +#objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.bc +# $(LLC) $(LLC_FLAGS) -o $@ $< +objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.ll $(LLC) $(LLC_FLAGS) -o $@ $< -objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.bc - $(LLVM32DIS) $< -o objs_ptx/$*_ispc-ll32.ll - $(PTXGEN) objs_ptx/$*_ispc-ll32.ll -o $@ +#objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.bc +# $(LLVM32DIS) $< -o objs_ptx/$*_ispc-ll32.ll +# $(PTXGEN) objs_ptx/$*_ispc-ll32.ll -o $@ +objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.ll + $(PTXGEN) $< -o $@ # 
generate an object file from PTX objs_ptx/%_ispc.o: objs_ptx/%_ispc.ptx diff --git a/module.cpp b/module.cpp index db378591..fc5c6437 100644 --- a/module.cpp +++ b/module.cpp @@ -58,6 +58,7 @@ #include #include #include +#include #ifdef ISPC_IS_WINDOWS #include #include @@ -71,6 +72,7 @@ #include #include #include + #include "llvm/Assembly/AssemblyAnnotationWriter.h" #else #include #include @@ -78,6 +80,7 @@ #include #include #include + #include "llvm/Assembly/AssemblyAnnotationWriter.h" #endif #include #include @@ -1034,8 +1037,14 @@ Module::writeOutput(OutputType outputType, const char *outFileName, fileType = "assembly"; break; case Bitcode: - if (strcasecmp(suffix, "bc")) - fileType = "LLVM bitcode"; + if (g->target->getISA() != Target::NVPTX) + { + if (strcasecmp(suffix, "bc")) + fileType = "LLVM bitcode"; + } + else + if (strcasecmp(suffix, "ll")) + fileType = "LLVM assembly"; break; case Object: if (strcasecmp(suffix, "o") && strcasecmp(suffix, "obj")) @@ -1104,6 +1113,73 @@ Module::writeOutput(OutputType outputType, const char *outFileName, return writeObjectFileOrAssembly(outputType, outFileName); } +typedef std::vector vecString_t; +static vecString_t +lSplitString(const std::string &s) +{ + std::stringstream ss(s); + std::istream_iterator begin(ss); + std::istream_iterator end; + return vecString_t(begin,end); +} + +static void +lFixAttributes(const vecString_t &src, vecString_t &dst) +{ + dst.clear(); + + std::vector< std::pair > attributePos; + + typedef std::map attributeMap_t; + attributeMap_t attributeMap; + + for (vecString_t::const_iterator it = src.begin(); it != src.end(); it++) + { + const vecString_t words = lSplitString(*it); + if (!words.empty() && words[0] == "attributes" && words[1][0] == '#') + { + const int nWords = words.size(); + assert(nWords > 3); + assert(words[2 ] == "="); + assert(words[3 ] == "{"); + assert(words[nWords-1] == "}"); + std::string attributes; + for (int w = 4; w < nWords-1; w++) + attributes += words[w] + " "; + 
attributeMap[words[1]] = attributes; + } + } + for (vecString_t::const_iterator it = src.begin(); it != src.end(); it++) + { + vecString_t words = lSplitString(*it); + if (!words.empty() && words[0] == "attributes") + continue; + std::string s; + std::map attributeSet; +#if 1 /* these attributes cannot be used on function parameters, so remove them */ + attributeSet["readnone"] = " "; + attributeSet["readonly"] = " "; + attributeSet["readnone,"] = ","; + attributeSet["readonly,"] = ","; +#endif + + + for (vecString_t::iterator w = words.begin(); w != words.end(); w++) + { + if (attributeSet.find(*w) != attributeSet.end()) + *w = attributeSet[*w]; + + if ((*w)[0] == '#') + { + attributeMap_t::iterator m = attributeMap.find(*w); + assert (m != attributeMap.end()); + *w = attributeMap[*w]; + } + s += *w + " "; + } + dst.push_back(s); + } +} bool Module::writeBitcode(llvm::Module *module, const char *outFileName) { @@ -1128,12 +1204,44 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) { } llvm::raw_fd_ostream fos(fd, (fd != 1), false); - if (g->target->getISA() == Target::NVPTX) + if (g->target->getISA() != Target::NVPTX) { + llvm::WriteBitcodeToFile(module, fos); + } + else + { + /* when using "nvptx" target, emit patched/hacked assembly + * NVPTX only accepts 3.2-style LLVM assembly, where attributes + * must be inlined, rather than referenced by #attribute_d + * As soon as NVVM supports 3.3,3.4 style assembly this fix won't be needed + */ const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; module->setDataLayout(dl_string); + + std::string s; + llvm::raw_string_ostream out(s); + llvm::OwningPtr Annotator; + module->print(out, Annotator.get()); + std::istringstream iss(s); + + vecString_t input,output; + while (std::getline(iss,s)) + input.push_back(s); + output = input; + +#if !(defined(LLVM_3_1) || defined(LLVM_3_2)) + /* do not fix 
attributes with LLVM 3.2, everything is fine there */ + lFixAttributes(input,output); +#endif + + for (vecString_t::iterator it = output.begin(); it != output.end(); it++) + { + *it += "\n"; + fos << *it; + } } - llvm::WriteBitcodeToFile(module, fos); + + return true; } diff --git a/ptxtools/runtest_ptxcc.sh b/ptxtools/runtest_ptxcc.sh index 67d9ccd7..52eccad0 100755 --- a/ptxtools/runtest_ptxcc.sh +++ b/ptxtools/runtest_ptxcc.sh @@ -7,9 +7,12 @@ TMPDIR=/tmp fbname=`basename $1` if [ "$NVVM" == "1" ]; then - LLVM32=$HOME/usr/local/llvm/bin-3.2 - LLVMDIS=$LLVM32/bin/llvm-dis - $($LLVMDIS $1 -o $TMPDIR/$fbname.ll) && $($PTXGEN $TMPDIR/$fbname.ll -o $TMPDIR/$fbname.ptx) && \ +# LLVM32=$HOME/usr/local/llvm/bin-3.2 +# LLVM34=$HOME/usr/local/llvm/bin-3.4 +# LLVMAS=$LLVM34/bin/llvm-as +# LLVMDIS=$LLVM32/bin/llvm-dis +# $($LLVMAS $1 -o $TMPDIR/$fbname.bc) && $($LLVMDIS $TMPDIR/$fbname.bc -o $TMPDIR/$fbname.ll) && $($PTXGEN $TMPDIR/$fbname.ll -o $TMPDIR/$fbname.ptx) && \ + $($PTXGEN $1 -o $TMPDIR/$fbname.ptx) && \ $($PTXCC $TMPDIR/$fbname.ptx -o $TMPDIR/$fbname.o -Xnvcc="-G") && \ $(nvcc test_static_nvptx.cpp examples/util/nvcc_helpers.cu examples/util/ispc_malloc.cpp $TMPDIR/$fbname.o -arch=sm_35 -Iexamples/util/ -D_CUDA_ -lcudadevrt $ARGS); else