diff --git a/Makefile b/Makefile index abe7e1f7..b3af57bb 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ # If you have your own special version of llvm and/or clang, change # these variables to match. -LLVM_CONFIG=$(shell which $(HOME)/usr/local/llvm/bin-trunk/bin/llvm-config) +LLVM_CONFIG=$(shell which $(HOME)/usr/local/llvm/bin-3.4/bin/llvm-config) CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir) # Enable ARM by request diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll index ff73e09d..76df46d9 100644 --- a/builtins/target-nvptx.ll +++ b/builtins/target-nvptx.ll @@ -653,9 +653,19 @@ define i1 @__none(<1 x i1>) nounwind readnone alwaysinline { declare i16 @__reduce_add_int8() nounwind readnone declare i32 @__reduce_add_int16() nounwind readnone -define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { - %r = extractelement <1 x float> %v, i32 0 - ret float %r +define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { + %value = extractelement <1 x float> %v, i32 0 + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = fadd float %call, %value + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = fadd float %call1, %call.1 + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = fadd float %call1.1, %call.2 + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = fadd float %call1.2, %call.3 + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = fadd float %call1.3, %call.4 + ret float %call1.4 } define float @__reduce_min_float(<1 x float>) nounwind readnone { diff --git a/ctx.cpp b/ctx.cpp index 43012640..6fb7561d 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1410,7 +1410,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { llvm::Value * FunctionEmitContext::ProgramIndexVector(bool is32bits) { - if (g->target->getISA() != 
Target::NVPTX) + if (1 || g->target->getISA() != Target::NVPTX) { llvm::SmallVector array; for (int i = 0; i < g->target->getVectorWidth() ; ++i) { diff --git a/examples_ptx/ptxcc/ptxcc b/examples_ptx/ptxcc/ptxcc index 0f7e384e..7953f902 100755 --- a/examples_ptx/ptxcc/ptxcc +++ b/examples_ptx/ptxcc/ptxcc @@ -10,7 +10,7 @@ DEPTX=dePTX NVCC=nvcc $DEPTX < $PTXSRC > $PTXCU && -$NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \ +$NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \ sed 's/\#\$//g'| \ awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH && sh $PTXSH diff --git a/func.cpp b/func.cpp index f12f244b..981433d6 100644 --- a/func.cpp +++ b/func.cpp @@ -512,6 +512,7 @@ Function::GenerateIR() { if (g->target->getISA() == Target::NVPTX) { functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */ +#if 0 llvm::NamedMDNode* annotations = m->module->getOrInsertNamedMetadata("nvvm.annotations"); llvm::SmallVector av; @@ -519,6 +520,7 @@ Function::GenerateIR() { av.push_back(llvm::MDString::get(*g->ctx, "kernel")); av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); +#endif } llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); diff --git a/module.cpp b/module.cpp index f8936625..93b0dcd6 100644 --- a/module.cpp +++ b/module.cpp @@ -427,15 +427,6 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE return; } -#if 0 - if (g->target->getISA() == Target::NVPTX && - type->IsVaryingType()) - { - Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target."); - return; - } -#endif - if (Type::Equal(type, AtomicType::Void)) { Error(pos, "\"void\" type global variable is illegal."); return; @@ -453,6 +444,17 @@ Module::AddGlobalVariable(const std::string &name, const 
Type *type, Expr *initE return; } +#if 1 + if (g->target->getISA() == Target::NVPTX && + at != NULL && + type->IsVaryingType()) + { + Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target."); + return; + } +#endif + + llvm::Type *llvmType = type->LLVMType(g->ctx); if (llvmType == NULL) return; @@ -2130,6 +2132,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre opts.addMacroDef("cif=if"); opts.addMacroDef("cfor=for"); opts.addMacroDef("cwhile=while"); + opts.addMacroDef("ccontinue=continue"); opts.addMacroDef("cdo=do"); opts.addMacroDef("taskIndex=blockIndex0()"); opts.addMacroDef("taskCount=blockCount0()"); diff --git a/opt.cpp b/opt.cpp index b0910b90..8702ca38 100644 --- a/opt.cpp +++ b/opt.cpp @@ -497,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) { // run absolutely no optimizations, since the front-end needs us to // take the various __pseudo_* functions it has emitted and turn // them into something that can actually execute. 
- optPM.add(CreateImproveMemoryOpsPass(), 100); + + if (g->opt.disableGatherScatterOptimizations == false && + g->target->getVectorWidth() > 1) + optPM.add(CreateImproveMemoryOpsPass(), 100); + if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); diff --git a/run_tests.py b/run_tests.py index f546463e..0da70e72 100755 --- a/run_tests.py +++ b/run_tests.py @@ -257,7 +257,7 @@ def run_test(testname): cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \ (nvptxcc_exe_rel, obj_name, match, exe_name) - ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \ + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \ (filename, obj_name, options.arch, options.target) if options.no_opt: ispc_cmd += " -O0" @@ -271,7 +271,7 @@ def run_test(testname): print "Grepping: %s" % grep_cmd sp = subprocess.Popen(grep_cmd, shell=True) sp.communicate() - ispc_cmd = ispc_exe_rel + " --woff %s -o %s --emit-asm --target=%s" % \ + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \ (filename4ptx, obj_name, options.target) # compile the ispc code, make the executable, and run it... @@ -287,7 +287,7 @@ def run_test(testname): basename = os.path.basename(filename) os.unlink("%s.pdb" % basename) os.unlink("%s.ilk" % basename) - os.unlink(obj_name) +# os.unlink(obj_name) except: None diff --git a/tests/broadcast.ispc b/tests/broadcast.ispc index 1df835ae..6dfa1a00 100644 --- a/tests/broadcast.ispc +++ b/tests/broadcast.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = (programCount == 1) ? 3 : broadcast(a, 2); + float b = (programCount == 1) ? 
4 : broadcast(a, 2); RET[programIndex] = b; } diff --git a/tests/c-test-64.ispc b/tests/c-test-64.ispc index 3429bf91..d2602bc7 100644 --- a/tests/c-test-64.ispc +++ b/tests/c-test-64.ispc @@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 2; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 5; - RET[3] = RET[7] = RET[11] = RET[15] = 6; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 2; + RET[i+1] = 3; + RET[i+2] = 5; + RET[i+3] = 6; + } } diff --git a/tests/c-test-65.ispc b/tests/c-test-65.ispc index 9a363864..15df6367 100644 --- a/tests/c-test-65.ispc +++ b/tests/c-test-65.ispc @@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) { export void result(uniform float RET[]) { RET[programIndex] = 3; - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+3] = 29; + } } diff --git a/tests/c-test-66.ispc b/tests/c-test-66.ispc index a6c35dc7..22511604 100644 --- a/tests/c-test-66.ispc +++ b/tests/c-test-66.ispc @@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { RET[programIndex] = 32; - RET[2] = RET[6] = RET[10] = RET[14] = 38; - RET[3] = RET[7] = RET[11] = RET[15] = 39; + for (int i = 0; i < programCount; i += 4) + { + RET[i+2] = 38; + RET[i+3] = 39; + } } diff --git a/tests/cfor-array-struct-gather.ispc b/tests/cfor-array-struct-gather.ispc index c320ad7c..d433b00d 100644 --- a/tests/cfor-array-struct-gather.ispc +++ b/tests/cfor-array-struct-gather.ispc @@ -4,14 +4,14 @@ export uniform int width() { return programCount; } struct Foo { - uniform float x[17]; + uniform float x[programCount+1]; }; export void f_fu(uniform float RET[], uniform float aFOO[], uniform float 
b) { float a = aFOO[programIndex]; uniform Foo foo; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo.x[i] = i; if ((int)a & 1) diff --git a/tests/cfor-gs-double-improve-multidim-1.ispc b/tests/cfor-gs-double-improve-multidim-1.ispc index ed672bd8..62124e2a 100644 --- a/tests/cfor-gs-double-improve-multidim-1.ispc +++ b/tests/cfor-gs-double-improve-multidim-1.ispc @@ -4,9 +4,9 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - uniform double udx[25][25]; - cfor (uniform int i = 0; i < 25; ++i) - cfor (uniform int j = 0; j < 25; ++j) + uniform double udx[programCount+1][programCount+1]; + cfor (uniform int i = 0; i < programCount+1; ++i) + cfor (uniform int j = 0; j < programCount+1; ++j) udx[i][j] = 10*i+j; int x = 1; diff --git a/tests/cfor-gs-improve-multidim-1.ispc b/tests/cfor-gs-improve-multidim-1.ispc index b0893617..42732ebb 100644 --- a/tests/cfor-gs-improve-multidim-1.ispc +++ b/tests/cfor-gs-improve-multidim-1.ispc @@ -5,9 +5,9 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - uniform float udx[20][20]; - cfor (uniform int i = 0; i < 20; ++i) - cfor (uniform int j = 0; j < 20; ++j) + uniform float udx[programCount+1][programCount+1]; + cfor (uniform int i = 0; i < programCount+1; ++i) + cfor (uniform int j = 0; j < programCount+1; ++j) udx[i][j] = 100*i+j; int x = 1; diff --git a/tests/cfor-struct-gather-2.ispc b/tests/cfor-struct-gather-2.ispc index 7c615139..75da4a3f 100644 --- a/tests/cfor-struct-gather-2.ispc +++ b/tests/cfor-struct-gather-2.ispc @@ -13,9 +13,9 @@ float func(Foo foo[], int offset) { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; 
++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = func(foo, (int)a); } diff --git a/tests/cfor-struct-gather-3.ispc b/tests/cfor-struct-gather-3.ispc index 7c615139..75da4a3f 100644 --- a/tests/cfor-struct-gather-3.ispc +++ b/tests/cfor-struct-gather-3.ispc @@ -13,9 +13,9 @@ float func(Foo foo[], int offset) { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = func(foo, (int)a); } diff --git a/tests/cfor-struct-gather.ispc b/tests/cfor-struct-gather.ispc index 49928a6b..9265da32 100644 --- a/tests/cfor-struct-gather.ispc +++ b/tests/cfor-struct-gather.ispc @@ -9,9 +9,9 @@ struct Foo { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = foo[(int)a].f; } diff --git a/tests/test-134.ispc b/tests/test-134.ispc index baa8ec37..9d4d0e94 100644 --- a/tests/test-134.ispc +++ b/tests/test-134.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/test-135.ispc b/tests/test-135.ispc index c350a524..bb9881e6 100644 --- a/tests/test-135.ispc +++ b/tests/test-135.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = 
RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/test-136.ispc b/tests/test-136.ispc index ab6c6b5b..098ac456 100644 --- a/tests/test-136.ispc +++ b/tests/test-136.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/test-140.ispc b/tests/test-140.ispc index a983d528..997d558e 100644 --- a/tests/test-140.ispc +++ b/tests/test-140.ispc @@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { } export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0; - RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1; - RET[2] = RET[6] = RET[10] = RET[14] = 0x1.193ea8p+0; - RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 0x0.0p+0; + RET[i+1] = 0x1.62e43p-1; + RET[i+2] = 0x1.193ea8p+0; + RET[i+3] = 0x1.62e43p+0; + } }