+Fixed some examples; found some bugs, as well as bugs in ptxas/CUDA

2014-01-21 14:51:27 +01:00
parent 5a773ed62a
commit bc99897fbb
22 changed files with 98 additions and 58 deletions
--- a/2
+++ b/2
@@ -36,7 +36,7 @@

 # If you have your own special version of llvm and/or clang, change
 # these variables to match.
-LLVM_CONFIG=$(shell which $(HOME)/usr/local/llvm/bin-trunk/bin/llvm-config)
+LLVM_CONFIG=$(shell which $(HOME)/usr/local/llvm/bin-3.4/bin/llvm-config)
 CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)

 # Enable ARM by request
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -653,9 +653,19 @@ define  i1 @__none(<1 x i1>) nounwind readnone alwaysinline {
 declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
 declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone

-define  float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
-  %r = extractelement <1 x float> %v, i32 0
-  ret float %r
+define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
+  %value = extractelement <1 x float> %v, i32 0
+  %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16)
+  %call1 = fadd float %call, %value 
+  %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8)
+  %call1.1 = fadd float %call1, %call.1 
+  %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4)
+  %call1.2 = fadd float %call1.1, %call.2
+  %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2)
+  %call1.3 = fadd float %call1.2, %call.3 
+  %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1)
+  %call1.4 = fadd float %call1.3, %call.4 
+  ret float %call1.4
 }

 define  float @__reduce_min_float(<1 x float>) nounwind readnone {
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1410,7 +1410,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {

 llvm::Value *
 FunctionEmitContext::ProgramIndexVector(bool is32bits) {
-  if (g->target->getISA() != Target::NVPTX)
+  if (1 || g->target->getISA() != Target::NVPTX)
  {
    llvm::SmallVector<llvm::Constant*, 16> array;
    for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
--- a/examples_ptx/ptxcc/ptxcc
+++ b/examples_ptx/ptxcc/ptxcc
@@ -10,7 +10,7 @@ DEPTX=dePTX
 NVCC=nvcc

 $DEPTX < $PTXSRC > $PTXCU &&
-$NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \
+$NVCC -arch=sm_35  -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \
  sed 's/\#\$//g'| \
  awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH &&
 sh $PTXSH
--- a/func.cpp
+++ b/func.cpp
@@ -512,6 +512,7 @@ Function::GenerateIR() {
                if (g->target->getISA() == Target::NVPTX)
                {
                  functionName += std::string("___export");  /* add ___export to the end, for ptxcc to recognize it is exported */
+#if 0
                  llvm::NamedMDNode* annotations =
                    m->module->getOrInsertNamedMetadata("nvvm.annotations");
                  llvm::SmallVector<llvm::Value*, 3> av;
@@ -519,6 +520,7 @@ Function::GenerateIR() {
                  av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
                  av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
                  annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); 
+#endif
                }
                llvm::Function *appFunction =
                    llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
--- a/module.cpp
+++ b/module.cpp
@@ -427,15 +427,6 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
        return;
    }

-#if 0
-    if (g->target->getISA() == Target::NVPTX &&
-        type->IsVaryingType())
-    {
-        Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
-        return;
-    }
-#endif
-
    if (Type::Equal(type, AtomicType::Void)) {
        Error(pos, "\"void\" type global variable is illegal.");
        return;
@@ -453,6 +444,17 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
        return;
    }

+#if 1
+    if (g->target->getISA() == Target::NVPTX && 
+        at != NULL &&
+        type->IsVaryingType())
+    {
+        Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
+        return;
+    }
+#endif
+
+
    llvm::Type *llvmType = type->LLVMType(g->ctx);
    if (llvmType == NULL)
        return;
@@ -2130,6 +2132,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
      opts.addMacroDef("cif=if");
      opts.addMacroDef("cfor=for");
      opts.addMacroDef("cwhile=while");
+      opts.addMacroDef("ccontinue=continue");
      opts.addMacroDef("cdo=do");
      opts.addMacroDef("taskIndex=blockIndex0()");
      opts.addMacroDef("taskCount=blockCount0()");
--- a/opt.cpp
+++ b/opt.cpp
@@ -497,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) {
        // run absolutely no optimizations, since the front-end needs us to
        // take the various __pseudo_* functions it has emitted and turn
        // them into something that can actually execute.
-        optPM.add(CreateImproveMemoryOpsPass(), 100);
+        
+        if (g->opt.disableGatherScatterOptimizations == false &&
+            g->target->getVectorWidth() > 1) 
+          optPM.add(CreateImproveMemoryOpsPass(), 100);
+
        if (g->opt.disableHandlePseudoMemoryOps == false)
            optPM.add(CreateReplacePseudoMemoryOpsPass());

--- a/run_tests.py
+++ b/run_tests.py
@@ -257,7 +257,7 @@ def run_test(testname):
                  cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \
                      (nvptxcc_exe_rel, obj_name, match, exe_name)

-            ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \
+            ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \
                       (filename, obj_name, options.arch, options.target)
            if options.no_opt:
                ispc_cmd += " -O0" 
@@ -271,7 +271,7 @@ def run_test(testname):
                  print "Grepping: %s" % grep_cmd
                sp = subprocess.Popen(grep_cmd, shell=True)
                sp.communicate()
-                ispc_cmd = ispc_exe_rel + " --woff %s -o %s --emit-asm --target=%s" % \
+                ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \
                       (filename4ptx, obj_name, options.target)

        # compile the ispc code, make the executable, and run it...
@@ -287,7 +287,7 @@ def run_test(testname):
                    basename = os.path.basename(filename)
                    os.unlink("%s.pdb" % basename)
                    os.unlink("%s.ilk" % basename)
-            os.unlink(obj_name)
+#            os.unlink(obj_name)
        except:
            None

--- a/tests/broadcast.ispc
+++ b/tests/broadcast.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = (programCount == 1) ? 3 : broadcast(a, 2);
+    float b = (programCount == 1) ? 4 : broadcast(a, 2);
    RET[programIndex] = b;
 }

--- a/tests/c-test-64.ispc
+++ b/tests/c-test-64.ispc
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 2;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 5;
-    RET[3] = RET[7] = RET[11] = RET[15] = 6;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 2;
+    RET[i+1] = 3;
+    RET[i+2] = 5;
+    RET[i+3] = 6;
+  }
 }
--- a/tests/c-test-65.ispc
+++ b/tests/c-test-65.ispc
@@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) {

 export void result(uniform float RET[]) {
    RET[programIndex] = 3;
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    for (int i = 0; i < programCount; i += 4)
+    {
+      RET[i+0] = 1;
+      RET[i+3] = 29;
+    }
 }
--- a/tests/c-test-66.ispc
+++ b/tests/c-test-66.ispc
@@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {

 export void result(uniform float RET[]) {
    RET[programIndex] = 32;
-    RET[2] = RET[6] = RET[10] = RET[14] = 38;
-    RET[3] = RET[7] = RET[11] = RET[15] = 39;
+    for (int i = 0; i < programCount; i += 4)
+    {
+      RET[i+2] = 38;
+      RET[i+3] = 39;
+    }
 }
--- a/tests/cfor-array-struct-gather.ispc
+++ b/tests/cfor-array-struct-gather.ispc
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }


 struct Foo {
-    uniform float x[17];
+    uniform float x[programCount+1];
 };

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    uniform Foo foo;
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo.x[i] = i;

    if ((int)a & 1)
--- a/tests/cfor-gs-double-improve-multidim-1.ispc
+++ b/tests/cfor-gs-double-improve-multidim-1.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 
-    uniform double udx[25][25];
-    cfor (uniform int i = 0; i < 25; ++i)
-        cfor (uniform int j = 0; j < 25; ++j)
+    uniform double udx[programCount+1][programCount+1];
+    cfor (uniform int i = 0; i < programCount+1; ++i)
+        cfor (uniform int j = 0; j < programCount+1; ++j)
            udx[i][j] = 10*i+j;

    int x = 1;
--- a/tests/cfor-gs-improve-multidim-1.ispc
+++ b/tests/cfor-gs-improve-multidim-1.ispc
@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 
-    uniform float udx[20][20];
-    cfor (uniform int i = 0; i < 20; ++i)
-        cfor (uniform int j = 0; j < 20; ++j)
+    uniform float udx[programCount+1][programCount+1];
+    cfor (uniform int i = 0; i < programCount+1; ++i)
+        cfor (uniform int j = 0; j < programCount+1; ++j)
            udx[i][j] = 100*i+j;

    int x = 1;
--- a/tests/cfor-struct-gather-2.ispc
+++ b/tests/cfor-struct-gather-2.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo[i].f = i*a;
    RET[programIndex] = func(foo, (int)a);
 }
--- a/tests/cfor-struct-gather-3.ispc
+++ b/tests/cfor-struct-gather-3.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo[i].f = i*a;
    RET[programIndex] = func(foo, (int)a);
 }
--- a/tests/cfor-struct-gather.ispc
+++ b/tests/cfor-struct-gather.ispc
@@ -9,9 +9,9 @@ struct Foo {

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo[i].f = i*a;
    RET[programIndex] = foo[(int)a].f;
 }
--- a/tests/test-134.ispc
+++ b/tests/test-134.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/test-135.ispc
+++ b/tests/test-135.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/test-136.ispc
+++ b/tests/test-136.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/test-140.ispc
+++ b/tests/test-140.ispc
@@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }

 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0;
-    RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1;
-    RET[2] = RET[6] = RET[10] = RET[14] =  0x1.193ea8p+0;
-    RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 0x0.0p+0;
+    RET[i+1] = 0x1.62e43p-1;
+    RET[i+2] = 0x1.193ea8p+0;
+    RET[i+3] = 0x1.62e43p+0;
+  }
 }