diff --git a/Makefile b/Makefile
index abe7e1f7..b3af57bb 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,7 @@
 
 # If you have your own special version of llvm and/or clang, change
 # these variables to match.
-LLVM_CONFIG=$(shell which $(HOME)/usr/local/llvm/bin-trunk/bin/llvm-config)
+LLVM_CONFIG=$(shell which $(HOME)/usr/local/llvm/bin-3.4/bin/llvm-config)
 CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
 
 # Enable ARM by request
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index ff73e09d..76df46d9 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -653,9 +653,19 @@ define  i1 @__none(<1 x i1>) nounwind readnone alwaysinline {
 declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
 declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone
 
-define  float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
-  %r = extractelement <1 x float> %v, i32 0
-  ret float %r
+define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { ; sum-reduce the one-element vector through five shuffle-and-add steps
+  %value = extractelement <1 x float> %v, i32 0 ; the single element held by %v
+  %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) ; presumably a warp xor-shuffle with lane mask 16 (callee is not shown here; TODO confirm)
+  %call1 = fadd float %call, %value  ; running sum after the mask-16 step
+  %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) ; same call on the running sum, mask 8
+  %call1.1 = fadd float %call1, %call.1  ; running sum after the mask-8 step
+  %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) ; mask 4
+  %call1.2 = fadd float %call1.1, %call.2 ; running sum after the mask-4 step
+  %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) ; mask 2
+  %call1.3 = fadd float %call1.2, %call.3  ; running sum after the mask-2 step
+  %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) ; mask 1
+  %call1.4 = fadd float %call1.3, %call.4  ; final sum; masks 16..1 suggest a 32-lane butterfly reduction (TODO confirm)
+  ret float %call1.4 ; return the accumulated value
 }
 
 define  float @__reduce_min_float(<1 x float>) nounwind readnone {
diff --git a/ctx.cpp b/ctx.cpp
index 43012640..6fb7561d 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1410,7 +1410,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
 
 llvm::Value *
 FunctionEmitContext::ProgramIndexVector(bool is32bits) {
-  if (g->target->getISA() != Target::NVPTX)
+  if (1 || g->target->getISA() != Target::NVPTX)  // NOTE(review): the leading "1 ||" makes this condition always true, so this branch now runs for every target, NVPTX included; confirm whether this override is meant to be permanent
   {
     llvm::SmallVector<llvm::Constant*, 16> array;
     for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
diff --git a/examples_ptx/ptxcc/ptxcc b/examples_ptx/ptxcc/ptxcc
index 0f7e384e..7953f902 100755
--- a/examples_ptx/ptxcc/ptxcc
+++ b/examples_ptx/ptxcc/ptxcc
@@ -10,7 +10,7 @@ DEPTX=dePTX
 NVCC=nvcc
 
 $DEPTX < $PTXSRC > $PTXCU &&
-$NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \
+$NVCC -arch=sm_35  -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \
   sed 's/\#\$//g'| \
   awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH &&
 sh $PTXSH
diff --git a/func.cpp b/func.cpp
index f12f244b..981433d6 100644
--- a/func.cpp
+++ b/func.cpp
@@ -512,6 +512,7 @@ Function::GenerateIR() {
                 if (g->target->getISA() == Target::NVPTX)
                 {
                   functionName += std::string("___export");  /* add ___export to the end, for ptxcc to recognize it is exported */
+#if 0  // NOTE(review): compiles out the nvvm.annotations "kernel" metadata emission that follows; confirm this is intentional
                   llvm::NamedMDNode* annotations =
                     m->module->getOrInsertNamedMetadata("nvvm.annotations");
                   llvm::SmallVector<llvm::Value*, 3> av;
@@ -519,6 +520,7 @@ Function::GenerateIR() {
                   av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
                   av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
                   annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); 
+#endif
                 }
                 llvm::Function *appFunction =
                     llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
diff --git a/module.cpp b/module.cpp
index f8936625..93b0dcd6 100644
--- a/module.cpp
+++ b/module.cpp
@@ -427,15 +427,6 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
         return;
     }
 
-#if 0
-    if (g->target->getISA() == Target::NVPTX &&
-        type->IsVaryingType())
-    {
-        Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
-        return;
-    }
-#endif
-
     if (Type::Equal(type, AtomicType::Void)) {
         Error(pos, "\"void\" type global variable is illegal.");
         return;
@@ -453,6 +444,17 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
         return;
     }
 
+#if 1  // NOTE(review): always-true guard; looks like a leftover on/off toggle for the check below
+    if (g->target->getISA() == Target::NVPTX &&  // only applies when compiling for the NVPTX target
+        at != NULL &&  // "at" is declared outside this hunk; presumably the array-type view of "type" (TODO confirm)
+        type->IsVaryingType())  // and the declared type is varying
+    {
+        Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");  // NOTE(review): message is broader than the condition, which only fires when "at" is non-null
+        return;  // reject the declaration; nothing further is done for this global
+    }
+#endif  // #if 1
+
+
     llvm::Type *llvmType = type->LLVMType(g->ctx);
     if (llvmType == NULL)
         return;
@@ -2130,6 +2132,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
       opts.addMacroDef("cif=if");
       opts.addMacroDef("cfor=for");
       opts.addMacroDef("cwhile=while");
+      opts.addMacroDef("ccontinue=continue");
       opts.addMacroDef("cdo=do");
       opts.addMacroDef("taskIndex=blockIndex0()");
       opts.addMacroDef("taskCount=blockCount0()");
diff --git a/opt.cpp b/opt.cpp
index b0910b90..8702ca38 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -497,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) {
         // run absolutely no optimizations, since the front-end needs us to
         // take the various __pseudo_* functions it has emitted and turn
         // them into something that can actually execute.
-        optPM.add(CreateImproveMemoryOpsPass(), 100);
+        
+        if (g->opt.disableGatherScatterOptimizations == false &&  // honor the gather/scatter-optimization opt-out
+            g->target->getVectorWidth() > 1)  // and skip the pass when the target vector width is 1
+          optPM.add(CreateImproveMemoryOpsPass(), 100);  // previously registered unconditionally on this path
+
         if (g->opt.disableHandlePseudoMemoryOps == false)
             optPM.add(CreateReplacePseudoMemoryOpsPass());
 
diff --git a/run_tests.py b/run_tests.py
index f546463e..0da70e72 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -257,7 +257,7 @@ def run_test(testname):
                   cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \
                       (nvptxcc_exe_rel, obj_name, match, exe_name)
 
-            ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \
+            ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \
                        (filename, obj_name, options.arch, options.target)
             if options.no_opt:
                 ispc_cmd += " -O0" 
@@ -271,7 +271,7 @@ def run_test(testname):
                   print "Grepping: %s" % grep_cmd
                 sp = subprocess.Popen(grep_cmd, shell=True)
                 sp.communicate()
-                ispc_cmd = ispc_exe_rel + " --woff %s -o %s --emit-asm --target=%s" % \
+                ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \
                        (filename4ptx, obj_name, options.target)
 
         # compile the ispc code, make the executable, and run it...
@@ -287,7 +287,7 @@ def run_test(testname):
                     basename = os.path.basename(filename)
                     os.unlink("%s.pdb" % basename)
                     os.unlink("%s.ilk" % basename)
-            os.unlink(obj_name)
+#            os.unlink(obj_name)  # NOTE(review): cleanup disabled, so obj_name is no longer deleted here; restore unless the object file must be kept
         except:
             None
 
diff --git a/tests/broadcast.ispc b/tests/broadcast.ispc
index 1df835ae..6dfa1a00 100644
--- a/tests/broadcast.ispc
+++ b/tests/broadcast.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
-    float b = (programCount == 1) ? 3 : broadcast(a, 2);
+    float b = (programCount == 1) ? 4 : broadcast(a, 2);
     RET[programIndex] = b;
 }
 
diff --git a/tests/c-test-64.ispc b/tests/c-test-64.ispc
index 3429bf91..d2602bc7 100644
--- a/tests/c-test-64.ispc
+++ b/tests/c-test-64.ispc
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 2;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 5;
-    RET[3] = RET[7] = RET[11] = RET[15] = 6;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 2;
+    RET[i+1] = 3;
+    RET[i+2] = 5;
+    RET[i+3] = 6;
+  }
 }
diff --git a/tests/c-test-65.ispc b/tests/c-test-65.ispc
index 9a363864..15df6367 100644
--- a/tests/c-test-65.ispc
+++ b/tests/c-test-65.ispc
@@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) {
 
 export void result(uniform float RET[]) {
     RET[programIndex] = 3;
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    for (int i = 0; i < programCount; i += 4)
+    {
+      RET[i+0] = 1;
+      RET[i+3] = 29;
+    }
 }
diff --git a/tests/c-test-66.ispc b/tests/c-test-66.ispc
index a6c35dc7..22511604 100644
--- a/tests/c-test-66.ispc
+++ b/tests/c-test-66.ispc
@@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) {
     RET[programIndex] = 32;
-    RET[2] = RET[6] = RET[10] = RET[14] = 38;
-    RET[3] = RET[7] = RET[11] = RET[15] = 39;
+    for (int i = 0; i < programCount; i += 4)
+    {
+      RET[i+2] = 38;
+      RET[i+3] = 39;
+    }
 }
diff --git a/tests/cfor-array-struct-gather.ispc b/tests/cfor-array-struct-gather.ispc
index c320ad7c..d433b00d 100644
--- a/tests/cfor-array-struct-gather.ispc
+++ b/tests/cfor-array-struct-gather.ispc
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }
 
 
 struct Foo {
-    uniform float x[17];
+    uniform float x[programCount+1];
 };
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
     uniform Foo foo;
     uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
         foo.x[i] = i;
 
     if ((int)a & 1)
diff --git a/tests/cfor-gs-double-improve-multidim-1.ispc b/tests/cfor-gs-double-improve-multidim-1.ispc
index ed672bd8..62124e2a 100644
--- a/tests/cfor-gs-double-improve-multidim-1.ispc
+++ b/tests/cfor-gs-double-improve-multidim-1.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform double udx[25][25];
-    cfor (uniform int i = 0; i < 25; ++i)
-        cfor (uniform int j = 0; j < 25; ++j)
+    uniform double udx[programCount+1][programCount+1];
+    cfor (uniform int i = 0; i < programCount+1; ++i)
+        cfor (uniform int j = 0; j < programCount+1; ++j)
             udx[i][j] = 10*i+j;
 
     int x = 1;
diff --git a/tests/cfor-gs-improve-multidim-1.ispc b/tests/cfor-gs-improve-multidim-1.ispc
index b0893617..42732ebb 100644
--- a/tests/cfor-gs-improve-multidim-1.ispc
+++ b/tests/cfor-gs-improve-multidim-1.ispc
@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform float udx[20][20];
-    cfor (uniform int i = 0; i < 20; ++i)
-        cfor (uniform int j = 0; j < 20; ++j)
+    uniform float udx[programCount+1][programCount+1];  // extents follow programCount instead of the fixed 20
+    cfor (uniform int i = 0; i < programCount+1; ++i)
+        cfor (uniform int j = 0; j < programCount+1; ++j)
             udx[i][j] = 100*i+j;
 
     int x = 1;
diff --git a/tests/cfor-struct-gather-2.ispc b/tests/cfor-struct-gather-2.ispc
index 7c615139..75da4a3f 100644
--- a/tests/cfor-struct-gather-2.ispc
+++ b/tests/cfor-struct-gather-2.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
     uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
         foo[i].f = i*a;
     RET[programIndex] = func(foo, (int)a);
 }
diff --git a/tests/cfor-struct-gather-3.ispc b/tests/cfor-struct-gather-3.ispc
index 7c615139..75da4a3f 100644
--- a/tests/cfor-struct-gather-3.ispc
+++ b/tests/cfor-struct-gather-3.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
     uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
         foo[i].f = i*a;
     RET[programIndex] = func(foo, (int)a);
 }
diff --git a/tests/cfor-struct-gather.ispc b/tests/cfor-struct-gather.ispc
index 49928a6b..9265da32 100644
--- a/tests/cfor-struct-gather.ispc
+++ b/tests/cfor-struct-gather.ispc
@@ -9,9 +9,9 @@ struct Foo {
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
     uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
         foo[i].f = i*a;
     RET[programIndex] = foo[(int)a].f;
 }
diff --git a/tests/test-134.ispc b/tests/test-134.ispc
index baa8ec37..9d4d0e94 100644
--- a/tests/test-134.ispc
+++ b/tests/test-134.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
diff --git a/tests/test-135.ispc b/tests/test-135.ispc
index c350a524..bb9881e6 100644
--- a/tests/test-135.ispc
+++ b/tests/test-135.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
diff --git a/tests/test-136.ispc b/tests/test-136.ispc
index ab6c6b5b..098ac456 100644
--- a/tests/test-136.ispc
+++ b/tests/test-136.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
diff --git a/tests/test-140.ispc b/tests/test-140.ispc
index a983d528..997d558e 100644
--- a/tests/test-140.ispc
+++ b/tests/test-140.ispc
@@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0;
-    RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1;
-    RET[2] = RET[6] = RET[10] = RET[14] =  0x1.193ea8p+0;
-    RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 0x0.0p+0;
+    RET[i+1] = 0x1.62e43p-1;
+    RET[i+2] = 0x1.193ea8p+0;
+    RET[i+3] = 0x1.62e43p+0;
+  }
 }