diff --git a/Makefile b/Makefile index 69b8423c..abe7e1f7 100644 --- a/Makefile +++ b/Makefile @@ -144,7 +144,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=nvptx64 avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=nvptx avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) @@ -254,15 +254,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< $(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) +objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ -objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ -objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ diff --git a/builtins.cpp b/builtins.cpp index 40f7006b..a7820c4c 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -693,9 +693,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, if (g->target->getISA() != Target::NEON32 && g->target->getISA() != Target::NEON16 && g->target->getISA() != Target::NEON8 && - g->target->getISA() != Target::NVPTX64) + g->target->getISA() != Target::NVPTX) #else - if (g->target->getISA() != Target::NVPTX64) + if (g->target->getISA() != Target::NVPTX) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -858,14 +858,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { - case Target::NVPTX64: + case Target::NVPTX: { if (runtime32) { - fprintf(stderr, "W're sorry, but only 64bit targets are supported at this moment .. \n"); + fprintf(stderr, "Unfortunately, 32bit targets are not supported at the moment ..
\n"); assert(0); } else { - EXPORT_MODULE(builtins_bitcode_nvptx64_64bit); + EXPORT_MODULE(builtins_bitcode_nvptx_64bit); } break; }; @@ -1138,7 +1138,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } // define the 'programCount' builtin variable - if (!g->target->isPTX()) + if (g->target->getISA() != Target::NVPTX) { lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); } diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx.ll similarity index 99% rename from builtins/target-nvptx64.ll rename to builtins/target-nvptx.ll index d43e5d4a..db217e9a 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx.ll @@ -105,15 +105,9 @@ define i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; tasking -define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline -{ - %ptr = inttoptr i64 1 to i8* - ret i8* %ptr -} - ;; this call allocate parameter buffer for kernel launch declare i64 @cudaGetParameterBuffer(i64, i64) nounwind -define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) nounwind alwaysinline +define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline { entry: %call = tail call i32 @__tid_x() @@ -121,6 +115,7 @@ entry: %sub = add nsw i32 %call1, -1 %and = and i32 %sub, %call %cmp = icmp eq i32 %and, 0 + %align = zext i32 %align32 to i64 br i1 %cmp, label %if.then, label %if.end if.then: @@ -224,7 +219,7 @@ define void @ISPCSync(i8*) nounwind alwaysinline -include(`util_ptx.m4') +include(`util-nvptx.m4') stdlib_core() packed_load_and_store() diff --git a/builtins/util_ptx.m4 b/builtins/util-nvptx.m4 similarity index 100% rename from builtins/util_ptx.m4 rename to builtins/util-nvptx.m4 diff --git a/ctx.cpp b/ctx.cpp index 74a760ae..b5ca392c 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1410,7 +1410,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { llvm::Value * FunctionEmitContext::ProgramIndexVector(bool is32bits) { - if (!g->target->isPTX()) //g->target->getISA() != Target::NVPTX64) + if (g->target->getISA() != Target::NVPTX) { llvm::SmallVector array; for (int i = 0; i < g->target->getVectorWidth() ; ++i) { @@ -3540,7 +3540,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, std::vector &argVals, llvm::Value *launchCount[3]){ - if (!g->target->isPTX()) + if (g->target->getISA() != Target::NVPTX) { if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); @@ -3608,7 +3608,79 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, args.push_back(launchCount[2]); return CallInst(flaunch, NULL, args, ""); } - else /* isPTX == true */ + else /* NVPTX */ + { + if (callee == NULL) { + AssertPos(currentPos, m->errorCount > 0); + return NULL; + } + launchedTasks = true; + + AssertPos(currentPos, llvm::isa(callee)); + std::vector argTypes; + for (unsigned int i = 0; i < argVals.size(); i++) + argTypes.push_back(argVals[i]->getType()); + llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes); + llvm::StructType *argStructType = static_cast(st); + llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); + if (structSize->getType() != LLVMTypes::Int64Type) + structSize = ZExtInst(structSize, LLVMTypes::Int64Type, + "struct_size_to_64"); + + const int align = 8; + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); + AssertPos(currentPos, falloc != NULL); + std::vector allocArgs; + allocArgs.push_back(launchGroupHandlePtr); + allocArgs.push_back(structSize); + 
allocArgs.push_back(LLVMInt32(align)); + llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); + llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64"); + llvm::BasicBlock* if_true = CreateBasicBlock("if_true"); + llvm::BasicBlock* if_false = CreateBasicBlock("if_false"); + + /* check that the pointer returned by ISPCAlloc is not NULL + * -------------- + * this is a workaround for not checking the value of programIndex + * because ISPCAlloc will return a NULL pointer for all programIndex > 0 + * of course, if ISPCAlloc fails to get a parameter buffer, the pointer for programIndex = 0 + * will also be NULL + * This check is required; the code should also be rewritten to make it less opaque + */ + llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1"); + BranchInst(if_true, if_false, cmp1); + + /**********************/ + bblock = if_true; + + // label_if_then block: + llvm::Type *pt = llvm::PointerType::getUnqual(st); + llvm::Value *argmem = BitCastInst(voidmem, pt); + for (unsigned int i = 0; i < argVals.size(); ++i) + { + llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); + // don't need to do masked store here, I think + StoreInst(argVals[i], ptr); + } + BranchInst(if_false); + + /**********************/ + bblock = if_false; + + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + AssertPos(currentPos, flaunch != NULL); + std::vector args; + args.push_back(launchGroupHandlePtr); + args.push_back(fptr); + args.push_back(voidmem); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); + llvm::Value *ret = CallInst(flaunch, NULL, args, ""); + return ret; + } +#if 0 { if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); @@ -3684,13 +3756,16 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, args.push_back(launchCount[2]); return CallInst(flaunch, NULL, args, ""); } +#endif } void FunctionEmitContext::SyncInst() { - if (!g->target->isPTX()) +#if 0 + if (g->target->getISA() != Target::NVPTX) { +#endif llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); @@ -3714,6 +3789,7 @@ FunctionEmitContext::SyncInst() { BranchInst(bPostSync); SetCurrentBasicBlock(bPostSync); +#if 0 } else { @@ -3726,6 +3802,7 @@ FunctionEmitContext::SyncInst() { CallInst(fsync, NULL, launchGroupHandle, ""); StoreInst(nullPtrValue, launchGroupHandlePtr); } +#endif } diff --git a/decl.cpp b/decl.cpp index c0857474..7c248f82 100644 --- a/decl.cpp +++ b/decl.cpp @@ -531,7 +531,7 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { returnType = returnType->ResolveUnboundVariability(Variability::Varying); bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0); - if (isTask && g->target->isPTX()) //getISA() == Target::NVPTX64) + if (isTask && g->target->getISA() == Target::NVPTX) { // ds->storageClass = SC_EXTERN_C; ds->typeQualifiers |= TYPEQUAL_UNMASKED; @@ -547,12 +547,11 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { "qualifiers"); return; } -// if (!g->target->isPTX()) - if (isExternC && isTask) { - Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " - "qualifiers"); - return; - } + if (isExternC && isTask) { + Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " + "qualifiers"); + return; + } if (isExternC && isExported) { Error(pos,
"Function can't have both \"extern \"C\"\" and \"export\" " "qualifiers"); diff --git a/examples_ptx/common_gpu.mk b/examples_ptx/common_gpu.mk index c4628559..e02e5b95 100644 --- a/examples_ptx/common_gpu.mk +++ b/examples_ptx/common_gpu.mk @@ -22,7 +22,7 @@ endif # ISPC=ispc -ISPC_FLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math +ISPC_FLAGS=-O3 --math-lib=default --target=nvptx --opt=fast-math # # # diff --git a/func.cpp b/func.cpp index 0782d724..165c17ba 100644 --- a/func.cpp +++ b/func.cpp @@ -125,7 +125,7 @@ Function::Function(Symbol *s, Stmt *c) { sym->parentFunction = this; } - if (type->isTask) { + if (type->isTask && g->target->getISA() != Target::NVPTX) { threadIndexSym = m->symbolTable->LookupVariable("threadIndex"); Assert(threadIndexSym); threadCountSym = m->symbolTable->LookupVariable("threadCount"); @@ -237,12 +237,122 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, #endif const FunctionType *type = CastType(sym->type); Assert(type != NULL); + if (type->isTask == true && g->target->getISA() != Target::NVPTX) { + // For tasks, we there should always be three parmeters: the + // pointer to the structure that holds all of the arguments, the + // thread index, and the thread count variables. + llvm::Function::arg_iterator argIter = function->arg_begin(); + llvm::Value *structParamPtr = argIter++; + llvm::Value *threadIndex = argIter++; + llvm::Value *threadCount = argIter++; + llvm::Value *taskIndex = argIter++; + llvm::Value *taskCount = argIter++; + llvm::Value *taskIndex0 = argIter++; + llvm::Value *taskIndex1 = argIter++; + llvm::Value *taskIndex2 = argIter++; + llvm::Value *taskCount0 = argIter++; + llvm::Value *taskCount1 = argIter++; + llvm::Value *taskCount2 = argIter++; + + // Copy the function parameter values from the structure into local + // storage + for (unsigned int i = 0; i < args.size(); ++i) + lCopyInTaskParameter(i, structParamPtr, args, ctx); + + if (type->isUnmasked == false) { + // Copy in the mask as well. + int nArgs = (int)args.size(); + // The mask is the last parameter in the argument structure + llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL, + "task_struct_mask"); + llvm::Value *ptrval = ctx->LoadInst(ptr, "mask"); + ctx->SetFunctionMask(ptrval); + } + + // Copy threadIndex and threadCount into stack-allocated storage so + // that their symbols point to something reasonable. + threadIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadIndex"); + ctx->StoreInst(threadIndex, threadIndexSym->storagePtr); + + threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount"); + ctx->StoreInst(threadCount, threadCountSym->storagePtr); + + // Copy taskIndex and taskCount into stack-allocated storage so + // that their symbols point to something reasonable. 
+ taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex"); + ctx->StoreInst(taskIndex, taskIndexSym->storagePtr); + + taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount"); + ctx->StoreInst(taskCount, taskCountSym->storagePtr); + + taskIndexSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex0"); + ctx->StoreInst(taskIndex0, taskIndexSym0->storagePtr); + taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1"); + ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr); + taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2"); + ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr); + + taskCountSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount0"); + ctx->StoreInst(taskCount0, taskCountSym0->storagePtr); + taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1"); + ctx->StoreInst(taskCount1, taskCountSym1->storagePtr); + taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2"); + ctx->StoreInst(taskCount2, taskCountSym2->storagePtr); + } + else { + // Regular, non-task function + llvm::Function::arg_iterator argIter = function->arg_begin(); + for (unsigned int i = 0; i < args.size(); ++i, ++argIter) { + Symbol *sym = args[i]; + if (sym == NULL) + // anonymous function parameter + continue; + + argIter->setName(sym->name.c_str()); + + // Allocate stack storage for the parameter and emit code + // to store its value there. + sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str()); + ctx->StoreInst(argIter, sym->storagePtr); + ctx->EmitFunctionParameterDebugInfo(sym, i); + } + + // If the number of actual function arguments is equal to the + // number of declared arguments in decl->functionParams, then we + // don't have a mask parameter, so set it to be all on. This + // happens for example with 'export'ed functions that the app + // calls. + if (argIter == function->arg_end()) { + Assert(type->isUnmasked || type->isExported); + ctx->SetFunctionMask(LLVMMaskAllOn); + } + else { + Assert(type->isUnmasked == false); + + // Otherwise use the mask to set the entry mask value + argIter->setName("__mask"); + Assert(argIter->getType() == LLVMTypes::MaskType); + ctx->SetFunctionMask(argIter); + Assert(++argIter == function->arg_end()); + } + if (type->isTask == true && g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(LLVMInt32(1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } + } +#if 0 if (type->isTask == true) { // For tasks, we there should always be three parmeters: the // pointer to the structure that holds all of the arguments, the // thread index, and the thread count variables.
- if (!g->target->isPTX()) //if (g->target->getISA() != Target::NVPTX64) + if (g->target->getISA() != Target::NVPTX) { llvm::Function::arg_iterator argIter = function->arg_begin(); llvm::Value *structParamPtr = argIter++; @@ -341,7 +451,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, Assert(type->isUnmasked || type->isExported); ctx->SetFunctionMask(LLVMMaskAllOn); } - else /* for NVPTX64 , function must be unmasked */ + else /* for NVPTX, function must be unmasked */ { assert(0); Assert(type->isUnmasked == false); @@ -353,7 +463,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, Assert(++argIter == function->arg_end()); } - if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) { llvm::NamedMDNode* annotations = m->module->getOrInsertNamedMetadata("nvvm.annotations"); @@ -402,6 +512,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, Assert(++argIter == function->arg_end()); } } +#endif // Finally, we can generate code for the function if (code != NULL) { @@ -535,26 +646,12 @@ Function::GenerateIR() { } // And we can now go ahead and emit the code - /* export function with NVPTX64 target should be emitted host architecture */ -#if 0 - const FunctionType *func_type= CastType(sym->type); - if (g->target->getISA() == Target::NVPTX64 && func_type->isExported) - return; -#endif - -#if 0 - if (g->target->getISA() != Target::NVPTX64 && g->target->isPTX() && func_type->isTask) - return; -#endif - -// if (!(g->target->getISA()==Target::NVPTX64 && func_type->isExported)) { FunctionEmitContext ec(this, sym, function, firstStmtPos); emitCode(&ec, function, firstStmtPos); } if (m->errorCount == 0) { -// if (!(g->target->getISA() == Target::NVPTX64 && func_type->isExported)) if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) { if (g->debugPrint) function->dump(); @@ -566,18 +663,18 @@ Function::GenerateIR() { // the application can call it const FunctionType *type = CastType(sym->type); Assert(type != NULL); - if (type->isExported) { // && g->target->getISA() != Target::VPTX64) { + if (type->isExported) { if (!type->isTask) { - if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64) - { - llvm::NamedMDNode* annotations = - m->module->getOrInsertNamedMetadata("nvvm.annotations"); - llvm::SmallVector av; - av.push_back(function); - av.push_back(llvm::MDString::get(*g->ctx, "kernel")); - av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); - annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); - } + if (g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true); llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage; std::string functionName = sym->name; @@ -585,7 +682,7 @@ Function::GenerateIR() { if (g->mangleFunctionsWithTarget) functionName += std::string("_") + g->target->GetISAString(); - if (g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) functionName += std::string("___export"); llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), 
m->module); @@ -615,7 +712,7 @@ Function::GenerateIR() { FATAL("Function verificication failed"); } } - if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) { llvm::NamedMDNode* annotations = m->module->getOrInsertNamedMetadata("nvvm.annotations"); diff --git a/ispc.cpp b/ispc.cpp index 97735308..223e7317 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -174,7 +174,7 @@ static const char *supportedCPUs[] = { #endif // LLVM 3.4+ }; -Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, bool isPTX) : +Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_target(NULL), m_targetMachine(NULL), #if defined(LLVM_3_1) @@ -184,7 +184,6 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo #endif m_valid(false), m_isa(SSE2), - m_isPTX(isPTX), m_arch(""), m_is32Bit(true), m_cpu(""), @@ -212,7 +211,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo if (!strcmp(cpu, "core-avx2")) isa = "avx2-i32x8"; else if (!strcmp(cpu, "sm_35")) - isa = "nvptx64"; + isa = "nvptx"; #ifdef ISPC_ARM_ENABLED else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) @@ -249,7 +248,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo cpu = "cortex-a9"; #endif - if (cpu == NULL && !strcmp(isa, "nvptx64")) + if (cpu == NULL && !strcmp(isa, "nvptx")) cpu = "sm_35"; if (cpu == NULL) { @@ -280,8 +279,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo this->m_cpu = cpu; if (arch == NULL) { - if (!strcmp(isa, "nvptx64")) - arch = "nvptx64"; + if (!strcmp(isa, "nvptx")) + arch = "nvptx"; #ifdef ISPC_ARM_ENABLED else if (!strncmp(isa, "neon", 4)) arch = "arm"; @@ -709,10 +708,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo this->m_maskBitCount = 32; } #endif - else if (!strcasecmp(isa, "nvptx64")) + else if (!strcasecmp(isa, "nvptx")) { - this->m_isa = Target::NVPTX64; - this->m_isPTX = true; + this->m_isa = Target::NVPTX; this->m_nativeVectorWidth = 32; this->m_nativeVectorAlignment = 32; this->m_vectorWidth = 1; @@ -780,7 +778,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-" "i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-" "f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128"; - } else if (m_isa == Target::NVPTX64) + } else if (m_isa == Target::NVPTX) { dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; } @@ -803,7 +801,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo // Initialize target-specific "target-feature" attribute. 
if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; - if (m_isa != Target::NVPTX64) + if (m_isa != Target::NVPTX) attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( @@ -838,7 +836,7 @@ Target::SupportedCPUs() { const char * Target::SupportedArchs() { - return "nvptx64, " + return "nvptx, " #ifdef ISPC_ARM_ENABLED "arm, " #endif @@ -848,7 +846,7 @@ Target::SupportedArchs() { const char * Target::SupportedTargets() { - return "nvptx64, " + return "nvptx, " #ifdef ISPC_ARM_ENABLED "neon-i8x16, neon-i16x8, neon-i32x4, " #endif @@ -866,9 +864,9 @@ Target::SupportedTargets() { std::string Target::GetTripleString() const { llvm::Triple triple; - if (m_arch == "nvptx64") + if (m_arch == "nvptx") { - triple.setTriple("nvptx64"); + triple.setTriple("nvptx"); } #ifdef ISPC_ARM_ENABLED else if (m_arch == "arm") { @@ -902,8 +900,8 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { - case Target::NVPTX64: - return "nvptx64"; + case Target::NVPTX: + return "nvptx"; #ifdef ISPC_ARM_ENABLED case Target::NEON8: return "neon-8"; diff --git a/ispc.h b/ispc.h index d649b6cd..ebef4bb0 100644 --- a/ispc.h +++ b/ispc.h @@ -179,7 +179,7 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NVPTX64, + enum ISA { NVPTX, #ifdef ISPC_ARM_ENABLED NEON32, NEON16, NEON8, #endif @@ -189,7 +189,7 @@ public: /** Initializes the given Target pointer for a target of the given name, if the name is a known target. Returns true if the target was initialized and false if the name is unknown. */ - Target(const char *arch, const char *cpu, const char *isa, bool pic, bool isPTX = false); + Target(const char *arch, const char *cpu, const char *isa, bool pic); /** Returns a comma-delimited string giving the names of the currently supported compilation targets. */ @@ -251,7 +251,6 @@ public: bool isValid() const {return m_valid;} ISA getISA() const {return m_isa;} - bool isPTX() const {return m_isPTX;} std::string getArch() const {return m_arch;} @@ -310,7 +309,6 @@ private: /** Instruction set being compiled to. */ ISA m_isa; - bool m_isPTX; /** Target system architecture. (e.g. "x86-64", "x86"). */ std::string m_arch; diff --git a/module.cpp b/module.cpp index 4ca1b351..a745db29 100644 --- a/module.cpp +++ b/module.cpp @@ -733,7 +733,7 @@ Module::AddFunctionDeclaration(const std::string &name, if (storageClass == SC_EXTERN_C) { // Make sure the user hasn't supplied both an 'extern "C"' and a // 'task' qualifier with the function - if (functionType->isTask) //&& !g->target->isPTX()) //tISA() != Target::NVPTX64) + if (functionType->isTask) { Error(pos, "\"task\" qualifier is illegal with C-linkage extern " "function \"%s\". Ignoring this function.", name.c_str()); @@ -796,8 +796,8 @@ Module::AddFunctionDeclaration(const std::string &name, #else // LLVM 3.1 and 3.3+ function->addFnAttr(llvm::Attribute::AlwaysInline); #endif - /* evghenii: on PTX target this must not be used, cause crash, dunno why */ - if (functionType->isTask && g->target->getISA() != Target::NVPTX64) + /* evghenii: on PTX target the following must not be set ... why ?!? */ + if (functionType->isTask && g->target->getISA() != Target::NVPTX) // This also applies transitively to members I think? 
#if defined(LLVM_3_1) function->setDoesNotAlias(1, true); @@ -953,7 +953,7 @@ Module::writeOutput(OutputType outputType, const char *outFileName, const char *fileType = NULL; switch (outputType) { case Asm: - if (g->target->getISA() != Target::NVPTX64) + if (g->target->getISA() != Target::NVPTX) { if (strcasecmp(suffix, "s")) fileType = "assembly"; @@ -1053,7 +1053,7 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) { } llvm::raw_fd_ostream fos(fd, (fd != 1), false); - if (g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) { const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; module->setDataLayout(dl_string); @@ -1925,7 +1925,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre opts.addMacroDef(g->cppArgs[i].substr(2)); } } - if (g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) { opts.addMacroDef("__NVPTX__"); opts.addMacroDef("programIndex=laneIndex()"); @@ -2331,135 +2331,9 @@ Module::CompileAndOutput(const char *srcFile, const char *hostStubFileName, const char *devStubFileName) { - char ptxname[] = "nvptx64"; - for (int k = 0; k < 7; k++) - ptxname[k] = target[k]; - if (0) //target != NULL && strcmp(ptxname,"nvptx64") == 0) // NVPTX64 - { - std::vector targets = lExtractTargets(target); - Assert(targets.size() > 1); - // We're only compiling to a single target - int errorCount = 0; - - const char *suffix_orig = strrchr(outFileName, '.'); - ++suffix_orig; - assert(suffix_orig!=NULL); - - g->PtxString = std::string(); - - for (int itarget = 0; itarget < 1; itarget++) - { - fprintf(stderr, "compiling nvptx64 : target= %s\n",targets[itarget].c_str()); - g->target = new Target(arch, cpu, targets[itarget].c_str(), generatePIC, /* isPTX= */ true); - if (!g->target->isValid()) - return 1; - - m = new Module(srcFile); - if (m->CompileFile() == 0) { - if (outputType == CXX) { - if (target == NULL || strncmp(target, "generic-", 8) != 0) { - Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" " - "targets must be used."); - return 1; - } - } - else if (outputType == Asm || outputType == Object) { - if (target != NULL && strncmp(target, "generic-", 8) == 0) { - Error(SourcePos(), "When using a \"generic-*\" compilation target, " - "%s output can not be used.", - (outputType == Asm) ? 
"assembly" : "object file"); - return 1; - } - } - - assert(outFileName != NULL); - - std::string targetOutFileName = - lGetTargetFileName(outFileName, targets[itarget].c_str()); - if (outputType == Asm) - { - const char * targetOutFileName_c = targetOutFileName.c_str(); - const int suffix = strrchr(targetOutFileName_c, '.') - targetOutFileName_c + 1; - if (itarget == 1 && !strcasecmp(suffix_orig, "ptx")) - { - targetOutFileName[suffix ] = 's'; - targetOutFileName[suffix+1] = 0; - } - } - - if (outputType != Object) - { - if (!m->writeOutput(outputType, targetOutFileName.c_str(), includeFileName)) - return 1; - } - else if (itarget > 0) - { - if (!m->writeOutput(outputType, outFileName, includeFileName)) - return 1; - } - - if (itarget == 0) - { /* store ptx into memory */ - llvm::PassManager pm; -#if defined(LLVM_3_1) - pm.add(new llvm::TargetData(*g->target->getDataLayout())); -#else - pm.add(new llvm::DataLayout(*g->target->getDataLayout())); -#endif - - llvm::raw_string_ostream rso(g->PtxString); - llvm::formatted_raw_ostream fos(rso); - - llvm::TargetMachine::CodeGenFileType fileType = llvm::TargetMachine::CGFT_AssemblyFile; - llvm::TargetMachine *targetMachine = g->target->GetTargetMachine(); - if (targetMachine->addPassesToEmitFile(pm, fos, fileType)) { - fprintf(stderr, "Fatal error adding passes to emit object file!"); - exit(1); - } - - llvm::Module *module = m->module; - pm.run(*module); - fos.flush(); - assert(!g->PtxString.empty()); -#if 0 - std::cout << g->PtxString << std::endl; -#endif - } - - - if (itarget > 0) - { - if (headerFileName != NULL) - if (!m->writeOutput(Module::Header, headerFileName)) - return 1; - if (depsFileName != NULL) - if (!m->writeOutput(Module::Deps,depsFileName)) - return 1; - if (hostStubFileName != NULL) - if (!m->writeOutput(Module::HostStub,hostStubFileName)) - return 1; - if (devStubFileName != NULL) - if (!m->writeOutput(Module::DevStub,devStubFileName)) - return 1; - } - } - else - ++m->errorCount; - - errorCount += m->errorCount; - delete m; - m = NULL; - - delete g->target; - g->target = NULL; - - } - return errorCount > 0; - } - else if (target == NULL || strchr(target, ',') == NULL) { + if (target == NULL || strchr(target, ',') == NULL) { // We're only compiling to a single target - const bool isPTX = strcmp(target, "nvptx64") == 0; - g->target = new Target(arch, cpu, target, generatePIC, isPTX); + g->target = new Target(arch, cpu, target, generatePIC); if (!g->target->isValid()) return 1; @@ -2525,8 +2399,6 @@ Module::CompileAndOutput(const char *srcFile, // The user supplied multiple targets std::vector targets = lExtractTargets(target); Assert(targets.size() > 1); - for (unsigned int i = 0; i < targets.size(); ++i) - assert(strcmp(targets[i].c_str(), "nvptx64") < 0); if (outFileName != NULL && strcmp(outFileName, "-") == 0) { Error(SourcePos(), "Multi-target compilation can't generate output " diff --git a/stmt.cpp b/stmt.cpp index 67d0d96a..05d84a93 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -206,7 +206,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { } if (sym->storageClass == SC_STATIC) { - if (g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) if (!sym->type->IsConstType()) Error(initExpr->pos, "Non-constant static variable ""\"%s\" is not supported with ""\"cuda\" target.", sym->name.c_str()); @@ -1280,7 +1280,7 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, llvm::Value *varyingCounterPtr, const std::vector &spans) { - if (!g->target->isPTX()) + if (g->target->getISA() 
!= Target::NVPTX) { // Smear the uniform counter value out to be varying llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); @@ -1315,7 +1315,7 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, ctx->StoreInst(varyingCounter, varyingCounterPtr); return varyingCounter; } - else /* isPTX() == true */ + else /* NVPTX == true */ { // Smear the uniform counter value out to be varying llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); @@ -1465,921 +1465,458 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { if (ctx->GetCurrentBasicBlock() == NULL || stmts == NULL) return; - if (!g->target->isPTX()) - { - llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body"); - llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body"); - llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit"); + llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body"); + llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body"); + llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit"); - llvm::Value *oldMask = ctx->GetInternalMask(); - llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); + llvm::Value *oldMask = ctx->GetInternalMask(); + llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); - ctx->SetDebugPos(pos); - ctx->StartScope(); + ctx->SetDebugPos(pos); + ctx->StartScope(); - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetFunctionMask(LLVMMaskAllOn); + ctx->SetInternalMask(LLVMMaskAllOn); + ctx->SetFunctionMask(LLVMMaskAllOn); - // This should be caught during typechecking - AssertPos(pos, startExprs.size() == dimVariables.size() && - endExprs.size() == dimVariables.size()); - int nDims = (int)dimVariables.size(); + // This should be caught during typechecking + AssertPos(pos, startExprs.size() == dimVariables.size() && + endExprs.size() == dimVariables.size()); + int nDims = (int)dimVariables.size(); + + /////////////////////////////////////////////////////////////////////// + // Setup: compute the number of items we have to work on in each + // dimension and a number of derived values. + std::vector bbReset, bbStep, bbTest; + std::vector startVals, endVals, uniformCounterPtrs; + std::vector nExtras, alignedEnd, extrasMaskPtrs; + + std::vector span(nDims, 0); + const int vectorWidth = + g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth(); + lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); + + for (int i = 0; i < nDims; ++i) { + // Basic blocks that we'll fill in later with the looping logic for + // this dimension. + bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); + if (i < nDims-1) + // stepping for the innermost dimension is handled specially + bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); + bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); + + // Start and end value for this loop dimension + llvm::Value *sv = startExprs[i]->GetValue(ctx); + llvm::Value *ev = endExprs[i]->GetValue(ctx); + if (sv == NULL || ev == NULL) + return; + startVals.push_back(sv); + endVals.push_back(ev); + + // nItems = endVal - startVal + llvm::Value *nItems = + ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); + + // nExtras = nItems % (span for this dimension) + // This gives us the number of extra elements we need to deal with + // at the end of the loop for this dimension that don't fit cleanly + // into a vector width. 
+ nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, + LLVMInt32(span[i]), "nextras")); + + // alignedEnd = endVal - nExtras + alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, + nExtras[i], "aligned_end")); /////////////////////////////////////////////////////////////////////// - // Setup: compute the number of items we have to work on in each - // dimension and a number of derived values. - std::vector bbReset, bbStep, bbTest; - std::vector startVals, endVals, uniformCounterPtrs; - std::vector nExtras, alignedEnd, extrasMaskPtrs; + // Each dimension has a loop counter that is a uniform value that + // goes from startVal to endVal, in steps of the span for this + // dimension. Its value is only used internally here for looping + // logic and isn't directly available in the user's program code. + uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, + "counter")); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - std::vector span(nDims, 0); - lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]); + // There is also a varying variable that holds the set of index + // values for each dimension in the current loop iteration; this is + // the value that is program-visible. + dimVariables[i]->storagePtr = + ctx->AllocaInst(LLVMTypes::Int32VectorType, + dimVariables[i]->name.c_str()); + dimVariables[i]->parentFunction = ctx->GetFunction(); + ctx->EmitVariableDebugInfo(dimVariables[i]); - for (int i = 0; i < nDims; ++i) { - // Basic blocks that we'll fill in later with the looping logic for - // this dimension. - bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); - if (i < nDims-1) - // stepping for the innermost dimension is handled specially - bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); - bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); - - // Start and end value for this loop dimension - llvm::Value *sv = startExprs[i]->GetValue(ctx); - llvm::Value *ev = endExprs[i]->GetValue(ctx); - if (sv == NULL || ev == NULL) - return; - startVals.push_back(sv); - endVals.push_back(ev); - - // nItems = endVal - startVal - llvm::Value *nItems = - ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); - - // nExtras = nItems % (span for this dimension) - // This gives us the number of extra elements we need to deal with - // at the end of the loop for this dimension that don't fit cleanly - // into a vector width. - nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, - LLVMInt32(span[i]), "nextras")); - - // alignedEnd = endVal - nExtras - alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, - nExtras[i], "aligned_end")); - - /////////////////////////////////////////////////////////////////////// - // Each dimension has a loop counter that is a uniform value that - // goes from startVal to endVal, in steps of the span for this - // dimension. Its value is only used internally here for looping - // logic and isn't directly available in the user's program code. - uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, - "counter")); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - - // There is also a varying variable that holds the set of index - // values for each dimension in the current loop iteration; this is - // the value that is program-visible. 
- dimVariables[i]->storagePtr = - ctx->AllocaInst(LLVMTypes::Int32VectorType, - dimVariables[i]->name.c_str()); - dimVariables[i]->parentFunction = ctx->GetFunction(); - ctx->EmitVariableDebugInfo(dimVariables[i]); - - // Each dimension also maintains a mask that represents which of - // the varying elements in the current iteration should be - // processed. (i.e. this is used to disable the lanes that have - // out-of-bounds offsets.) - extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - } - - ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); - - // On to the outermost loop's test - ctx->BranchInst(bbTest[0]); - - /////////////////////////////////////////////////////////////////////////// - // foreach_reset: this code runs when we need to reset the counter for - // a given dimension in preparation for running through its loop again, - // after the enclosing level advances its counter. - for (int i = 0; i < nDims; ++i) { - ctx->SetCurrentBasicBlock(bbReset[i]); - if (i == 0) - ctx->BranchInst(bbExit); - else { - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - ctx->BranchInst(bbStep[i-1]); - } - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_step: increment the uniform counter by the vector width. - // Note that we don't increment the varying counter here as well but - // just generate its value when we need it in the loop body. Don't do - // this for the innermost dimension, which has a more complex stepping - // structure.. - for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbStep[i]); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[i]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[i]); - ctx->BranchInst(bbTest[i]); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for all dimensions other than the innermost...) - std::vector inExtras; - for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbTest[i]); - - llvm::Value *haveExtras = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, - endVals[i], alignedEnd[i], "have_extras"); - - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); - llvm::Value *atAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - counter, alignedEnd[i], "at_aligned_end"); - llvm::Value *inEx = - ctx->BinaryOperator(llvm::Instruction::And, haveExtras, - atAlignedEnd, "in_extras"); - - if (i == 0) - inExtras.push_back(inEx); - else - inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, - inExtras[i-1], "in_extras_all")); - - llvm::Value *varyingCounter = - lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], - dimVariables[i]->storagePtr, span); - - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[i], LLVMTypes::Int32VectorType, "smear_end"); - - // Do a vector compare of its value to the end value to generate a - // mask for this last bit of work. 
- llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - - if (i == 0) - ctx->StoreInst(emask, extrasMaskPtrs[i]); - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->StoreInst(newMask, extrasMaskPtrs[i]); - } - - llvm::Value *notAtEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[i]); - ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for innermost dimension) - // - // All of the outer dimensions are handled generically--basically as a - // for() loop from the start value to the end value, where at each loop - // test, we compute the mask of active elements for the current - // dimension and then update an overall mask that is the AND - // combination of all of the outer ones. - // - // The innermost loop is handled specially, for performance purposes. - // When starting the innermost dimension, we start by checking once - // whether any of the outer dimensions has set the mask to be - // partially-active or not. We follow different code paths for these - // two cases, taking advantage of the knowledge that the mask is all - // on, when this is the case. - // - // In each of these code paths, we start with a loop from the starting - // value to the aligned end value for the innermost dimension; we can - // guarantee that the innermost loop will have an "all on" mask (as far - // as its dimension is concerned) for the duration of this loop. Doing - // so allows us to emit code that assumes the mask is all on (for the - // case where none of the outer dimensions has set the mask to be - // partially on), or allows us to emit code that just uses the mask - // from the outer dimensions directly (for the case where they have). - // - // After this loop, we just need to deal with one vector's worth of - // "ragged extra bits", where the mask used includes the effect of the - // mask for the innermost dimension. - // - // We start out this process by emitting the check that determines - // whether any of the enclosing dimensions is partially active - // (i.e. processing extra elements that don't exactly fit into a - // vector). - llvm::BasicBlock *bbOuterInExtras = - ctx->CreateBasicBlock("outer_in_extras"); - llvm::BasicBlock *bbOuterNotInExtras = - ctx->CreateBasicBlock("outer_not_in_extras"); - - ctx->SetCurrentBasicBlock(bbTest[nDims-1]); - if (inExtras.size()) - ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, - inExtras.back()); - else - // for a 1D iteration domain, we certainly don't have any enclosing - // dimensions that are processing extra elements. - ctx->BranchInst(bbOuterNotInExtras); - - /////////////////////////////////////////////////////////////////////////// - // One or more outer dimensions in extras, so we need to mask for the loop - // body regardless. 
We break this into two cases, roughly: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask is all on for inner, so set mask to outer mask - // // run loop body with mask - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to outermask & (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbAllInnerPartialOuter = - ctx->CreateBasicBlock("all_inner_partial_outer"); - llvm::BasicBlock *bbPartial = - ctx->CreateBasicBlock("both_partial"); - ctx->SetCurrentBasicBlock(bbOuterInExtras); { - // Update the varying counter value here, since all subsequent - // blocks along this path need it. - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - - // here we just check to see if counter < alignedEnd - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); - } - - // Below we have a basic block that runs the loop body code for the - // case where the mask is partially but not fully on. This same block - // runs in multiple cases: both for handling any ragged extra data for - // the innermost dimension but also when outer dimensions have set the - // mask to be partially on. - // - // The value stored in stepIndexAfterMaskedBodyPtr is used after each - // execution of the body code to determine whether the innermost index - // value should be incremented by the step (we're running the "for" - // loop of full vectors at the innermost dimension, with outer - // dimensions having set the mask to be partially on), or whether we're - // running once for the ragged extra bits at the end of the innermost - // dimension, in which case we're done with the innermost dimension and - // should step the loop counter for the next enclosing dimension - // instead. - llvm::Value *stepIndexAfterMaskedBodyPtr = - ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); - - /////////////////////////////////////////////////////////////////////////// - // We're in the inner loop part where the only masking is due to outer - // dimensions but the innermost dimension fits fully into a vector's - // width. Set the mask and jump to the masked loop body. 
- ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { - llvm::Value *mask; - if (nDims == 1) - // 1D loop; we shouldn't ever get here anyway - mask = LLVMMaskAllOff; - else - mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - - ctx->SetInternalMask(mask); - - ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // We need to include the effect of the innermost dimension in the mask - // for the final bits here - ctx->SetCurrentBasicBlock(bbPartial); { - llvm::Value *varyingCounter = - ctx->LoadInst(dimVariables[nDims-1]->storagePtr); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - - if (nDims == 1) { - ctx->SetInternalMask(emask); - } - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->SetInternalMask(newMask); - } - - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // None of the outer dimensions is processing extras; along the lines - // of above, we can express this as: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask is all on - // // run loop body with mask all on - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbPartialInnerAllOuter = - ctx->CreateBasicBlock("partial_inner_all_outer"); - ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, - beforeAlignedEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // full_body: do a full vector's worth of work. We know that all - // lanes will be running here, so we explicitly set the mask to be 'all - // on'. This ends up being relatively straightforward: just update the - // value of the varying loop counter and have the statements in the - // loop body emit their code. 
- llvm::BasicBlock *bbFullBodyContinue = - ctx->CreateBasicBlock("foreach_full_continue"); - ctx->SetCurrentBasicBlock(bbFullBody); { - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetBlockEntryMask(LLVMMaskAllOn); - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - ctx->SetContinueTarget(bbFullBodyContinue); - ctx->AddInstrumentationPoint("foreach loop body (all on)"); - stmts->EmitCode(ctx); - AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); - ctx->BranchInst(bbFullBodyContinue); - } - ctx->SetCurrentBasicBlock(bbFullBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterNotInExtras); - } - - /////////////////////////////////////////////////////////////////////////// - // We're done running blocks with the mask all on; see if the counter is - // less than the end value, in which case we need to run the body one - // more time to get the extra bits. - llvm::BasicBlock *bbSetInnerMask = - ctx->CreateBasicBlock("partial_inner_only"); - ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeFullEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[nDims-1], "before_full_end"); - ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // The outer dimensions are all on, so the mask is just given by the - // mask for the innermost dimension - ctx->SetCurrentBasicBlock(bbSetInnerMask); { - llvm::Value *varyingCounter = - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - ctx->SetInternalMask(emask); - ctx->SetBlockEntryMask(emask); - - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // masked_body: set the mask and have the statements emit their - // code again. Note that it's generally worthwhile having two copies - // of the statements' code, since the code above is emitted with the - // mask known to be all-on, which in turn leads to more efficient code - // for that case. 
- llvm::BasicBlock *bbStepInnerIndex = - ctx->CreateBasicBlock("step_inner_index"); - llvm::BasicBlock *bbMaskedBodyContinue = - ctx->CreateBasicBlock("foreach_masked_continue"); - ctx->SetCurrentBasicBlock(bbMaskedBody); { - ctx->AddInstrumentationPoint("foreach loop body (masked)"); - ctx->SetContinueTarget(bbMaskedBodyContinue); - ctx->DisableGatherScatterWarnings(); - ctx->SetBlockEntryMask(ctx->GetFullMask()); - stmts->EmitCode(ctx); - ctx->EnableGatherScatterWarnings(); - ctx->BranchInst(bbMaskedBodyContinue); - } - ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); - } - - /////////////////////////////////////////////////////////////////////////// - // step the innermost index, for the case where we're doing the - // innermost for loop over full vectors. - ctx->SetCurrentBasicBlock(bbStepInnerIndex); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterInExtras); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_exit: All done. Restore the old mask and clean up - ctx->SetCurrentBasicBlock(bbExit); - - ctx->SetInternalMask(oldMask); - ctx->SetFunctionMask(oldFunctionMask); - - ctx->EndForeach(); - ctx->EndScope(); + // Each dimension also maintains a mask that represents which of + // the varying elements in the current iteration should be + // processed. (i.e. this is used to disable the lanes that have + // out-of-bounds offsets.) + extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); } - else /* isPTX() == true */ - { - llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body"); - llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body"); - llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit"); - llvm::Value *oldMask = ctx->GetInternalMask(); - llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); + ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); - ctx->SetDebugPos(pos); - ctx->StartScope(); + // On to the outermost loop's test + ctx->BranchInst(bbTest[0]); + /////////////////////////////////////////////////////////////////////////// + // foreach_reset: this code runs when we need to reset the counter for + // a given dimension in preparation for running through its loop again, + // after the enclosing level advances its counter. + for (int i = 0; i < nDims; ++i) { + ctx->SetCurrentBasicBlock(bbReset[i]); + if (i == 0) + ctx->BranchInst(bbExit); + else { + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + ctx->BranchInst(bbStep[i-1]); + } + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_step: increment the uniform counter by the vector width. + // Note that we don't increment the varying counter here as well but + // just generate its value when we need it in the loop body. Don't do + // this for the innermost dimension, which has a more complex stepping + // structure.. 
+ for (int i = 0; i < nDims-1; ++i) { + ctx->SetCurrentBasicBlock(bbStep[i]); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[i]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[i]); + ctx->BranchInst(bbTest[i]); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_test (for all dimensions other than the innermost...) + std::vector inExtras; + for (int i = 0; i < nDims-1; ++i) { + ctx->SetCurrentBasicBlock(bbTest[i]); + + llvm::Value *haveExtras = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, + endVals[i], alignedEnd[i], "have_extras"); + + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); + llvm::Value *atAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + counter, alignedEnd[i], "at_aligned_end"); + llvm::Value *inEx = + ctx->BinaryOperator(llvm::Instruction::And, haveExtras, + atAlignedEnd, "in_extras"); + + if (i == 0) + inExtras.push_back(inEx); + else + inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, + inExtras[i-1], "in_extras_all")); + + llvm::Value *varyingCounter = + lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], + dimVariables[i]->storagePtr, span); + + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[i], LLVMTypes::Int32VectorType, "smear_end"); + + // Do a vector compare of its value to the end value to generate a + // mask for this last bit of work. + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + + if (i == 0) + ctx->StoreInst(emask, extrasMaskPtrs[i]); + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->StoreInst(newMask, extrasMaskPtrs[i]); + } + + llvm::Value *notAtEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[i]); + ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_test (for innermost dimension) + // + // All of the outer dimensions are handled generically--basically as a + // for() loop from the start value to the end value, where at each loop + // test, we compute the mask of active elements for the current + // dimension and then update an overall mask that is the AND + // combination of all of the outer ones. + // + // The innermost loop is handled specially, for performance purposes. + // When starting the innermost dimension, we start by checking once + // whether any of the outer dimensions has set the mask to be + // partially-active or not. We follow different code paths for these + // two cases, taking advantage of the knowledge that the mask is all + // on, when this is the case. + // + // In each of these code paths, we start with a loop from the starting + // value to the aligned end value for the innermost dimension; we can + // guarantee that the innermost loop will have an "all on" mask (as far + // as its dimension is concerned) for the duration of this loop. 
Doing + // so allows us to emit code that assumes the mask is all on (for the + // case where none of the outer dimensions has set the mask to be + // partially on), or allows us to emit code that just uses the mask + // from the outer dimensions directly (for the case where they have). + // + // After this loop, we just need to deal with one vector's worth of + // "ragged extra bits", where the mask used includes the effect of the + // mask for the innermost dimension. + // + // We start out this process by emitting the check that determines + // whether any of the enclosing dimensions is partially active + // (i.e. processing extra elements that don't exactly fit into a + // vector). + llvm::BasicBlock *bbOuterInExtras = + ctx->CreateBasicBlock("outer_in_extras"); + llvm::BasicBlock *bbOuterNotInExtras = + ctx->CreateBasicBlock("outer_not_in_extras"); + + ctx->SetCurrentBasicBlock(bbTest[nDims-1]); + if (inExtras.size()) + ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, + inExtras.back()); + else + // for a 1D iteration domain, we certainly don't have any enclosing + // dimensions that are processing extra elements. + ctx->BranchInst(bbOuterNotInExtras); + + /////////////////////////////////////////////////////////////////////////// + // One or more outer dimensions in extras, so we need to mask for the loop + // body regardless. We break this into two cases, roughly: + // for (counter = start; counter < alignedEnd; counter += step) { + // // mask is all on for inner, so set mask to outer mask + // // run loop body with mask + // } + // // counter == alignedEnd + // if (counter < end) { + // // set mask to outermask & (counter+programCounter < end) + // // run loop body with mask + // } + llvm::BasicBlock *bbAllInnerPartialOuter = + ctx->CreateBasicBlock("all_inner_partial_outer"); + llvm::BasicBlock *bbPartial = + ctx->CreateBasicBlock("both_partial"); + ctx->SetCurrentBasicBlock(bbOuterInExtras); { + // Update the varying counter value here, since all subsequent + // blocks along this path need it. + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + + // here we just check to see if counter < alignedEnd + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); + } + + // Below we have a basic block that runs the loop body code for the + // case where the mask is partially but not fully on. This same block + // runs in multiple cases: both for handling any ragged extra data for + // the innermost dimension but also when outer dimensions have set the + // mask to be partially on. + // + // The value stored in stepIndexAfterMaskedBodyPtr is used after each + // execution of the body code to determine whether the innermost index + // value should be incremented by the step (we're running the "for" + // loop of full vectors at the innermost dimension, with outer + // dimensions having set the mask to be partially on), or whether we're + // running once for the ragged extra bits at the end of the innermost + // dimension, in which case we're done with the innermost dimension and + // should step the loop counter for the next enclosing dimension + // instead. 
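A rough scalar model of the flow just described may help; the values are invented and the boolean stands in for what is stored through stepIndexAfterMaskedBodyPtr, with the enclosing dimensions' mask handling elided. This is only a sketch of the control flow the blocks below emit, not the emitted IR itself.

    #include <cstdio>

    int main() {
        const int width = 8, start = 0, end = 19;
        const int alignedEnd = end - (end - start) % width;    // 16 with these numbers

        int counter = start;
        while (true) {
            bool stepIndexAfterMaskedBody;
            if (counter < alignedEnd) {
                // all_inner_partial_outer: the innermost dimension is full, so the
                // body runs under the enclosing dimensions' mask only.
                stepIndexAfterMaskedBody = true;
            } else {
                // both_partial: the ragged tail; (counter + lane < end) is folded
                // into the mask as well.
                stepIndexAfterMaskedBody = false;
            }
            std::printf("masked body at counter=%d\n", counter);  // bbMaskedBody
            if (stepIndexAfterMaskedBody)
                counter += width;   // step_inner_index, then re-test
            else
                break;              // done with this dimension; reset it and step the enclosing one
        }
        return 0;
    }

With these numbers the body runs at counters 0 and 8 under the outer mask alone, and once more at 16 for the three leftover elements.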
+ llvm::Value *stepIndexAfterMaskedBodyPtr = + ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); + + /////////////////////////////////////////////////////////////////////////// + // We're in the inner loop part where the only masking is due to outer + // dimensions but the innermost dimension fits fully into a vector's + // width. Set the mask and jump to the masked loop body. + ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { + llvm::Value *mask; + if (nDims == 1) + // 1D loop; we shouldn't ever get here anyway + mask = LLVMMaskAllOff; + else + mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + + ctx->SetInternalMask(mask); + + ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); + } + + /////////////////////////////////////////////////////////////////////////// + // We need to include the effect of the innermost dimension in the mask + // for the final bits here + ctx->SetCurrentBasicBlock(bbPartial); { + llvm::Value *varyingCounter = + ctx->LoadInst(dimVariables[nDims-1]->storagePtr); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + + if (nDims == 1) { + ctx->SetInternalMask(emask); + } + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->SetInternalMask(newMask); + } + + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); + } + + /////////////////////////////////////////////////////////////////////////// + // None of the outer dimensions is processing extras; along the lines + // of above, we can express this as: + // for (counter = start; counter < alignedEnd; counter += step) { + // // mask is all on + // // run loop body with mask all on + // } + // // counter == alignedEnd + // if (counter < end) { + // // set mask to (counter+programCounter < end) + // // run loop body with mask + // } + llvm::BasicBlock *bbPartialInnerAllOuter = + ctx->CreateBasicBlock("partial_inner_all_outer"); + ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, + beforeAlignedEnd); + } + + /////////////////////////////////////////////////////////////////////////// + // full_body: do a full vector's worth of work. We know that all + // lanes will be running here, so we explicitly set the mask to be 'all + // on'. This ends up being relatively straightforward: just update the + // value of the varying loop counter and have the statements in the + // loop body emit their code. 
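On "update the value of the varying loop counter": lUpdateVaryingCounter itself is not in this hunk, but the surrounding code implies that, in the untiled case, the varying counter is the uniform counter plus the per-lane program index, and comparing it against the smeared end value yields the partial mask. A sketch with invented values:

    #include <cstdio>

    int main() {
        const int width = 8;
        const int counter = 16, end = 19;              // invented values for the sketch

        int varyingCounter[width];
        bool mask[width];
        for (int lane = 0; lane < width; ++lane) {
            varyingCounter[lane] = counter + lane;     // uniform counter + programIndex
            mask[lane] = varyingCounter[lane] < end;   // compare against the smeared end
        }
        for (int lane = 0; lane < width; ++lane)
            std::printf("lane %d: index %d, %s\n", lane, varyingCounter[lane],
                        mask[lane] ? "on" : "off");    // lanes 0..2 on, 3..7 off
        return 0;
    }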
+ llvm::BasicBlock *bbFullBodyContinue = + ctx->CreateBasicBlock("foreach_full_continue"); + ctx->SetCurrentBasicBlock(bbFullBody); { ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetFunctionMask(LLVMMaskAllOn); - - // This should be caught during typechecking - AssertPos(pos, startExprs.size() == dimVariables.size() && - endExprs.size() == dimVariables.size()); - int nDims = (int)dimVariables.size(); - - /////////////////////////////////////////////////////////////////////// - // Setup: compute the number of items we have to work on in each - // dimension and a number of derived values. - std::vector bbReset, bbStep, bbTest; - std::vector startVals, endVals, uniformCounterPtrs; - std::vector nExtras, alignedEnd, extrasMaskPtrs; - - std::vector span(nDims, 0); - const int vectorWidth = 32; - lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); -#if 0 - for (int i = 0; i < nDims; i++) - { - fprintf(stderr, " i= %d [ %d ] : %d \n", - i, nDims, span[i]); - } - fprintf(stderr, " --- \n"); -#endif - - for (int i = 0; i < nDims; ++i) { - // Basic blocks that we'll fill in later with the looping logic for - // this dimension. - bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); - if (i < nDims-1) - // stepping for the innermost dimension is handled specially - bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); - bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); - - // Start and end value for this loop dimension - llvm::Value *sv = startExprs[i]->GetValue(ctx); - llvm::Value *ev = endExprs[i]->GetValue(ctx); - if (sv == NULL || ev == NULL) - return; - startVals.push_back(sv); - endVals.push_back(ev); - - // nItems = endVal - startVal - llvm::Value *nItems = - ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); - - // nExtras = nItems % (span for this dimension) - // This gives us the number of extra elements we need to deal with - // at the end of the loop for this dimension that don't fit cleanly - // into a vector width. - nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, - LLVMInt32(span[i]), "nextras")); - - // alignedEnd = endVal - nExtras - alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, - nExtras[i], "aligned_end")); - - /////////////////////////////////////////////////////////////////////// - // Each dimension has a loop counter that is a uniform value that - // goes from startVal to endVal, in steps of the span for this - // dimension. Its value is only used internally here for looping - // logic and isn't directly available in the user's program code. - uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, - "counter")); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - - // There is also a varying variable that holds the set of index - // values for each dimension in the current loop iteration; this is - // the value that is program-visible. - dimVariables[i]->storagePtr = - ctx->AllocaInst(LLVMTypes::Int32VectorType, - dimVariables[i]->name.c_str()); - dimVariables[i]->parentFunction = ctx->GetFunction(); - ctx->EmitVariableDebugInfo(dimVariables[i]); - - // Each dimension also maintains a mask that represents which of - // the varying elements in the current iteration should be - // processed. (i.e. this is used to disable the lanes that have - // out-of-bounds offsets.) 
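Worked numbers for the nItems/nExtras/alignedEnd computation above (values invented for the example): they show how a dimension splits into full-width chunks plus one ragged tail.

    #include <cassert>

    int main() {
        // Iterate one dimension from 3 to 22 with a span of 8.
        const int startVal = 3, endVal = 22, span = 8;

        const int nItems = endVal - startVal;        // 19 elements to process
        const int nExtras = nItems % span;           // 3 don't fill a whole vector
        const int alignedEnd = endVal - nExtras;     // 19: full vectors cover [3, 19)

        assert(nItems == 19 && nExtras == 3 && alignedEnd == 19);
        // Full-width chunks start at counters 3 and 11; the ragged tail starts at 19
        // and runs with a partial mask (3 of 8 lanes on).
        return 0;
    }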
- extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - } - - ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); - - // On to the outermost loop's test - ctx->BranchInst(bbTest[0]); - - /////////////////////////////////////////////////////////////////////////// - // foreach_reset: this code runs when we need to reset the counter for - // a given dimension in preparation for running through its loop again, - // after the enclosing level advances its counter. - for (int i = 0; i < nDims; ++i) { - ctx->SetCurrentBasicBlock(bbReset[i]); - if (i == 0) - ctx->BranchInst(bbExit); - else { - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - ctx->BranchInst(bbStep[i-1]); - } - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_step: increment the uniform counter by the vector width. - // Note that we don't increment the varying counter here as well but - // just generate its value when we need it in the loop body. Don't do - // this for the innermost dimension, which has a more complex stepping - // structure.. - for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbStep[i]); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[i]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[i]); - ctx->BranchInst(bbTest[i]); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for all dimensions other than the innermost...) - std::vector inExtras; - for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbTest[i]); - - llvm::Value *haveExtras = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, - endVals[i], alignedEnd[i], "have_extras"); - - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); - llvm::Value *atAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - counter, alignedEnd[i], "at_aligned_end"); - llvm::Value *inEx = - ctx->BinaryOperator(llvm::Instruction::And, haveExtras, - atAlignedEnd, "in_extras"); - - if (i == 0) - inExtras.push_back(inEx); - else - inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, - inExtras[i-1], "in_extras_all")); - - llvm::Value *varyingCounter = - lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], - dimVariables[i]->storagePtr, span); - - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[i], LLVMTypes::Int32VectorType, "smear_end"); - - // Do a vector compare of its value to the end value to generate a - // mask for this last bit of work. 
- llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - - if (i == 0) - ctx->StoreInst(emask, extrasMaskPtrs[i]); - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->StoreInst(newMask, extrasMaskPtrs[i]); - } - - llvm::Value *notAtEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[i]); - ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for innermost dimension) - // - // All of the outer dimensions are handled generically--basically as a - // for() loop from the start value to the end value, where at each loop - // test, we compute the mask of active elements for the current - // dimension and then update an overall mask that is the AND - // combination of all of the outer ones. - // - // The innermost loop is handled specially, for performance purposes. - // When starting the innermost dimension, we start by checking once - // whether any of the outer dimensions has set the mask to be - // partially-active or not. We follow different code paths for these - // two cases, taking advantage of the knowledge that the mask is all - // on, when this is the case. - // - // In each of these code paths, we start with a loop from the starting - // value to the aligned end value for the innermost dimension; we can - // guarantee that the innermost loop will have an "all on" mask (as far - // as its dimension is concerned) for the duration of this loop. Doing - // so allows us to emit code that assumes the mask is all on (for the - // case where none of the outer dimensions has set the mask to be - // partially on), or allows us to emit code that just uses the mask - // from the outer dimensions directly (for the case where they have). - // - // After this loop, we just need to deal with one vector's worth of - // "ragged extra bits", where the mask used includes the effect of the - // mask for the innermost dimension. - // - // We start out this process by emitting the check that determines - // whether any of the enclosing dimensions is partially active - // (i.e. processing extra elements that don't exactly fit into a - // vector). - llvm::BasicBlock *bbOuterInExtras = - ctx->CreateBasicBlock("outer_in_extras"); - llvm::BasicBlock *bbOuterNotInExtras = - ctx->CreateBasicBlock("outer_not_in_extras"); - - ctx->SetCurrentBasicBlock(bbTest[nDims-1]); - if (inExtras.size()) - ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, - inExtras.back()); - else - // for a 1D iteration domain, we certainly don't have any enclosing - // dimensions that are processing extra elements. - ctx->BranchInst(bbOuterNotInExtras); - - /////////////////////////////////////////////////////////////////////////// - // One or more outer dimensions in extras, so we need to mask for the loop - // body regardless. 
We break this into two cases, roughly: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask is all on for inner, so set mask to outer mask - // // run loop body with mask - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to outermask & (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbAllInnerPartialOuter = - ctx->CreateBasicBlock("all_inner_partial_outer"); - llvm::BasicBlock *bbPartial = - ctx->CreateBasicBlock("both_partial"); - ctx->SetCurrentBasicBlock(bbOuterInExtras); { - // Update the varying counter value here, since all subsequent - // blocks along this path need it. - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - - // here we just check to see if counter < alignedEnd - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); - } - - // Below we have a basic block that runs the loop body code for the - // case where the mask is partially but not fully on. This same block - // runs in multiple cases: both for handling any ragged extra data for - // the innermost dimension but also when outer dimensions have set the - // mask to be partially on. - // - // The value stored in stepIndexAfterMaskedBodyPtr is used after each - // execution of the body code to determine whether the innermost index - // value should be incremented by the step (we're running the "for" - // loop of full vectors at the innermost dimension, with outer - // dimensions having set the mask to be partially on), or whether we're - // running once for the ragged extra bits at the end of the innermost - // dimension, in which case we're done with the innermost dimension and - // should step the loop counter for the next enclosing dimension - // instead. - llvm::Value *stepIndexAfterMaskedBodyPtr = - ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); - - /////////////////////////////////////////////////////////////////////////// - // We're in the inner loop part where the only masking is due to outer - // dimensions but the innermost dimension fits fully into a vector's - // width. Set the mask and jump to the masked loop body. 
- ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { - llvm::Value *mask; - if (nDims == 1) - // 1D loop; we shouldn't ever get here anyway - mask = LLVMMaskAllOff; - else - mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - - ctx->SetInternalMask(mask); - - ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // We need to include the effect of the innermost dimension in the mask - // for the final bits here - ctx->SetCurrentBasicBlock(bbPartial); { - llvm::Value *varyingCounter = - ctx->LoadInst(dimVariables[nDims-1]->storagePtr); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - - if (nDims == 1) { - ctx->SetInternalMask(emask); - } - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->SetInternalMask(newMask); - } - - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // None of the outer dimensions is processing extras; along the lines - // of above, we can express this as: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask is all on - // // run loop body with mask all on - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbPartialInnerAllOuter = - ctx->CreateBasicBlock("partial_inner_all_outer"); - ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, - beforeAlignedEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // full_body: do a full vector's worth of work. We know that all - // lanes will be running here, so we explicitly set the mask to be 'all - // on'. This ends up being relatively straightforward: just update the - // value of the varying loop counter and have the statements in the - // loop body emit their code. 
- llvm::BasicBlock *bbFullBodyContinue = - ctx->CreateBasicBlock("foreach_full_continue"); - ctx->SetCurrentBasicBlock(bbFullBody); { - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetBlockEntryMask(LLVMMaskAllOn); - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - ctx->SetContinueTarget(bbFullBodyContinue); - ctx->AddInstrumentationPoint("foreach loop body (all on)"); - stmts->EmitCode(ctx); - AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); - ctx->BranchInst(bbFullBodyContinue); - } - ctx->SetCurrentBasicBlock(bbFullBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterNotInExtras); - } - - /////////////////////////////////////////////////////////////////////////// - // We're done running blocks with the mask all on; see if the counter is - // less than the end value, in which case we need to run the body one - // more time to get the extra bits. - llvm::BasicBlock *bbSetInnerMask = - ctx->CreateBasicBlock("partial_inner_only"); - ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeFullEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[nDims-1], "before_full_end"); - ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // The outer dimensions are all on, so the mask is just given by the - // mask for the innermost dimension - ctx->SetCurrentBasicBlock(bbSetInnerMask); { - llvm::Value *varyingCounter = - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - ctx->SetInternalMask(emask); - ctx->SetBlockEntryMask(emask); - - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // masked_body: set the mask and have the statements emit their - // code again. Note that it's generally worthwhile having two copies - // of the statements' code, since the code above is emitted with the - // mask known to be all-on, which in turn leads to more efficient code - // for that case. 
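A sketch of why the two-copy approach described above pays off: when the mask is known to be all on, a lane-wise assignment is just a plain vector store, whereas the masked copy of the body must blend every lane with whatever was there before. This illustrates the cost difference only; it is not how ispc or the LLVM backends actually lower masked stores.

    #include <array>

    constexpr int W = 8;
    using Vec  = std::array<int, W>;
    using Mask = std::array<bool, W>;

    // Body specialized for the all-on mask: nothing to predicate.
    void body_all_on(Vec &dst, const Vec &src) {
        dst = src;                                  // a single unmasked store
    }

    // Generic masked body: each lane is blended with the previous contents.
    void body_masked(Vec &dst, const Vec &src, const Mask &mask) {
        for (int i = 0; i < W; ++i)
            dst[i] = mask[i] ? src[i] : dst[i];     // per-lane select / masked store
    }

    int main() {
        Vec a{}, b{1, 2, 3, 4, 5, 6, 7, 8};
        Mask m{true, true, true, false, false, false, false, false};
        body_all_on(a, b);
        body_masked(a, b, m);
        return 0;
    }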
- llvm::BasicBlock *bbStepInnerIndex = - ctx->CreateBasicBlock("step_inner_index"); - llvm::BasicBlock *bbMaskedBodyContinue = - ctx->CreateBasicBlock("foreach_masked_continue"); - ctx->SetCurrentBasicBlock(bbMaskedBody); { - ctx->AddInstrumentationPoint("foreach loop body (masked)"); - ctx->SetContinueTarget(bbMaskedBodyContinue); - ctx->DisableGatherScatterWarnings(); - ctx->SetBlockEntryMask(ctx->GetFullMask()); - stmts->EmitCode(ctx); - ctx->EnableGatherScatterWarnings(); - ctx->BranchInst(bbMaskedBodyContinue); - } - ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); - } - - /////////////////////////////////////////////////////////////////////////// - // step the innermost index, for the case where we're doing the - // innermost for loop over full vectors. - ctx->SetCurrentBasicBlock(bbStepInnerIndex); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterInExtras); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_exit: All done. Restore the old mask and clean up - ctx->SetCurrentBasicBlock(bbExit); - - ctx->SetInternalMask(oldMask); - ctx->SetFunctionMask(oldFunctionMask); - - ctx->EndForeach(); - ctx->EndScope(); + ctx->SetBlockEntryMask(LLVMMaskAllOn); + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + ctx->SetContinueTarget(bbFullBodyContinue); + ctx->AddInstrumentationPoint("foreach loop body (all on)"); + stmts->EmitCode(ctx); + AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); + ctx->BranchInst(bbFullBodyContinue); } + ctx->SetCurrentBasicBlock(bbFullBodyContinue); { + ctx->RestoreContinuedLanes(); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterNotInExtras); + } + + /////////////////////////////////////////////////////////////////////////// + // We're done running blocks with the mask all on; see if the counter is + // less than the end value, in which case we need to run the body one + // more time to get the extra bits. 
+ llvm::BasicBlock *bbSetInnerMask = + ctx->CreateBasicBlock("partial_inner_only"); + ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeFullEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[nDims-1], "before_full_end"); + ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); + } + + /////////////////////////////////////////////////////////////////////////// + // The outer dimensions are all on, so the mask is just given by the + // mask for the innermost dimension + ctx->SetCurrentBasicBlock(bbSetInnerMask); { + llvm::Value *varyingCounter = + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + ctx->SetInternalMask(emask); + ctx->SetBlockEntryMask(emask); + + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); + } + + /////////////////////////////////////////////////////////////////////////// + // masked_body: set the mask and have the statements emit their + // code again. Note that it's generally worthwhile having two copies + // of the statements' code, since the code above is emitted with the + // mask known to be all-on, which in turn leads to more efficient code + // for that case. + llvm::BasicBlock *bbStepInnerIndex = + ctx->CreateBasicBlock("step_inner_index"); + llvm::BasicBlock *bbMaskedBodyContinue = + ctx->CreateBasicBlock("foreach_masked_continue"); + ctx->SetCurrentBasicBlock(bbMaskedBody); { + ctx->AddInstrumentationPoint("foreach loop body (masked)"); + ctx->SetContinueTarget(bbMaskedBodyContinue); + ctx->DisableGatherScatterWarnings(); + ctx->SetBlockEntryMask(ctx->GetFullMask()); + stmts->EmitCode(ctx); + ctx->EnableGatherScatterWarnings(); + ctx->BranchInst(bbMaskedBodyContinue); + } + ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { + ctx->RestoreContinuedLanes(); + llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); + } + + /////////////////////////////////////////////////////////////////////////// + // step the innermost index, for the case where we're doing the + // innermost for loop over full vectors. + ctx->SetCurrentBasicBlock(bbStepInnerIndex); { + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterInExtras); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_exit: All done. 
Restore the old mask and clean up + ctx->SetCurrentBasicBlock(bbExit); + + ctx->SetInternalMask(oldMask); + ctx->SetFunctionMask(oldFunctionMask); + + ctx->EndForeach(); + ctx->EndScope(); } diff --git a/type.cpp b/type.cpp index 107a70de..6be852ad 100644 --- a/type.cpp +++ b/type.cpp @@ -2925,7 +2925,7 @@ FunctionType::GetReturnTypeString() const { llvm::FunctionType * FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { - if (isTask == true) // && !g->target->isPTX()) //getISA() != Target::NVPTX64) + if (isTask == true) Assert(removeMask == false); // Get the LLVM Type *s for the function arguments @@ -2950,44 +2950,30 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { llvmArgTypes.push_back(LLVMTypes::MaskType); std::vector callTypes; - if (isTask) { + if (isTask && g->target->getISA() != Target::NVPTX) { // Tasks take three arguments: a pointer to a struct that holds the // actual task arguments, the thread index, and the total number of // threads the tasks system has running. (Task arguments are // marshalled in a struct so that it's easy to allocate space to // hold them until the task actually runs.) -// if (g->target->getISA() != Target::NVPTX64) - if (!g->target->isPTX()) - { - llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes); - callTypes.push_back(llvm::PointerType::getUnqual(st)); - callTypes.push_back(LLVMTypes::Int32Type); // threadIndex - callTypes.push_back(LLVMTypes::Int32Type); // threadCount - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex - callTypes.push_back(LLVMTypes::Int32Type); // taskCount - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex0 - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex1 - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex2 - callTypes.push_back(LLVMTypes::Int32Type); // taskCount0 - callTypes.push_back(LLVMTypes::Int32Type); // taskCount1 - callTypes.push_back(LLVMTypes::Int32Type); // taskCount2 - } - else - { - if (g->target->getISA() == Target::NVPTX64) - callTypes = llvmArgTypes; - else - { - assert(0); /* evghenii: must be removed in final, just for test for nvptx64 target */ - llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes); - callTypes.push_back(llvm::PointerType::getUnqual(st)); - } - } + llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes); + callTypes.push_back(llvm::PointerType::getUnqual(st)); + callTypes.push_back(LLVMTypes::Int32Type); // threadIndex + callTypes.push_back(LLVMTypes::Int32Type); // threadCount + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex + callTypes.push_back(LLVMTypes::Int32Type); // taskCount + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex0 + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex1 + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex2 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount0 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount1 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount2 } else // Otherwise we already have the types of the arguments callTypes = llvmArgTypes; + if (returnType == NULL) { Assert(m->errorCount > 0); return NULL;