MAJOR CHANGE --- STOP WITH THIS BRANCH

Evghenii
2014-01-06 13:51:02 +01:00
parent 77113fbffd
commit 546f9cb409
13 changed files with 710 additions and 1151 deletions


@@ -144,7 +144,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=nvptx64 avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
TARGETS=nvptx avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
ifneq ($(ARM_ENABLED), 0)
@@ -254,15 +254,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
@echo Compiling $<
$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $<
m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(32 bit version\)
m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(64 bit version\)
m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@
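For context, each objs/builtins-*.cpp rule above pipes the m4-expanded .ll file through bitcode2cpp.py, which wraps the compiled LLVM bitcode in a C++ byte array that gets linked into the compiler. The generated file is roughly of the following shape (a sketch; the symbol names are inferred from the EXPORT_MODULE(builtins_bitcode_nvptx_64bit) call in builtins.cpp below, and 0x42 0x43 0xC0 0xDE is the LLVM bitcode magic):

    // Approximate shape of a generated objs/builtins-nvptx-64bit.cpp (sketch).
    extern const unsigned char builtins_bitcode_nvptx_64bit[];
    extern int builtins_bitcode_nvptx_64bit_length;

    const unsigned char builtins_bitcode_nvptx_64bit[] = {
        0x42, 0x43, 0xc0, 0xde, /* ... remaining LLVM bitcode bytes ... */
    };
    int builtins_bitcode_nvptx_64bit_length = sizeof(builtins_bitcode_nvptx_64bit);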


@@ -693,9 +693,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
if (g->target->getISA() != Target::NEON32 &&
g->target->getISA() != Target::NEON16 &&
g->target->getISA() != Target::NEON8 &&
g->target->getISA() != Target::NVPTX64)
g->target->getISA() != Target::NVPTX)
#else
if (g->target->getISA() != Target::NVPTX64)
if (g->target->getISA() != Target::NVPTX)
#endif // !__arm__
{
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
@@ -858,14 +858,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// Next, add the target's custom implementations of the various needed
// builtin functions (e.g. __masked_store_32(), etc).
switch (g->target->getISA()) {
case Target::NVPTX64:
case Target::NVPTX:
{
if (runtime32) {
fprintf(stderr, "W're sorry, but only 64bit targets are supported at this moment .. \n");
fprintf(stderr, "Unforetunatly 32bit targets are supported at the moment .. \n");
assert(0);
}
else {
EXPORT_MODULE(builtins_bitcode_nvptx64_64bit);
EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
}
break;
};
@@ -1138,7 +1138,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
// define the 'programCount' builtin variable
if (!g->target->isPTX())
if (g->target->getISA() != Target::NVPTX)
{
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
}
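lDefineConstantInt itself is not part of this diff; under the assumption that it creates an internal constant global plus a symbol-table entry, a minimal sketch using ispc's LLVMTypes/LLVMInt32 helpers:

    // Hedged sketch of what defining the 'programCount' builtin plausibly
    // amounts to; the real lDefineConstantInt lives elsewhere in builtins.cpp.
    static void lDefineConstantIntSketch(const char *name, int val, llvm::Module *module,
                                         SymbolTable *symbolTable) {
        llvm::GlobalVariable *gv =
            new llvm::GlobalVariable(*module, LLVMTypes::Int32Type, /* isConstant */ true,
                                     llvm::GlobalValue::InternalLinkage,
                                     LLVMInt32(val), name);
        Symbol *sym = new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType());
        sym->storagePtr = gv;            // loads of the symbol read the constant
        symbolTable->AddVariable(sym);
    }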


@@ -105,15 +105,9 @@ define i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; tasking
define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline
{
%ptr = inttoptr i64 1 to i8*
ret i8* %ptr
}
;; this call allocates the parameter buffer for a kernel launch
declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) nounwind alwaysinline
define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline
{
entry:
%call = tail call i32 @__tid_x()
@@ -121,6 +115,7 @@ entry:
%sub = add nsw i32 %call1, -1
%and = and i32 %sub, %call
%cmp = icmp eq i32 %and, 0
%align = zext i32 %align32 to i64
br i1 %cmp, label %if.then, label %if.end
if.then:
@@ -224,7 +219,7 @@ define void @ISPCSync(i8*) nounwind alwaysinline
include(`util_ptx.m4')
include(`util-nvptx.m4')
stdlib_core()
packed_load_and_store()
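For context, cudaGetParameterBuffer declared above belongs to the CUDA dynamic-parallelism device runtime, which suggests ISPCAlloc/ISPCLaunch resolve against a device-side runtime along the following lines (a CUDA C++ sketch, not part of this commit; the cuda* signatures are the CUDA 5.x device-runtime ones, while the warp/block shape and everything else are assumptions; compile with nvcc -arch=sm_35 -rdc=true):

    // Hedged sketch of a device-side tasking runtime matching the generated calls.
    extern "C" __device__ void *cudaGetParameterBuffer(size_t align, size_t size);
    extern "C" __device__ cudaError_t cudaLaunchDevice(void *func, void *paramBuf,
                                                       dim3 gridDim, dim3 blockDim,
                                                       unsigned int sharedMem,
                                                       cudaStream_t stream);

    extern "C" __device__ void *ISPCAlloc(void **handle, long long size, int align) {
        // Mirror the IR above: only the first lane of the warp gets a buffer,
        // all other lanes receive NULL.
        if ((threadIdx.x & (warpSize - 1)) != 0)
            return NULL;
        return cudaGetParameterBuffer((size_t)align, (size_t)size);
    }

    extern "C" __device__ void ISPCLaunch(void **handle, void *func, void *paramBuf,
                                          int count0, int count1, int count2) {
        if (paramBuf != NULL)   // only the lane that obtained the buffer launches
            cudaLaunchDevice(func, paramBuf, dim3(count0, count1, count2),
                             dim3(32, 1, 1) /* one warp per task: an assumption */,
                             0, 0);
    }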

ctx.cpp

@@ -1410,7 +1410,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
llvm::Value *
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
if (!g->target->isPTX()) //g->target->getISA() != Target::NVPTX64)
if (g->target->getISA() != Target::NVPTX)
{
llvm::SmallVector<llvm::Constant*, 16> array;
for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
@@ -3540,7 +3540,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
std::vector<llvm::Value *> &argVals,
llvm::Value *launchCount[3]){
if (!g->target->isPTX())
if (g->target->getISA() != Target::NVPTX)
{
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
@@ -3608,7 +3608,79 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
args.push_back(launchCount[2]);
return CallInst(flaunch, NULL, args, "");
}
else /* isPTX == true */
else /* NVPTX */
{
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
return NULL;
}
launchedTasks = true;
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
std::vector<llvm::Type*> argTypes;
for (unsigned int i = 0; i < argVals.size(); i++)
argTypes.push_back(argVals[i]->getType());
llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
if (structSize->getType() != LLVMTypes::Int64Type)
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
"struct_size_to_64");
const int align = 8;
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
AssertPos(currentPos, falloc != NULL);
std::vector<llvm::Value *> allocArgs;
allocArgs.push_back(launchGroupHandlePtr);
allocArgs.push_back(structSize);
allocArgs.push_back(LLVMInt32(align));
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
llvm::BasicBlock* if_true = CreateBasicBlock("if_true");
llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
/* check if the pointer returned by ISPCAlloc is not NULL
* --------------
* this is a workaround for not checking the value of programIndex:
* ISPCAlloc returns a NULL pointer for every lane with programIndex > 0.
* Of course, if ISPCAlloc fails to get a parameter buffer, the pointer for
* programIndex = 0 will also be NULL, so this check must stay; the code
* should also be rewritten to make it less opaque.
*/
llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
BranchInst(if_true, if_false, cmp1);
/**********************/
bblock = if_true;
// label_if_then block:
llvm::Type *pt = llvm::PointerType::getUnqual(st);
llvm::Value *argmem = BitCastInst(voidmem, pt);
for (unsigned int i = 0; i < argVals.size(); ++i)
{
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
// don't need to do masked store here, I think
StoreInst(argVals[i], ptr);
}
BranchInst(if_false);
/**********************/
bblock = if_false;
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
AssertPos(currentPos, flaunch != NULL);
std::vector<llvm::Value *> args;
args.push_back(launchGroupHandlePtr);
args.push_back(fptr);
args.push_back(voidmem);
args.push_back(launchCount[0]);
args.push_back(launchCount[1]);
args.push_back(launchCount[2]);
llvm::Value *ret = CallInst(flaunch, NULL, args, "");
return ret;
}
#if 0
{
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
@@ -3684,13 +3756,16 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
args.push_back(launchCount[2]);
return CallInst(flaunch, NULL, args, "");
}
#endif
}
void
FunctionEmitContext::SyncInst() {
if (!g->target->isPTX())
#if 0
if (g->target->getISA() != Target::NVPTX)
{
#endif
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
llvm::Value *nullPtrValue =
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
@@ -3714,6 +3789,7 @@ FunctionEmitContext::SyncInst() {
BranchInst(bPostSync);
SetCurrentBasicBlock(bPostSync);
#if 0
}
else
{
@@ -3726,6 +3802,7 @@ FunctionEmitContext::SyncInst() {
CallInst(fsync, NULL, launchGroupHandle, "");
StoreInst(nullPtrValue, launchGroupHandlePtr);
}
#endif
}
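Read end to end, the NVPTX branch of LaunchInst above emits IR equivalent to the following C++ (a hedged rendering, not code from the tree; ArgStruct stands for the per-call llvm::StructType built from the argument types, handle for launchGroupHandlePtr):

    // Equivalent of the emitted launch sequence, per the code above.
    void *args = ISPCAlloc(&handle, sizeof(ArgStruct), /* align */ 8);
    if (args != NULL) {                    // "if_true": the launching lane only
        ArgStruct *s = (ArgStruct *)args;
        // plain (unmasked) stores of each argVals[i] into the i-th field of *s
    }
    // "if_false": every lane falls through to the launch call
    ISPCLaunch(&handle, (void *)task_function, args, count0, count1, count2);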


@@ -531,7 +531,7 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
if (isTask && g->target->isPTX()) //getISA() == Target::NVPTX64)
if (isTask && g->target->getISA() == Target::NVPTX)
{
// ds->storageClass = SC_EXTERN_C;
ds->typeQualifiers |= TYPEQUAL_UNMASKED;
@@ -547,7 +547,6 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
"qualifiers");
return;
}
// if (!g->target->isPTX())
if (isExternC && isTask) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
"qualifiers");


@@ -22,7 +22,7 @@ endif
#
ISPC=ispc
ISPC_FLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math
ISPC_FLAGS=-O3 --math-lib=default --target=nvptx --opt=fast-math
#
#
#
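The equivalent direct invocation after the rename, for reference (file names are illustrative; --emit-asm is ispc's existing flag for assembly output, and module.cpp below skips the ".s" suffix check on the NVPTX target, so a .ptx output name is accepted):

    ispc -O3 --math-lib=default --target=nvptx --opt=fast-math --emit-asm \
        kernel.ispc -o kernel.ptx -h kernel_ispc.h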

func.cpp

@@ -125,7 +125,7 @@ Function::Function(Symbol *s, Stmt *c) {
sym->parentFunction = this;
}
if (type->isTask) {
if (type->isTask && g->target->getISA() != Target::NVPTX) {
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
Assert(threadIndexSym);
threadCountSym = m->symbolTable->LookupVariable("threadCount");
@@ -237,12 +237,122 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
#endif
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isTask == true && g->target->getISA() != Target::NVPTX) {
// For tasks, there should always be three parameters: the
// pointer to the structure that holds all of the arguments, the
// thread index, and the thread count variables.
llvm::Function::arg_iterator argIter = function->arg_begin();
llvm::Value *structParamPtr = argIter++;
llvm::Value *threadIndex = argIter++;
llvm::Value *threadCount = argIter++;
llvm::Value *taskIndex = argIter++;
llvm::Value *taskCount = argIter++;
llvm::Value *taskIndex0 = argIter++;
llvm::Value *taskIndex1 = argIter++;
llvm::Value *taskIndex2 = argIter++;
llvm::Value *taskCount0 = argIter++;
llvm::Value *taskCount1 = argIter++;
llvm::Value *taskCount2 = argIter++;
// Copy the function parameter values from the structure into local
// storage
for (unsigned int i = 0; i < args.size(); ++i)
lCopyInTaskParameter(i, structParamPtr, args, ctx);
if (type->isUnmasked == false) {
// Copy in the mask as well.
int nArgs = (int)args.size();
// The mask is the last parameter in the argument structure
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
"task_struct_mask");
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
ctx->SetFunctionMask(ptrval);
}
// Copy threadIndex and threadCount into stack-allocated storage so
// that their symbols point to something reasonable.
threadIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadIndex");
ctx->StoreInst(threadIndex, threadIndexSym->storagePtr);
threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
ctx->StoreInst(threadCount, threadCountSym->storagePtr);
// Copy taskIndex and taskCount into stack-allocated storage so
// that their symbols point to something reasonable.
taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
ctx->StoreInst(taskCount, taskCountSym->storagePtr);
taskIndexSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex0");
ctx->StoreInst(taskIndex0, taskIndexSym0->storagePtr);
taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1");
ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr);
taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2");
ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr);
taskCountSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount0");
ctx->StoreInst(taskCount0, taskCountSym0->storagePtr);
taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1");
ctx->StoreInst(taskCount1, taskCountSym1->storagePtr);
taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2");
ctx->StoreInst(taskCount2, taskCountSym2->storagePtr);
}
else {
// Regular, non-task function
llvm::Function::arg_iterator argIter = function->arg_begin();
for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
Symbol *sym = args[i];
if (sym == NULL)
// anonymous function parameter
continue;
argIter->setName(sym->name.c_str());
// Allocate stack storage for the parameter and emit code
// to store its value there.
sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
ctx->StoreInst(argIter, sym->storagePtr);
ctx->EmitFunctionParameterDebugInfo(sym, i);
}
// If the number of actual function arguments is equal to the
// number of declared arguments in decl->functionParams, then we
// don't have a mask parameter, so set it to be all on. This
// happens for example with 'export'ed functions that the app
// calls.
if (argIter == function->arg_end()) {
Assert(type->isUnmasked || type->isExported);
ctx->SetFunctionMask(LLVMMaskAllOn);
}
else {
Assert(type->isUnmasked == false);
// Otherwise use the mask to set the entry mask value
argIter->setName("__mask");
Assert(argIter->getType() == LLVMTypes::MaskType);
ctx->SetFunctionMask(argIter);
Assert(++argIter == function->arg_end());
}
if (type->isTask == true && g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(function);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(LLVMInt32(1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
}
}
#if 0
if (type->isTask == true) {
// For tasks, there should always be three parameters: the
// pointer to the structure that holds all of the arguments, the
// thread index, and the thread count variables.
if (!g->target->isPTX()) //if (g->target->getISA() != Target::NVPTX64)
if (g->target->getISA() != Target::NVPTX)
{
llvm::Function::arg_iterator argIter = function->arg_begin();
llvm::Value *structParamPtr = argIter++;
@@ -341,7 +451,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
Assert(type->isUnmasked || type->isExported);
ctx->SetFunctionMask(LLVMMaskAllOn);
}
else /* for NVPTX64 , function must be unmasked */
else /* for NVPTX, function must be unmasked */
{
assert(0);
Assert(type->isUnmasked == false);
@@ -353,7 +463,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
Assert(++argIter == function->arg_end());
}
if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
@@ -402,6 +512,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
Assert(++argIter == function->arg_end());
}
}
#endif
// Finally, we can generate code for the function
if (code != NULL) {
@@ -535,26 +646,12 @@ Function::GenerateIR() {
}
// And we can now go ahead and emit the code
/* export functions with the NVPTX64 target should be emitted for the host architecture */
#if 0
const FunctionType *func_type= CastType<FunctionType>(sym->type);
if (g->target->getISA() == Target::NVPTX64 && func_type->isExported)
return;
#endif
#if 0
if (g->target->getISA() != Target::NVPTX64 && g->target->isPTX() && func_type->isTask)
return;
#endif
// if (!(g->target->getISA()==Target::NVPTX64 && func_type->isExported))
{
FunctionEmitContext ec(this, sym, function, firstStmtPos);
emitCode(&ec, function, firstStmtPos);
}
if (m->errorCount == 0) {
// if (!(g->target->getISA() == Target::NVPTX64 && func_type->isExported))
if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) {
if (g->debugPrint)
function->dump();
@@ -566,9 +663,9 @@ Function::GenerateIR() {
// the application can call it
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isExported) { // && g->target->getISA() != Target::VPTX64) {
if (type->isExported) {
if (!type->isTask) {
if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
@@ -585,7 +682,7 @@ Function::GenerateIR() {
if (g->mangleFunctionsWithTarget)
functionName += std::string("_") + g->target->GetISAString();
if (g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
functionName += std::string("___export");
llvm::Function *appFunction =
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
@@ -615,7 +712,7 @@ Function::GenerateIR() {
FATAL("Function verificication failed");
}
}
if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
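For reference, these nvvm.annotations entries are what mark a function as a kernel entry for the NVVM/PTX backend; in the textual IR of the output module they look roughly like this (the signature and name are a hypothetical example; exported functions receive the ___export suffix per the mangling code above):

    !nvvm.annotations = !{!0}
    !0 = metadata !{void (float*, i32)* @foo___export, metadata !"kernel", i32 1}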


@@ -174,7 +174,7 @@ static const char *supportedCPUs[] = {
#endif // LLVM 3.4+
};
Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, bool isPTX) :
Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
m_target(NULL),
m_targetMachine(NULL),
#if defined(LLVM_3_1)
@@ -184,7 +184,6 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
#endif
m_valid(false),
m_isa(SSE2),
m_isPTX(isPTX),
m_arch(""),
m_is32Bit(true),
m_cpu(""),
@@ -212,7 +211,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
if (!strcmp(cpu, "core-avx2"))
isa = "avx2-i32x8";
else if (!strcmp(cpu, "sm_35"))
isa = "nvptx64";
isa = "nvptx";
#ifdef ISPC_ARM_ENABLED
else if (!strcmp(cpu, "cortex-a9") ||
!strcmp(cpu, "cortex-a15"))
@@ -249,7 +248,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
cpu = "cortex-a9";
#endif
if (cpu == NULL && !strcmp(isa, "nvptx64"))
if (cpu == NULL && !strcmp(isa, "nvptx"))
cpu = "sm_35";
if (cpu == NULL) {
@@ -280,8 +279,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
this->m_cpu = cpu;
if (arch == NULL) {
if (!strcmp(isa, "nvptx64"))
arch = "nvptx64";
if (!strcmp(isa, "nvptx"))
arch = "nvptx";
#ifdef ISPC_ARM_ENABLED
else if (!strncmp(isa, "neon", 4))
arch = "arm";
@@ -709,10 +708,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
this->m_maskBitCount = 32;
}
#endif
else if (!strcasecmp(isa, "nvptx64"))
else if (!strcasecmp(isa, "nvptx"))
{
this->m_isa = Target::NVPTX64;
this->m_isPTX = true;
this->m_isa = Target::NVPTX;
this->m_nativeVectorWidth = 32;
this->m_nativeVectorAlignment = 32;
this->m_vectorWidth = 1;
@@ -780,7 +778,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-"
"i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-"
"f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128";
} else if (m_isa == Target::NVPTX64)
} else if (m_isa == Target::NVPTX)
{
dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
}
@@ -803,7 +801,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
// Initialize target-specific "target-feature" attribute.
if (!m_attributes.empty()) {
llvm::AttrBuilder attrBuilder;
if (m_isa != Target::NVPTX64)
if (m_isa != Target::NVPTX)
attrBuilder.addAttribute("target-cpu", this->m_cpu);
attrBuilder.addAttribute("target-features", this->m_attributes);
this->m_tf_attributes = new llvm::AttributeSet(
@@ -838,7 +836,7 @@ Target::SupportedCPUs() {
const char *
Target::SupportedArchs() {
return "nvptx64, "
return "nvptx, "
#ifdef ISPC_ARM_ENABLED
"arm, "
#endif
@@ -848,7 +846,7 @@ Target::SupportedArchs() {
const char *
Target::SupportedTargets() {
return "nvptx64, "
return "nvptx, "
#ifdef ISPC_ARM_ENABLED
"neon-i8x16, neon-i16x8, neon-i32x4, "
#endif
@@ -866,9 +864,9 @@ Target::SupportedTargets() {
std::string
Target::GetTripleString() const {
llvm::Triple triple;
if (m_arch == "nvptx64")
if (m_arch == "nvptx")
{
triple.setTriple("nvptx64");
triple.setTriple("nvptx");
}
#ifdef ISPC_ARM_ENABLED
else if (m_arch == "arm") {
@@ -902,8 +900,8 @@ Target::GetTripleString() const {
const char *
Target::ISAToString(ISA isa) {
switch (isa) {
case Target::NVPTX64:
return "nvptx64";
case Target::NVPTX:
return "nvptx";
#ifdef ISPC_ARM_ENABLED
case Target::NEON8:
return "neon-8";

ispc.h

@@ -179,7 +179,7 @@ public:
flexible/performant of them will appear last in the enumerant. Note
also that __best_available_isa() needs to be updated if ISAs are
added or the enumerant values are reordered. */
enum ISA { NVPTX64,
enum ISA { NVPTX,
#ifdef ISPC_ARM_ENABLED
NEON32, NEON16, NEON8,
#endif
@@ -189,7 +189,7 @@ public:
/** Initializes the given Target pointer for a target of the given
name, if the name is a known target. Returns true if the
target was initialized and false if the name is unknown. */
Target(const char *arch, const char *cpu, const char *isa, bool pic, bool isPTX = false);
Target(const char *arch, const char *cpu, const char *isa, bool pic);
/** Returns a comma-delimited string giving the names of the currently
supported compilation targets. */
@@ -251,7 +251,6 @@ public:
bool isValid() const {return m_valid;}
ISA getISA() const {return m_isa;}
bool isPTX() const {return m_isPTX;}
std::string getArch() const {return m_arch;}
@@ -310,7 +309,6 @@ private:
/** Instruction set being compiled to. */
ISA m_isa;
bool m_isPTX;
/** Target system architecture. (e.g. "x86-64", "x86"). */
std::string m_arch;
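With the isPTX flag gone, selecting the PTX backend is purely ISA-driven. A hedged example of constructing such a target, matching the driver code in module.cpp below (NULL arch/cpu are defaulted to "nvptx"/"sm_35" by the constructor, per ispc.cpp above):

    Target *t = new Target(/* arch */ NULL, /* cpu */ NULL, /* isa */ "nvptx",
                           /* pic */ false);
    if (!t->isValid()) {
        // unknown target name; the driver bails out with an error
    }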


@@ -733,7 +733,7 @@ Module::AddFunctionDeclaration(const std::string &name,
if (storageClass == SC_EXTERN_C) {
// Make sure the user hasn't supplied both an 'extern "C"' and a
// 'task' qualifier with the function
if (functionType->isTask) //&& !g->target->isPTX()) //tISA() != Target::NVPTX64)
if (functionType->isTask)
{
Error(pos, "\"task\" qualifier is illegal with C-linkage extern "
"function \"%s\". Ignoring this function.", name.c_str());
@@ -796,8 +796,8 @@ Module::AddFunctionDeclaration(const std::string &name,
#else // LLVM 3.1 and 3.3+
function->addFnAttr(llvm::Attribute::AlwaysInline);
#endif
/* evghenii: on PTX target this must not be used, cause crash, dunno why */
if (functionType->isTask && g->target->getISA() != Target::NVPTX64)
/* evghenii: on the PTX target the following must not be set; it causes a crash, reason unknown */
if (functionType->isTask && g->target->getISA() != Target::NVPTX)
// This also applies transitively to members I think?
#if defined(LLVM_3_1)
function->setDoesNotAlias(1, true);
@@ -953,7 +953,7 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
const char *fileType = NULL;
switch (outputType) {
case Asm:
if (g->target->getISA() != Target::NVPTX64)
if (g->target->getISA() != Target::NVPTX)
{
if (strcasecmp(suffix, "s"))
fileType = "assembly";
@@ -1053,7 +1053,7 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) {
}
llvm::raw_fd_ostream fos(fd, (fd != 1), false);
if (g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
module->setDataLayout(dl_string);
@@ -1925,7 +1925,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
opts.addMacroDef(g->cppArgs[i].substr(2));
}
}
if (g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
opts.addMacroDef("__NVPTX__");
opts.addMacroDef("programIndex=laneIndex()");
@@ -2331,135 +2331,9 @@ Module::CompileAndOutput(const char *srcFile,
const char *hostStubFileName,
const char *devStubFileName)
{
char ptxname[] = "nvptx64";
for (int k = 0; k < 7; k++)
ptxname[k] = target[k];
if (0) //target != NULL && strcmp(ptxname,"nvptx64") == 0) // NVPTX64
{
std::vector<std::string> targets = lExtractTargets(target);
Assert(targets.size() > 1);
if (target == NULL || strchr(target, ',') == NULL) {
// We're only compiling to a single target
int errorCount = 0;
const char *suffix_orig = strrchr(outFileName, '.');
++suffix_orig;
assert(suffix_orig!=NULL);
g->PtxString = std::string();
for (int itarget = 0; itarget < 1; itarget++)
{
fprintf(stderr, "compiling nvptx64 : target= %s\n",targets[itarget].c_str());
g->target = new Target(arch, cpu, targets[itarget].c_str(), generatePIC, /* isPTX= */ true);
if (!g->target->isValid())
return 1;
m = new Module(srcFile);
if (m->CompileFile() == 0) {
if (outputType == CXX) {
if (target == NULL || strncmp(target, "generic-", 8) != 0) {
Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" "
"targets must be used.");
return 1;
}
}
else if (outputType == Asm || outputType == Object) {
if (target != NULL && strncmp(target, "generic-", 8) == 0) {
Error(SourcePos(), "When using a \"generic-*\" compilation target, "
"%s output can not be used.",
(outputType == Asm) ? "assembly" : "object file");
return 1;
}
}
assert(outFileName != NULL);
std::string targetOutFileName =
lGetTargetFileName(outFileName, targets[itarget].c_str());
if (outputType == Asm)
{
const char * targetOutFileName_c = targetOutFileName.c_str();
const int suffix = strrchr(targetOutFileName_c, '.') - targetOutFileName_c + 1;
if (itarget == 1 && !strcasecmp(suffix_orig, "ptx"))
{
targetOutFileName[suffix ] = 's';
targetOutFileName[suffix+1] = 0;
}
}
if (outputType != Object)
{
if (!m->writeOutput(outputType, targetOutFileName.c_str(), includeFileName))
return 1;
}
else if (itarget > 0)
{
if (!m->writeOutput(outputType, outFileName, includeFileName))
return 1;
}
if (itarget == 0)
{ /* store ptx into memory */
llvm::PassManager pm;
#if defined(LLVM_3_1)
pm.add(new llvm::TargetData(*g->target->getDataLayout()));
#else
pm.add(new llvm::DataLayout(*g->target->getDataLayout()));
#endif
llvm::raw_string_ostream rso(g->PtxString);
llvm::formatted_raw_ostream fos(rso);
llvm::TargetMachine::CodeGenFileType fileType = llvm::TargetMachine::CGFT_AssemblyFile;
llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
if (targetMachine->addPassesToEmitFile(pm, fos, fileType)) {
fprintf(stderr, "Fatal error adding passes to emit object file!");
exit(1);
}
llvm::Module *module = m->module;
pm.run(*module);
fos.flush();
assert(!g->PtxString.empty());
#if 0
std::cout << g->PtxString << std::endl;
#endif
}
if (itarget > 0)
{
if (headerFileName != NULL)
if (!m->writeOutput(Module::Header, headerFileName))
return 1;
if (depsFileName != NULL)
if (!m->writeOutput(Module::Deps,depsFileName))
return 1;
if (hostStubFileName != NULL)
if (!m->writeOutput(Module::HostStub,hostStubFileName))
return 1;
if (devStubFileName != NULL)
if (!m->writeOutput(Module::DevStub,devStubFileName))
return 1;
}
}
else
++m->errorCount;
errorCount += m->errorCount;
delete m;
m = NULL;
delete g->target;
g->target = NULL;
}
return errorCount > 0;
}
else if (target == NULL || strchr(target, ',') == NULL) {
// We're only compiling to a single target
const bool isPTX = strcmp(target, "nvptx64") == 0;
g->target = new Target(arch, cpu, target, generatePIC, isPTX);
g->target = new Target(arch, cpu, target, generatePIC);
if (!g->target->isValid())
return 1;
@@ -2525,8 +2399,6 @@ Module::CompileAndOutput(const char *srcFile,
// The user supplied multiple targets
std::vector<std::string> targets = lExtractTargets(target);
Assert(targets.size() > 1);
for (unsigned int i = 0; i < targets.size(); ++i)
assert(strcmp(targets[i].c_str(), "nvptx64") < 0);
if (outFileName != NULL && strcmp(outFileName, "-") == 0) {
Error(SourcePos(), "Multi-target compilation can't generate output "

stmt.cpp

@@ -206,7 +206,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
}
if (sym->storageClass == SC_STATIC) {
if (g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
if (!sym->type->IsConstType())
Error(initExpr->pos, "Non-constant static variable \"%s\" is not supported with the \"cuda\" target.",
sym->name.c_str());
@@ -1280,7 +1280,7 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
llvm::Value *varyingCounterPtr,
const std::vector<int> &spans)
{
if (!g->target->isPTX())
if (g->target->getISA() != Target::NVPTX)
{
// Smear the uniform counter value out to be varying
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
@@ -1315,7 +1315,7 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
ctx->StoreInst(varyingCounter, varyingCounterPtr);
return varyingCounter;
}
else /* isPTX() == true */
else /* NVPTX target */
{
// Smear the uniform counter value out to be varying
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
@@ -1465,8 +1465,6 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
if (ctx->GetCurrentBasicBlock() == NULL || stmts == NULL)
return;
if (!g->target->isPTX())
{
llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body");
llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body");
llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit");
@@ -1493,469 +1491,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
std::vector<int> span(nDims, 0);
lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]);
for (int i = 0; i < nDims; ++i) {
// Basic blocks that we'll fill in later with the looping logic for
// this dimension.
bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
if (i < nDims-1)
// stepping for the innermost dimension is handled specially
bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
// Start and end value for this loop dimension
llvm::Value *sv = startExprs[i]->GetValue(ctx);
llvm::Value *ev = endExprs[i]->GetValue(ctx);
if (sv == NULL || ev == NULL)
return;
startVals.push_back(sv);
endVals.push_back(ev);
// nItems = endVal - startVal
llvm::Value *nItems =
ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");
// nExtras = nItems % (span for this dimension)
// This gives us the number of extra elements we need to deal with
// at the end of the loop for this dimension that don't fit cleanly
// into a vector width.
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
LLVMInt32(span[i]), "nextras"));
// alignedEnd = endVal - nExtras
alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev,
nExtras[i], "aligned_end"));
///////////////////////////////////////////////////////////////////////
// Each dimension has a loop counter that is a uniform value that
// goes from startVal to endVal, in steps of the span for this
// dimension. Its value is only used internally here for looping
// logic and isn't directly available in the user's program code.
uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type,
"counter"));
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
// There is also a varying variable that holds the set of index
// values for each dimension in the current loop iteration; this is
// the value that is program-visible.
dimVariables[i]->storagePtr =
ctx->AllocaInst(LLVMTypes::Int32VectorType,
dimVariables[i]->name.c_str());
dimVariables[i]->parentFunction = ctx->GetFunction();
ctx->EmitVariableDebugInfo(dimVariables[i]);
// Each dimension also maintains a mask that represents which of
// the varying elements in the current iteration should be
// processed. (i.e. this is used to disable the lanes that have
// out-of-bounds offsets.)
extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask"));
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
}
ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR);
// On to the outermost loop's test
ctx->BranchInst(bbTest[0]);
///////////////////////////////////////////////////////////////////////////
// foreach_reset: this code runs when we need to reset the counter for
// a given dimension in preparation for running through its loop again,
// after the enclosing level advances its counter.
for (int i = 0; i < nDims; ++i) {
ctx->SetCurrentBasicBlock(bbReset[i]);
if (i == 0)
ctx->BranchInst(bbExit);
else {
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
ctx->BranchInst(bbStep[i-1]);
}
}
///////////////////////////////////////////////////////////////////////////
// foreach_step: increment the uniform counter by the vector width.
// Note that we don't increment the varying counter here as well but
// just generate its value when we need it in the loop body. Don't do
// this for the innermost dimension, which has a more complex stepping
// structure..
for (int i = 0; i < nDims-1; ++i) {
ctx->SetCurrentBasicBlock(bbStep[i]);
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[i]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
ctx->BranchInst(bbTest[i]);
}
///////////////////////////////////////////////////////////////////////////
// foreach_test (for all dimensions other than the innermost...)
std::vector<llvm::Value *> inExtras;
for (int i = 0; i < nDims-1; ++i) {
ctx->SetCurrentBasicBlock(bbTest[i]);
llvm::Value *haveExtras =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT,
endVals[i], alignedEnd[i], "have_extras");
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter");
llvm::Value *atAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
counter, alignedEnd[i], "at_aligned_end");
llvm::Value *inEx =
ctx->BinaryOperator(llvm::Instruction::And, haveExtras,
atAlignedEnd, "in_extras");
if (i == 0)
inExtras.push_back(inEx);
else
inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx,
inExtras[i-1], "in_extras_all"));
llvm::Value *varyingCounter =
lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i],
dimVariables[i]->storagePtr, span);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[i], LLVMTypes::Int32VectorType, "smear_end");
// Do a vector compare of its value to the end value to generate a
// mask for this last bit of work.
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
if (i == 0)
ctx->StoreInst(emask, extrasMaskPtrs[i]);
else {
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
"extras_mask");
ctx->StoreInst(newMask, extrasMaskPtrs[i]);
}
llvm::Value *notAtEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[i]);
ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
}
///////////////////////////////////////////////////////////////////////////
// foreach_test (for innermost dimension)
//
// All of the outer dimensions are handled generically--basically as a
// for() loop from the start value to the end value, where at each loop
// test, we compute the mask of active elements for the current
// dimension and then update an overall mask that is the AND
// combination of all of the outer ones.
//
// The innermost loop is handled specially, for performance purposes.
// When starting the innermost dimension, we start by checking once
// whether any of the outer dimensions has set the mask to be
// partially-active or not. We follow different code paths for these
// two cases, taking advantage of the knowledge that the mask is all
// on, when this is the case.
//
// In each of these code paths, we start with a loop from the starting
// value to the aligned end value for the innermost dimension; we can
// guarantee that the innermost loop will have an "all on" mask (as far
// as its dimension is concerned) for the duration of this loop. Doing
// so allows us to emit code that assumes the mask is all on (for the
// case where none of the outer dimensions has set the mask to be
// partially on), or allows us to emit code that just uses the mask
// from the outer dimensions directly (for the case where they have).
//
// After this loop, we just need to deal with one vector's worth of
// "ragged extra bits", where the mask used includes the effect of the
// mask for the innermost dimension.
//
// We start out this process by emitting the check that determines
// whether any of the enclosing dimensions is partially active
// (i.e. processing extra elements that don't exactly fit into a
// vector).
llvm::BasicBlock *bbOuterInExtras =
ctx->CreateBasicBlock("outer_in_extras");
llvm::BasicBlock *bbOuterNotInExtras =
ctx->CreateBasicBlock("outer_not_in_extras");
ctx->SetCurrentBasicBlock(bbTest[nDims-1]);
if (inExtras.size())
ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
inExtras.back());
else
// for a 1D iteration domain, we certainly don't have any enclosing
// dimensions that are processing extra elements.
ctx->BranchInst(bbOuterNotInExtras);
///////////////////////////////////////////////////////////////////////////
// One or more outer dimensions in extras, so we need to mask for the loop
// body regardless. We break this into two cases, roughly:
// for (counter = start; counter < alignedEnd; counter += step) {
// // mask is all on for inner, so set mask to outer mask
// // run loop body with mask
// }
// // counter == alignedEnd
// if (counter < end) {
// // set mask to outermask & (counter+programCounter < end)
// // run loop body with mask
// }
llvm::BasicBlock *bbAllInnerPartialOuter =
ctx->CreateBasicBlock("all_inner_partial_outer");
llvm::BasicBlock *bbPartial =
ctx->CreateBasicBlock("both_partial");
ctx->SetCurrentBasicBlock(bbOuterInExtras); {
// Update the varying counter value here, since all subsequent
// blocks along this path need it.
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
// here we just check to see if counter < alignedEnd
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
}
// Below we have a basic block that runs the loop body code for the
// case where the mask is partially but not fully on. This same block
// runs in multiple cases: both for handling any ragged extra data for
// the innermost dimension but also when outer dimensions have set the
// mask to be partially on.
//
// The value stored in stepIndexAfterMaskedBodyPtr is used after each
// execution of the body code to determine whether the innermost index
// value should be incremented by the step (we're running the "for"
// loop of full vectors at the innermost dimension, with outer
// dimensions having set the mask to be partially on), or whether we're
// running once for the ragged extra bits at the end of the innermost
// dimension, in which case we're done with the innermost dimension and
// should step the loop counter for the next enclosing dimension
// instead.
llvm::Value *stepIndexAfterMaskedBodyPtr =
ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
///////////////////////////////////////////////////////////////////////////
// We're in the inner loop part where the only masking is due to outer
// dimensions but the innermost dimension fits fully into a vector's
// width. Set the mask and jump to the masked loop body.
ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); {
llvm::Value *mask;
if (nDims == 1)
// 1D loop; we shouldn't ever get here anyway
mask = LLVMMaskAllOff;
else
mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
ctx->SetInternalMask(mask);
ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// We need to include the effect of the innermost dimension in the mask
// for the final bits here
ctx->SetCurrentBasicBlock(bbPartial); {
llvm::Value *varyingCounter =
ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
if (nDims == 1) {
ctx->SetInternalMask(emask);
}
else {
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
"extras_mask");
ctx->SetInternalMask(newMask);
}
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// None of the outer dimensions is processing extras; along the lines
// of above, we can express this as:
// for (counter = start; counter < alignedEnd; counter += step) {
// // mask is all on
// // run loop body with mask all on
// }
// // counter == alignedEnd
// if (counter < end) {
// // set mask to (counter+programCounter < end)
// // run loop body with mask
// }
llvm::BasicBlock *bbPartialInnerAllOuter =
ctx->CreateBasicBlock("partial_inner_all_outer");
ctx->SetCurrentBasicBlock(bbOuterNotInExtras); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
beforeAlignedEnd);
}
///////////////////////////////////////////////////////////////////////////
// full_body: do a full vector's worth of work. We know that all
// lanes will be running here, so we explicitly set the mask to be 'all
// on'. This ends up being relatively straightforward: just update the
// value of the varying loop counter and have the statements in the
// loop body emit their code.
llvm::BasicBlock *bbFullBodyContinue =
ctx->CreateBasicBlock("foreach_full_continue");
ctx->SetCurrentBasicBlock(bbFullBody); {
ctx->SetInternalMask(LLVMMaskAllOn);
ctx->SetBlockEntryMask(LLVMMaskAllOn);
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
ctx->SetContinueTarget(bbFullBodyContinue);
ctx->AddInstrumentationPoint("foreach loop body (all on)");
stmts->EmitCode(ctx);
AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
ctx->BranchInst(bbFullBodyContinue);
}
ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
ctx->RestoreContinuedLanes();
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterNotInExtras);
}
///////////////////////////////////////////////////////////////////////////
// We're done running blocks with the mask all on; see if the counter is
// less than the end value, in which case we need to run the body one
// more time to get the extra bits.
llvm::BasicBlock *bbSetInnerMask =
ctx->CreateBasicBlock("partial_inner_only");
ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeFullEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[nDims-1], "before_full_end");
ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
}
///////////////////////////////////////////////////////////////////////////
// The outer dimensions are all on, so the mask is just given by the
// mask for the innermost dimension
ctx->SetCurrentBasicBlock(bbSetInnerMask); {
llvm::Value *varyingCounter =
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
ctx->SetInternalMask(emask);
ctx->SetBlockEntryMask(emask);
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// masked_body: set the mask and have the statements emit their
// code again. Note that it's generally worthwhile having two copies
// of the statements' code, since the code above is emitted with the
// mask known to be all-on, which in turn leads to more efficient code
// for that case.
llvm::BasicBlock *bbStepInnerIndex =
ctx->CreateBasicBlock("step_inner_index");
llvm::BasicBlock *bbMaskedBodyContinue =
ctx->CreateBasicBlock("foreach_masked_continue");
ctx->SetCurrentBasicBlock(bbMaskedBody); {
ctx->AddInstrumentationPoint("foreach loop body (masked)");
ctx->SetContinueTarget(bbMaskedBodyContinue);
ctx->DisableGatherScatterWarnings();
ctx->SetBlockEntryMask(ctx->GetFullMask());
stmts->EmitCode(ctx);
ctx->EnableGatherScatterWarnings();
ctx->BranchInst(bbMaskedBodyContinue);
}
ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); {
ctx->RestoreContinuedLanes();
llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
}
///////////////////////////////////////////////////////////////////////////
// step the innermost index, for the case where we're doing the
// innermost for loop over full vectors.
ctx->SetCurrentBasicBlock(bbStepInnerIndex); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterInExtras);
}
///////////////////////////////////////////////////////////////////////////
// foreach_exit: All done. Restore the old mask and clean up
ctx->SetCurrentBasicBlock(bbExit);
ctx->SetInternalMask(oldMask);
ctx->SetFunctionMask(oldFunctionMask);
ctx->EndForeach();
ctx->EndScope();
}
else /* isPTX() == true */
{
llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body");
llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body");
llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit");
llvm::Value *oldMask = ctx->GetInternalMask();
llvm::Value *oldFunctionMask = ctx->GetFunctionMask();
ctx->SetDebugPos(pos);
ctx->StartScope();
ctx->SetInternalMask(LLVMMaskAllOn);
ctx->SetFunctionMask(LLVMMaskAllOn);
// This should be caught during typechecking
AssertPos(pos, startExprs.size() == dimVariables.size() &&
endExprs.size() == dimVariables.size());
int nDims = (int)dimVariables.size();
///////////////////////////////////////////////////////////////////////
// Setup: compute the number of items we have to work on in each
// dimension and a number of derived values.
std::vector<llvm::BasicBlock *> bbReset, bbStep, bbTest;
std::vector<llvm::Value *> startVals, endVals, uniformCounterPtrs;
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
std::vector<int> span(nDims, 0);
const int vectorWidth = 32;
const int vectorWidth =
g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth();
lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]);
#if 0
for (int i = 0; i < nDims; i++)
{
fprintf(stderr, " i= %d [ %d ] : %d \n",
i, nDims, span[i]);
}
fprintf(stderr, " --- \n");
#endif
for (int i = 0; i < nDims; ++i) {
// Basic blocks that we'll fill in later with the looping logic for
@@ -2380,7 +1918,6 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
ctx->EndForeach();
ctx->EndScope();
}
}
Stmt *


@@ -2925,7 +2925,7 @@ FunctionType::GetReturnTypeString() const {
llvm::FunctionType *
FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
if (isTask == true) // && !g->target->isPTX()) //getISA() != Target::NVPTX64)
if (isTask == true)
Assert(removeMask == false);
// Get the LLVM Type *s for the function arguments
@@ -2950,15 +2950,12 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
llvmArgTypes.push_back(LLVMTypes::MaskType);
std::vector<llvm::Type *> callTypes;
if (isTask) {
if (isTask && g->target->getISA() != Target::NVPTX) {
// Tasks take three arguments: a pointer to a struct that holds the
// actual task arguments, the thread index, and the total number of
// threads the tasks system has running. (Task arguments are
// marshalled in a struct so that it's easy to allocate space to
// hold them until the task actually runs.)
// if (g->target->getISA() != Target::NVPTX64)
if (!g->target->isPTX())
{
llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes);
callTypes.push_back(llvm::PointerType::getUnqual(st));
callTypes.push_back(LLVMTypes::Int32Type); // threadIndex
@@ -2972,22 +2969,11 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
callTypes.push_back(LLVMTypes::Int32Type); // taskCount1
callTypes.push_back(LLVMTypes::Int32Type); // taskCount2
}
else
{
if (g->target->getISA() == Target::NVPTX64)
callTypes = llvmArgTypes;
else
{
assert(0); /* evghenii: must be removed in final, just for test for nvptx64 target */
llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes);
callTypes.push_back(llvm::PointerType::getUnqual(st));
}
}
}
else
// Otherwise we already have the types of the arguments
callTypes = llvmArgTypes;
if (returnType == NULL) {
Assert(m->errorCount > 0);
return NULL;
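The net effect of this change on task signatures, sketched as C++ comments (the task name and parameter list are hypothetical):

    // Default targets marshal the arguments into a struct and append the
    // launch bookkeeping parameters (cf. the callTypes pushes above):
    //   void my_task(ArgStruct *args,
    //                int threadIndex, int threadCount, int taskIndex, int taskCount,
    //                int taskIndex0, int taskIndex1, int taskIndex2,
    //                int taskCount0, int taskCount1, int taskCount2);
    // On nvptx the kernel instead takes its declared parameters directly:
    //   void my_task(float *out, float scale, int n);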