diff --git a/builtins.cpp b/builtins.cpp
index fc21a7c3..397af3c3 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -654,14 +654,17 @@ lSetInternalFunctions(llvm::Module *module) {
         "__vec4_add_int32",
         "__vselect_float",
         "__vselect_i32",
-        "__tid_x",
-        "__ctaid_x",
-        "__ctaid_y",
-        "__ctaid_z",
-        "__nctaid_x",
-        "__nctaid_y",
-        "__nctaid_z",
-        "__warpsize",
+        "__program_index",
+        "__program_count",
+        "__warp_index",
+        "__task_index0",
+        "__task_index1",
+        "__task_index2",
+        "__task_index",
+        "__task_count0",
+        "__task_count1",
+        "__task_count2",
+        "__task_count",
         "__cvt_loc2gen",
         "__cvt_loc2gen_var",
         "__cvt_const2gen"
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index 3bc2852e..2a61c013 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -17,57 +17,91 @@ declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() nounwind readnone
 
-define i32 @__tid_x()  nounwind readnone alwaysinline
+;;;;;;;;;;
+
+define i32 @__program_index()  nounwind readnone alwaysinline ;; returns tid.x masked to its low 5 bits
 {
  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
- ret i32 %tid
+ %program_index = and i32 %tid, 31 ;; keep bits 0..4 which equals tid.x mod 32
+ ret i32 %program_index
 }
-define i32 @__warpsize()  nounwind readnone alwaysinline
+define i32 @__program_count()  nounwind readnone alwaysinline ;; returns the constant 32; the warpsize sreg read in the body stays commented out
 {
 ;; %tid = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 ;; ret i32 %tid
   ret i32 32
 }
-define i32 @__laneidx() nounwind readnone alwaysinline
+define i32 @__warp_index() nounwind readnone alwaysinline ;; returns tid.x with its low 5 bits dropped
 {
-  %tid = tail call i32 @__tid_x()
-  %lane = and i32 %tid, 31
-  ret i32 %lane;
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %warp_index = lshr i32 %tid, 5 ;; logical right move by 5 bits which equals tid.x / 32
+  ret i32 %warp_index
 }
 
+;;;;;;;;;;;;
 
-define i32 @__ctaid_x()  nounwind readnone alwaysinline
+define i32 @__task_index0()  nounwind readnone alwaysinline ;; returns ctaid.x * 4 + __warp_index()
 {
- %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
- ret i32 %bid
+ %bid  = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+ %bid4 = shl i32 %bid, 2 ;; ctaid.x * 4; the stdlib.ispc code removed by this patch computed the factor as CUDABLOCKSIZE (128) >> WARPSIZE2 (5)
+ %warp_index = call i32 @__warp_index()
+ %task_index0 = add i32 %bid4, %warp_index ;; NOTE(review): presumably relies on 128-thread blocks so that __warp_index() stays below 4 -- confirm launch config
+ ret i32 %task_index0
 }
-define i32 @__ctaid_y()  nounwind readnone alwaysinline
+define i32 @__task_index1()  nounwind readnone alwaysinline ;; returns the ctaid.y sreg unchanged
 {
- %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
- ret i32 %bid
+ %task_index1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+ ret i32 %task_index1
 }
-define i32 @__ctaid_z()  nounwind readnone alwaysinline
+define i32 @__task_index2()  nounwind readnone alwaysinline ;; returns the ctaid.z sreg unchanged
 {
- %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
- ret i32 %bid
+ %task_index2 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+ ret i32 %task_index2
+}
+define i32 @__task_index()  nounwind readnone alwaysinline ;; returns ti0 + tc0 * (ti1 + tc1 * ti2)
+{
+  %ti0 = call i32 @__task_index0()
+  %ti1 = call i32 @__task_index1()
+  %ti2 = call i32 @__task_index2()
+  %tc0 = call i32 @__task_count0()
+  %tc1 = call i32 @__task_count1()
+  %mul1 = mul i32 %tc1, %ti2
+  %add1 = add i32 %mul1, %ti1 ;; ti1 + tc1 * ti2
+  %mul2 = mul i32 %add1, %tc0 ;; scaled by tc0; __task_count2 is not needed for this value
+  %task_index = add i32 %mul2, %ti0
+  ret i32 %task_index
 }
 
-define i32 @__nctaid_x()  nounwind readnone alwaysinline
+;;;;;
+
+define i32 @__task_count0()  nounwind readnone alwaysinline ;; returns nctaid.x * 4
 {
  %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
- ret i32 %nb
+ %task_count0 = shl i32 %nb, 2 ;; same factor of 4 that __task_index0 applies to ctaid.x
+ ret i32 %task_count0
 }
-define i32 @__nctaid_y()  nounwind readnone alwaysinline
+define i32 @__task_count1()  nounwind readnone alwaysinline ;; returns the nctaid.y sreg unchanged
 {
- %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
- ret i32 %nb
+ %task_count1 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+ ret i32 %task_count1
 }
-define i32 @__nctaid_z()  nounwind readnone alwaysinline
+define i32 @__task_count2()  nounwind readnone alwaysinline ;; returns the nctaid.z sreg unchanged
 {
- %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
- ret i32 %nb
+ %task_count2 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+ ret i32 %task_count2
 }
+define i32 @__task_count()  nounwind readnone alwaysinline ;; returns tc0 * tc1 * tc2
+{
+  %tc0 = call i32 @__task_count0()
+  %tc1 = call i32 @__task_count1()
+  %tc2 = call i32 @__task_count2()
+  %mul1 = mul i32 %tc1, %tc2
+  %task_count = mul i32 %mul1, %tc0 ;; product of the three per-dimension counts
+  ret i32 %task_count
+}
+
 ;;;;;;;;
+
 declare i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p3i64(i64 addrspace(3)*)
 declare i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p4i64(i64 addrspace(4)*)
 define i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
@@ -228,10 +262,7 @@ declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
 define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline
 {
 entry:
-  %call = tail call i32 @__tid_x()
-  %call1 = tail call i32 @__warpsize()
-  %sub = add nsw i32 %call1, -1
-  %and = and i32 %sub, %call
+  %and = call i32 @__program_index()
   %cmp = icmp eq i32 %and, 0
   %align = zext i32 %align32 to i64
   br i1 %cmp, label %if.then, label %if.end
@@ -270,10 +301,7 @@ entry:
 ;;  %ntxm1d4 = sdiv i32 %ntxm1, 4
   %ntxm1d4 = ashr i32 %ntxm1, 2
   %nbx     = add nsw i32 %ntxm1d4, 1
-  %call = tail call i32 @__tid_x()
-  %call1 = tail call i32 @__warpsize()
-  %sub = add nsw i32 %call1, -1
-  %and = and i32 %sub, %call
+  %and = call i32 @__program_index()
 ;; if (laneIdx == 0)
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if.then, label %if.end
@@ -1191,8 +1219,7 @@ define(`shift',`
 define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline
 {
   %val  = extractelement <1 x $1> %0, i32 0
-  %tid  = tail call i32 @__tid_x()
-  %lane = and i32 %tid,  31
+  %lane = call i32 @__program_index()
   %src  = add i32 %lane, %1
   %ret  = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %src)
   %c1   = icmp sge i32 %src, 0
@@ -1214,7 +1241,7 @@ define(`rotate', `
 define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline 
 {
   %val  = extractelement <1 x $1> %0, i32 0
-  %tid  = tail call i32 @__tid_x()
+  %tid  = call i32 @__program_index()
   %src  = add i32 %tid, %1
   %lane = and i32 %src, 31
   %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane)
@@ -1569,7 +1596,7 @@ define $1 @__extract_$2(<1 x $1>, i32) nounwind readnone alwaysinline {
 define <1 x $1> @__insert_$2(<1 x $1>, i32, 
                                    $1) nounwind readnone alwaysinline {
   %orig = extractelement <1 x $1> %0, i32 0
-  %lane = call i32 @__laneidx() 
+  %lane = call i32 @__program_index() 
   %c    = icmp eq i32 %lane, %1
   %val  = select i1 %c, $1 %2, $1 %orig
   %insert = insertelement <1 x $1> %0, $1 %val, i32 0
@@ -1620,7 +1647,7 @@ define void @__do_assert_uniform(i8 *%str, i1 %test, <WIDTH x MASK> %mask) {
   br i1 %test, label %ok, label %fail
 
 fail:
-  %lane = call i32 @__laneidx()
+  %lane = call i32 @__program_index()
   %cmp  = icmp eq i32 %lane, 0
   br i1 %cmp, label %fail_print, label %fail_void;
   
@@ -1966,7 +1993,7 @@ define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinli
 entry:
   %addr   = ptrtoint $3 * %ptr to i64
   %active = call i32 @__get_first_active_lane();
-  %lane   = call i32 @__laneidx();
+  %lane   = call i32 @__program_index();
   %c      = icmp eq i32 %lane, %active
   br i1 %c, label %p1, label %p2
 
@@ -2144,7 +2171,7 @@ define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %cmp, $3 %val) nounwind a
 entry:
   %addr   = ptrtoint $3 * %ptr to i64
   %active = call i32 @__get_first_active_lane();
-  %lane   = call i32 @__laneidx();
+  %lane   = call i32 @__program_index();
   %c      = icmp eq i32 %lane, %active
   br i1 %c, label %p1, label %p2
 
diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4
index 891e2760..19fcf68c 100644
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
@@ -2032,10 +2032,7 @@ define noalias i8 * @__new_uniform_64rt(i64 %size)
 {
 entry:
-;;  compute laneIdx = __tid_x() & (__warpsize() - 1)
+;;  compute laneIdx = __program_index()
-  %call = tail call i32 @__tid_x()
-  %call1 = tail call i32 @__warpsize()
-  %sub = add nsw i32 %call1, -1
-  %and = and i32 %sub, %call
+  %and = call i32 @__program_index()
 ;; if (laneIdx == 0)
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if.then, label %if.end
@@ -2062,10 +2059,7 @@ if.end:                                           ; preds = %if.then, %entry
 define void @__delete_uniform_64rt(i8 * %ptr) 
 {
 entry:
-  %call = tail call i32 @__tid_x()
-  %call1 = tail call i32 @__warpsize()
-  %sub = add nsw i32 %call1, -1
-  %and = and i32 %sub, %call
+  %and = call i32 @__program_index()
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if.then, label %if.end
 
diff --git a/builtins/util.m4 b/builtins/util.m4
index f6bbe768..87bd2c2c 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -4534,14 +4534,17 @@ define_down_avgs()
 
 define(`declare_nvptx',
 `
-declare i32 @__tid_x()  nounwind readnone alwaysinline
-declare i32 @__warpsize()  nounwind readnone alwaysinline
-declare i32 @__ctaid_x()  nounwind readnone alwaysinline
-declare i32 @__ctaid_y()  nounwind readnone alwaysinline
-declare i32 @__ctaid_z()  nounwind readnone alwaysinline
-declare i32 @__nctaid_x()  nounwind readnone alwaysinline
-declare i32 @__nctaid_y()  nounwind readnone alwaysinline
-declare i32 @__nctaid_z()  nounwind readnone alwaysinline
+declare i32 @__program_index()  nounwind readnone alwaysinline
+declare i32 @__program_count()  nounwind readnone alwaysinline
+declare i32 @__warp_index()  nounwind readnone alwaysinline
+declare i32 @__task_index0()  nounwind readnone alwaysinline
+declare i32 @__task_index1()  nounwind readnone alwaysinline
+declare i32 @__task_index2()  nounwind readnone alwaysinline
+declare i32 @__task_index()  nounwind readnone alwaysinline
+declare i32 @__task_count0()  nounwind readnone alwaysinline
+declare i32 @__task_count1()  nounwind readnone alwaysinline
+declare i32 @__task_count2()  nounwind readnone alwaysinline
+declare i32 @__task_count()  nounwind readnone alwaysinline
 declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
 declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
 declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline
diff --git a/ctx.cpp b/ctx.cpp
index 39f56885..38bea7c4 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1373,21 +1373,26 @@ FunctionEmitContext::None(llvm::Value *mask) {
 
 
 llvm::Value *
-FunctionEmitContext::LaneMask(llvm::Value *v) {
-   const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
-    // Call the target-dependent movmsk function to turn the vector mask
-    // into an i64 value
-    std::vector<Symbol *> mm;
-    m->symbolTable->LookupFunction(__movmsk, &mm);
-    if (g->target->getMaskBitCount() == 1)
-        AssertPos(currentPos, mm.size() == 1);
-    else
-        // There should be one with signed int signature, one unsigned int.
-        AssertPos(currentPos, mm.size() == 2);
-    // We can actually call either one, since both are i32s as far as
-    // LLVM's type system is concerned...
-    llvm::Function *fmm = mm[0]->function;
-    return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
+FunctionEmitContext::LaneMask(llvm::Value *v) // Emits a call to the movmsk builtin on v and returns the call's result.
+{
+#if 1 /* 1: NVPTX looks up "__movmsk_ptx", other ISAs "__movmsk"; 0: always "__movmsk". Author's note: this makes mandelbrot example slower, why ?!? (NOTE(review): unclear which setting the note refers to) */
+  const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
+#else
+  const char *__movmsk = "__movmsk";
+#endif
+  // Call the target-dependent movmsk function to turn the vector mask
+  // into an i64 value (NOTE(review): the comment further down says i32s; confirm which is current)
+  std::vector<Symbol *> mm;
+  m->symbolTable->LookupFunction(__movmsk, &mm); // fills mm with every symbol found under that name
+  if (g->target->getMaskBitCount() == 1)
+    AssertPos(currentPos, mm.size() == 1); // 1-bit masks: exactly one match is expected
+  else
+    // There should be one with signed int signature, one unsigned int.
+    AssertPos(currentPos, mm.size() == 2);
+  // We can actually call either one, since both are i32s as far as
+  // LLVM's type system is concerned...
+  llvm::Function *fmm = mm[0]->function; // always take the first match
+  return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
 }
 
 bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
@@ -1476,13 +1481,9 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
 }
 llvm::Value *
 FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
-    llvm::Function *func_tid_x  = m->module->getFunction("__tid_x");
-    llvm::Function *func_warpsz = m->module->getFunction("__warpsize");
-    llvm::Value *__tid_x    = CallInst(func_tid_x,  NULL, std::vector<llvm::Value*>(), "laneIdxForEach");
-    llvm::Value *__warpsz   = CallInst(func_warpsz, NULL, std::vector<llvm::Value*>(), "warpSZForEach");
-    llvm::Value *__warpszm1 = BinaryOperator(llvm::Instruction::Add, __warpsz, LLVMInt32(-1), "__warpszm1");
-    llvm::Value *laneIdx = BinaryOperator(llvm::Instruction::And, __tid_x, __warpszm1, "__laneidx");
-    llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), laneIdx, 0, "__laneIdxV");
+    llvm::Function *func_program_index  = m->module->getFunction("__program_index");
+    llvm::Value *__program_index    = CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__program_indexS");
+    llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV");
 #if 0
     if (!is32bits)
       index = ZExtInst(index, LLVMTypes::Int64VectandType);
@@ -1887,146 +1888,6 @@ FunctionEmitContext::BitCastInst(llvm::Value *value, llvm::Type *type,
     return inst;
 }
 
-/* NVPTX: 
- * this is a helper function which adds a warp offset to a base pointer 
- * pointer must either be in local memory addrspace(3)
- * or the one just converted from addrspace(3) to addrspace(0) in lConvertToGenericPtr
- */
-static llvm::Value* lAddWarpOffset(FunctionEmitContext *ctx, llvm::Value *value)
-{
-  llvm::Function *func_tid_x  = m->module->getFunction("__tid_x");
-  llvm::Function *func_warpsz = m->module->getFunction("__warpsize");
-  llvm::Value *__tid_x  = ctx->CallInst(func_tid_x,  NULL, std::vector<llvm::Value*>(),  "tidCorrectLocalPtr");
-  llvm::Value *__warpsz = ctx->CallInst(func_warpsz, NULL, std::vector<llvm::Value*>(),  "warpSzCorrectLocaLPtr");
-  llvm::Value *_mwarpsz = ctx->BinaryOperator(llvm::Instruction::Sub, LLVMInt32(0), __warpsz, "mwarpSzCorrectLocalPtr");
-  llvm::Value *__offset = ctx->BinaryOperator(llvm::Instruction::And, __tid_x, _mwarpsz, "offsetCorrectLocalPtr");
-  return llvm::GetElementPtrInst::Create(value, __offset, "warpOffset_gep", ctx->GetCurrentBasicBlock());
-}
-
-static llvm::Value* lConvertGepToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos)
-{
-  if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) 
-    return value;
-  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
-  const int addressSpace = pt->getAddressSpace();
-  if (addressSpace != 3 && addressSpace != 4) 
-    return value;
-  assert(0);
-
-  llvm::Type *elTy = pt->getElementType();
-  assert(elTy->isArrayTy());
-  const int numElTot = elTy->getArrayNumElements();
-  const int numEl    = numElTot/4;
-#if 0
-  fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl);
-#endif
-  llvm::ArrayType *arrTy = llvm::dyn_cast<llvm::ArrayType>(pt->getArrayElementType());
-  assert(arrTy != NULL);
-  llvm::Type *arrElTy = arrTy->getElementType();
-#if 0
-  if (arrElTy->isArrayTy())
-      Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array");
-#endif
-
-  /* convert elTy addrspace(3)* to i64* addrspace(3)* */
-  llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace);
-  value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1");
-
-  /* convert i64* addrspace(3) to i64* */
-  llvm::Function *__cvt2gen = m->module->getFunction(
-      addressSpace == 3 ? "__cvt_loc2gen" : "__cvt_const2gen");
-  std::vector<llvm::Value *> __cvt2gen_args;
-  __cvt2gen_args.push_back(value);
-  value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, "gep2gen_cvt", ctx->GetCurrentBasicBlock());
-
-  /* convert i64* to errElTy* */
-  llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0);
-  value  = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2");
-
-  /* compute offset */
-  if (addressSpace == 3)
-  {
-    llvm::Function *funcTid    = m->module->getFunction("__tid_x");
-    llvm::Function *funcWarpSz = m->module->getFunction("__warpsize");
-    llvm::Value *tid    = ctx->CallInst(funcTid,    NULL, std::vector<llvm::Value*>(),  "gep2gen_tid");
-    llvm::Value *warpSz = ctx->CallInst(funcWarpSz, NULL, std::vector<llvm::Value*>(),  "gep2gen_warpSz");
-    llvm::Value *warpId = ctx->BinaryOperator(llvm::Instruction::SDiv, tid, warpSz, "gep2gen_warpId");
-    llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset");
-    value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock());
-  }
-
-  /* convert arrElTy* to elTy* */
-  llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0);
-  value  = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3");
-
-  return value;
-}
-
-/* NVPTX:
- * this function compute correct address in local memory for load/store operations
- */
-static llvm::Value* lCorrectLocalPtr(FunctionEmitContext *ctx, llvm::Value* value)
-{
- // return value;
-  assert(value->getType()->isPointerTy());
-  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
-  if (g->target->getISA() != Target::NVPTX || pt->getAddressSpace() != 3) return value;
-
-  assert(0);  /* we should never enter here */
-
-  return lAddWarpOffset(ctx, value);
-}
-
-/* NVPTX:
- * this function converts a pointer in addrspace(3 or 4) to addrspace(0) 
- */
-static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos)
-{
-//  return value;
-  if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) return value;
-  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
-
-  /* make sure addrspace corresponds to either local or constant memories */
-  const int addressSpace = pt->getAddressSpace();
-  if (addressSpace != 3 && addressSpace != 4) return value;
-
-  assert(0);  /* we should never enter here */
-
-  /* if array, extracts element type */
-  llvm::Type *type   = pt->getElementType();
-  llvm::Type *typeEl = type;
-  if (type->isArrayTy())
-  {
-    typeEl = type->getArrayElementType();
-    if (typeEl->isArrayTy())
-      Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array");
-  }
-
-  /* convert elTy addrspace(3)* to i64* addrspace(3)* */
-  llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace);
-  value = ctx->BitCastInst(value, Int64Ptr3, "cvt2gen_i64ptr");
-
-  /* convert i64* addrspace(3) to i64* */
-  llvm::Function *__cvt2gen = m->module->getFunction(
-      addressSpace == 3 ? "__cvt_loc2gen" : "__cvt_const2gen");
-  std::vector<llvm::Value *> __cvt2gen_args;
-  __cvt2gen_args.push_back(value);
-#if 0
-  value = ctx->CallInst(__cvt2gen, NULL, __cvt2gen_args, "cvt2gen_call");
-#else
-  value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, "cvt2gen_call", ctx->GetCurrentBasicBlock());
-#endif
-
-  /* convert i64* to elTy* */
-  llvm::PointerType *typeElPtr = llvm::PointerType::get(typeEl, 0);
-  value  = ctx->BitCastInst(value, typeElPtr, "cvtLoc2Gen_i642ptr");
-
-  /* add warp offset to the pointer for local memory */
-  if (addressSpace == 3)
-    value = lAddWarpOffset(ctx, value);
-
-  return value;
-}
 
 llvm::Value *
 FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
@@ -2042,7 +1903,6 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
     if (name == NULL)
         name = LLVMGetName(value, "_ptr2int");
 
-    value = lConvertToGenericPtr(this, value, currentPos); /* NVPTX : convert to addrspace(0) */
     llvm::Type *type = LLVMTypes::PointerIntType;
     llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock);
     AddDebugPos(inst);
@@ -2076,7 +1936,6 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, llvm::Type *toType,
         }
     }
 
-    value = lConvertToGenericPtr(this, value, currentPos); /* NVPTX : convert to addrspace(0) */
     llvm::Instruction *inst = new llvm::PtrToIntInst(value, toType, name, bblock);
     AddDebugPos(inst);
     return inst;
@@ -2383,7 +2242,6 @@ FunctionEmitContext::MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset) {
 llvm::Value *
 FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
                                        const Type *ptrRefType, const char *name) {
-    basePtr = lConvertGepToGenericPtr(this, basePtr, currentPos);
     if (basePtr == NULL || index == NULL) {
         AssertPos(currentPos, m->errorCount > 0);
         return NULL;
@@ -2454,7 +2312,6 @@ llvm::Value *
 FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
                                        llvm::Value *index1, const Type *ptrRefType,
                                        const char *name) {
-    basePtr = lConvertGepToGenericPtr(this, basePtr, currentPos);
     if (basePtr == NULL || index0 == NULL || index1 == NULL) {
         AssertPos(currentPos, m->errorCount > 0);
         return NULL;
@@ -2657,7 +2514,6 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
     if (name == NULL)
         name = LLVMGetName(ptr, "_load");
 
-    ptr = lCorrectLocalPtr(this, ptr); /* NVPTX: correct addrspace(3) pointer before load/store */
     llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
 
     if (g->opt.forceAlignedMemory &&
@@ -2790,7 +2646,6 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, llvm::Value *mask,
                 // it's totally unaligned.  (This shouldn't make any difference
                 // vs the proper alignment in practice.)
                 align = 1;
-            ptr = lCorrectLocalPtr(this, ptr); /* NVPTX: correct addrspace(3) pointer before load/store */
             llvm::Instruction *inst = new llvm::LoadInst(ptr, name,
                                                          false /* not volatile */,
                                                          align, bblock);
@@ -3218,7 +3073,6 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
         llvm::dyn_cast<llvm::PointerType>(ptr->getType());
     AssertPos(currentPos, pt != NULL);
 
-    ptr = lCorrectLocalPtr(this, ptr); /* NVPTX: correct addrspace(3) pointer before load/store */
     llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);
 
     if (g->opt.forceAlignedMemory &&
@@ -3531,14 +3385,6 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
         return NULL;
     }
 
-#if 0
-    std::vector<llvm::Value *> args = args_in;
-    /* NVPTX:
-     * Convert all pointers to addrspace(0) 
-     */
-    for (unsigned int i = 0; i < args.size(); i++)
-      args[i] = lConvertToGenericPtr(this, args[i], currentPos);
-#endif
     std::vector<llvm::Value *> argVals = args;
     // Most of the time, the mask is passed as the last argument.  this
     // isn't the case for things like intrinsics, builtins, and extern "C"
diff --git a/module.cpp b/module.cpp
index fde31456..a8f521d8 100644
--- a/module.cpp
+++ b/module.cpp
@@ -2167,20 +2167,20 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
     if (g->target->getISA() == Target::NVPTX)
     {
       opts.addMacroDef("__NVPTX__");
-      opts.addMacroDef("programIndex=laneIndex()");
+      opts.addMacroDef("programIndex=__programIndex()");
       opts.addMacroDef("cif=if");
       opts.addMacroDef("cfor=for");
       opts.addMacroDef("cwhile=while");
       opts.addMacroDef("ccontinue=continue");
       opts.addMacroDef("cdo=do");
-      opts.addMacroDef("taskIndex0=blockIndex0()");
-      opts.addMacroDef("taskCount0=blockCount0()");
-      opts.addMacroDef("taskIndex1=blockIndex1()");
-      opts.addMacroDef("taskCount1=blockCount1()");
-      opts.addMacroDef("taskIndex2=blockIndex2()");
-      opts.addMacroDef("taskCount2=blockCount2()");
-      opts.addMacroDef("taskIndex=(taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2))");
-      opts.addMacroDef("taskCount=(taskCount0*taskCount1*taskCount2)");
+      opts.addMacroDef("taskIndex0=__taskIndex0()");
+      opts.addMacroDef("taskIndex1=__taskIndex1()");
+      opts.addMacroDef("taskIndex2=__taskIndex2()");
+      opts.addMacroDef("taskIndex=__taskIndex()");
+      opts.addMacroDef("taskCount0=__taskCount0()");
+      opts.addMacroDef("taskCount1=__taskCount1()");
+      opts.addMacroDef("taskCount2=__taskCount2()");
+      opts.addMacroDef("taskCount=__taskCount()");
     }
 
 #if defined(LLVM_3_1)
diff --git a/stdlib.ispc b/stdlib.ispc
index 1fccfc03..37cb4141 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -62,57 +62,25 @@
 ///////////////////////////////////////////////////////////////////////////
 // CUDA Specific primitives
 //
-#define CUDABLOCKSIZE 128
-#define WARPSIZE2     5
-#define WARPSIZE      (1<<WARPSIZE2)
 /***************/
-__declspec(safe,cost0)
-  static inline uniform int warpSize()
-{
-  return WARPSIZE; //__warpsize();
-}
+
+__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); } // forwards to the __program_index builtin; module.cpp maps programIndex to this call on NVPTX
+__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); } // forwards to the __program_count builtin
+__declspec(safe,cost0) static inline uniform int __warpIndex()    { return __warp_index();    } // forwards to the __warp_index builtin
+
 /***************/
-__declspec(safe,cost0)
-  static inline varying int laneIndex()
-{
-  return __tid_x() & (WARPSIZE-1) ; //& (warpSize()-1);
-}
+
+__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); } // forwards to __task_index0; module.cpp maps taskIndex0 to this call on NVPTX
+__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); } // forwards to __task_index1; target of the taskIndex1 macro
+__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); } // forwards to __task_index2; target of the taskIndex2 macro
+__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); } // forwards to __task_index; target of the taskIndex macro
+
 /***************/
-__declspec(safe,cost0)
-  static inline uniform int blockIndex0()
-{
-  return (__ctaid_x() * (CUDABLOCKSIZE >> WARPSIZE2)) + (__tid_x() >> WARPSIZE2);
-}
-/***************/
-__declspec(safe,cost0)
-  static inline uniform int blockIndex1()
-{
-  return __ctaid_y();
-}
-/***************/
-__declspec(safe,cost0)
-  static inline uniform int blockIndex2()
-{
-  return __ctaid_z();
-}
-/***************/
-__declspec(safe,cost0)
-  static inline uniform int blockCount0()
-{
-  return __nctaid_x() * (CUDABLOCKSIZE >> WARPSIZE2);
-}
-/***************/
-__declspec(safe,cost0)
-  static inline uniform int blockCount1()
-{
-  return __nctaid_y();
-}
-/***************/
-__declspec(safe,cost0)
-  static inline uniform int blockCount2()
-{
-  return __nctaid_z();
-}
+
+__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); } // forwards to __task_count0; module.cpp maps taskCount0 to this call on NVPTX
+__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); } // forwards to __task_count1; target of the taskCount1 macro
+__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); } // forwards to __task_count2; target of the taskCount2 macro
+__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); } // forwards to __task_count; target of the taskCount macro
 
 ///////////////////////////////////////////////////////////////////////////
 // Low level primitives
diff --git a/stmt.cpp b/stmt.cpp
index 0f84215f..016bb0f4 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -186,11 +186,8 @@ static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *
     llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0);
     value  = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2");
 
-    llvm::Function *funcTid    = m->module->getFunction("__tid_x");
-    llvm::Function *funcWarpSz = m->module->getFunction("__warpsize");
-    llvm::Value *tid    = ctx->CallInst(funcTid,    NULL, std::vector<llvm::Value*>(),  "gep2gen_tid");
-    llvm::Value *warpSz = ctx->CallInst(funcWarpSz, NULL, std::vector<llvm::Value*>(),  "gep2gen_warpSz");
-    llvm::Value *warpId = ctx->BinaryOperator(llvm::Instruction::SDiv, tid, warpSz, "gep2gen_warpId");
+    llvm::Function *func_warp_index    = m->module->getFunction("__warp_index");
+    llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector<llvm::Value*>(),  "gep2gen_warp_index");
     llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset");
     value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock());
   }
@@ -1517,10 +1514,8 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
     llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList);
 
     globalDelta->setInitializer(constDelta);
-    llvm::Function *func_tid_x = m->module->getFunction("__tid_x");
-    std::vector<llvm::Value *> allocArgs;
-    llvm::Value *__tid_x = ctx->CallInst(func_tid_x, NULL, allocArgs, "laneIdxForEach");
-    llvm::Value *laneIdx = ctx->BinaryOperator(llvm::Instruction::And, __tid_x, LLVMInt32(31), "__laneidx");
+    llvm::Function *func_program_index = m->module->getFunction("__program_index");
+    llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__programIndex");
 
     std::vector<llvm::Value*> ptr_arrayidx_indices;
     ptr_arrayidx_indices.push_back(LLVMInt32(0));