diff --git a/builtins.cpp b/builtins.cpp
index bec7a3e5..80740146 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -481,6 +481,7 @@ lSetInternalFunctions(llvm::Module *module) {
         "__min_varying_uint32",
         "__min_varying_uint64",
         "__movmsk",
+        "__movmsk_ptx",
         "__new_uniform_32rt",
         "__new_uniform_64rt",
         "__new_varying32_32rt",
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index 1e8d0ae5..fd314d3b 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -722,16 +722,14 @@ svml_stubs(double,d,WIDTH)
 
 define  i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline {
   %v = extractelement <1 x i1> %0, i32 0
-;; if 0
-  ;; this one fails with ./tests/popcnt-4.ispc and others ...
-;; %v0  = call i32 @__ballot_nvptx(i1 %v)
-;; %v64 = zext i32 %v0 to i64
-
-;; else 
-   ;; this one just copies mask  
-    %v64 = zext i1 %v to i64
-;; endif 
-    ret i64 %v64
+  %v64 = zext i1 %v to i64
+  ret i64 %v64
+}
+define  i64 @__movmsk_ptx(<1 x i1>) nounwind readnone alwaysinline {
+  %v = extractelement <1 x i1> %0, i32 0  ;; element 0 of the 1-wide i1 vector argument
+   %v0  = call i32 @__ballot_nvptx(i1 %v)  ;; i32 result of __ballot_nvptx on that bit
+   %v64 = zext i32 %v0 to i64  ;; zero-extend to the i64 return type
+   ret i64 %v64
 }
 
 define  i1 @__any(<1 x i1>) nounwind readnone alwaysinline {
diff --git a/ctx.cpp b/ctx.cpp
index 6fb7561d..1f6e5e53 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1374,10 +1374,11 @@ FunctionEmitContext::None(llvm::Value *mask) {
 
 llvm::Value *
 FunctionEmitContext::LaneMask(llvm::Value *v) {
+    const char *movmskName = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk"; // NVPTX looks up its own builtin
     // Call the target-dependent movmsk function to turn the vector mask
     // into an i64 value
     std::vector<Symbol *> mm;
-    m->symbolTable->LookupFunction("__movmsk", &mm);
+    m->symbolTable->LookupFunction(movmskName, &mm);
     if (g->target->getMaskBitCount() == 1)
         AssertPos(currentPos, mm.size() == 1);
     else
@@ -1389,6 +1390,18 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
     return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
 }
 
+llvm::Value*
+FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
+{ // Stub: emits no IR; vector, lane and scalar are all ignored.
+  return NULL; // NOTE(review): ctx.h documents a call to __insert_*; callers get NULL until that exists.
+}
+
+llvm::Value*
+FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
+{ // Stub: emits no IR; vector and lane are both ignored.
+  return NULL; // NOTE(review): ctx.h documents a call to __extract_*; callers get NULL until that exists.
+}
+
 
 llvm::Value *
 FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
@@ -1410,8 +1423,6 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
 
 llvm::Value *
 FunctionEmitContext::ProgramIndexVector(bool is32bits) {
-  if (1 || g->target->getISA() != Target::NVPTX)
-  {
     llvm::SmallVector<llvm::Constant*, 16> array;
     for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
       llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
@@ -1421,9 +1432,9 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
     llvm::Constant* index = llvm::ConstantVector::get(array);
 
     return index;
-  }
-  else
-  { /* this calls __tid_x() & __warpsize */
+}
+llvm::Value *
+FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
     llvm::Function *func_tid_x  = m->module->getFunction("__tid_x");
     llvm::Function *func_warpsz = m->module->getFunction("__warpsize");
     llvm::Value *__tid_x    = CallInst(func_tid_x,  NULL, std::vector<llvm::Value*>(), "laneIdxForEach");
@@ -1432,7 +1443,6 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
     llvm::Value *laneIdx = BinaryOperator(llvm::Instruction::And, __tid_x, __warpszm1, "__laneidx");
     llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), laneIdx, 0, "__laneIdxV");
     return index;
-  }
 }
 
 
diff --git a/ctx.h b/ctx.h
index 4dd30053..57160c17 100644
--- a/ctx.h
+++ b/ctx.h
@@ -291,6 +291,13 @@ public:
         of the mask is on. */
     llvm::Value *LaneMask(llvm::Value *mask);
 
+
+    /** Intended to issue a call to __insert_int8/int16/int32/int64/float/double; currently a stub that returns NULL. */
+    llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
+    /** Intended to issue a call to __extract_int8/int16/int32/int64/float/double; currently a stub that returns NULL. */
+    llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
+
+
     /** Given two masks of type LLVMTypes::MaskType, return an i1 value
         that indicates whether the two masks are equal. */
     llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
@@ -298,6 +305,7 @@ public:
     /** Generate ConstantVector, which contains ProgramIndex, i.e.
         < i32 0, i32 1, i32 2, i32 3> */
     llvm::Value *ProgramIndexVector(bool is32bits = true);
+    llvm::Value *ProgramIndexVectorPTX(bool is32bits = true); // NVPTX variant: lane index built from __tid_x and __warpsize, inserted at element 0 of an Int32 vector
 
     /** Given a string, create an anonymous global variable to hold its
         value and return the pointer to the string. */
diff --git a/stdlib.ispc b/stdlib.ispc
index 871129e3..3e37ac5b 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -2057,11 +2057,10 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value)
 }                                                                      \
 static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) {    \
     TYPE ret;                                                          \
-    uniform TYPE * uniform ptrs[programCount];                         \
-    ptrs[programIndex] = p;                                            \
     foreach_active (i) {                                             \
-        ret = insert(ret, i, *ptrs[i]);                                \
-        *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i));                \
+        uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \
+        ret  = insert(ret, i, *ptr);                                \
+        *ptr = OPFUNC(*ptr, extract(value, i));                \
     }                                                                  \
     return ret;                                                        \
 }
diff --git a/stmt.cpp b/stmt.cpp
index 2160cbaf..b30a0000 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -2243,7 +2243,8 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
         // math...)
 
         // Get the "program index" vector value
-        llvm::Value *programIndex = ctx->ProgramIndexVector();
+        llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ?
+          ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector();
 
         // And smear the current lane out to a vector
         llvm::Value *firstSet32 =
@@ -2354,6 +2355,8 @@ ForeachUniqueStmt::ForeachUniqueStmt(const char *iterName, Expr *e,
     sym = m->symbolTable->LookupVariable(iterName);
     expr = e;
     stmts = s;
+    if (g->target->getISA() == Target::NVPTX) // foreach_unique is rejected up front on the NVPTX target
+      Error(pos, "\"foreach_unique\" is not yet supported with \"nvptx\" target.");
 }