merged with nvptx

2014-02-20 11:01:58 +01:00
parent c325c0e085 11612a24ee
commit 4196c723eb
68 changed files with 8181 additions and 470 deletions
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -338,11 +338,13 @@ lSetInternalFunctions(llvm::Module *module) {
        "__all",
        "__any",
        "__aos_to_soa3_float",
+        "__aos_to_soa3_float1",
        "__aos_to_soa3_float16",
        "__aos_to_soa3_float4",
        "__aos_to_soa3_float8",
        "__aos_to_soa3_int32",
        "__aos_to_soa4_float",
+        "__aos_to_soa4_float1",
        "__aos_to_soa4_float16",
        "__aos_to_soa4_float4",
        "__aos_to_soa4_float8",
@@ -351,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
        "__atomic_add_int64_global",
        "__atomic_add_uniform_int32_global",
        "__atomic_add_uniform_int64_global",
+        "__atomic_add_varying_int32_global",
+        "__atomic_add_varying_int64_global",
        "__atomic_and_int32_global",
        "__atomic_and_int64_global",
        "__atomic_and_uniform_int32_global",
        "__atomic_and_uniform_int64_global",
+        "__atomic_and_varying_int32_global",
+        "__atomic_and_varying_int64_global",
        "__atomic_compare_exchange_double_global",
        "__atomic_compare_exchange_float_global",
        "__atomic_compare_exchange_int32_global",
@@ -363,18 +369,30 @@ lSetInternalFunctions(llvm::Module *module) {
        "__atomic_compare_exchange_uniform_float_global",
        "__atomic_compare_exchange_uniform_int32_global",
        "__atomic_compare_exchange_uniform_int64_global",
+        "__atomic_compare_exchange_varying_double_global",
+        "__atomic_compare_exchange_varying_float_global",
+        "__atomic_compare_exchange_varying_int32_global",
+        "__atomic_compare_exchange_varying_int64_global",
        "__atomic_max_uniform_int32_global",
        "__atomic_max_uniform_int64_global",
        "__atomic_min_uniform_int32_global",
        "__atomic_min_uniform_int64_global",
+        "__atomic_max_varying_int32_global",
+        "__atomic_max_varying_int64_global",
+        "__atomic_min_varying_int32_global",
+        "__atomic_min_varying_int64_global",
        "__atomic_or_int32_global",
        "__atomic_or_int64_global",
        "__atomic_or_uniform_int32_global",
        "__atomic_or_uniform_int64_global",
+        "__atomic_or_varying_int32_global",
+        "__atomic_or_varying_int64_global",
        "__atomic_sub_int32_global",
        "__atomic_sub_int64_global",
        "__atomic_sub_uniform_int32_global",
        "__atomic_sub_uniform_int64_global",
+        "__atomic_sub_varying_int32_global",
+        "__atomic_sub_varying_int64_global",
        "__atomic_swap_double_global",
        "__atomic_swap_float_global",
        "__atomic_swap_int32_global",
@@ -383,14 +401,28 @@ lSetInternalFunctions(llvm::Module *module) {
        "__atomic_swap_uniform_float_global",
        "__atomic_swap_uniform_int32_global",
        "__atomic_swap_uniform_int64_global",
+        "__atomic_swap_varying_double_global",
+        "__atomic_swap_varying_float_global",
+        "__atomic_swap_varying_int32_global",
+        "__atomic_swap_varying_int64_global",
        "__atomic_umax_uniform_uint32_global",
        "__atomic_umax_uniform_uint64_global",
        "__atomic_umin_uniform_uint32_global",
        "__atomic_umin_uniform_uint64_global",
+        "__atomic_umax_varying_uint32_global",
+        "__atomic_umax_varying_uint64_global",
+        "__atomic_umin_varying_uint32_global",
+        "__atomic_umin_varying_uint64_global",
        "__atomic_xor_int32_global",
        "__atomic_xor_int64_global",
        "__atomic_xor_uniform_int32_global",
        "__atomic_xor_uniform_int64_global",
+        "__atomic_xor_uniform_int32_global",
+        "__atomic_xor_uniform_int64_global",
+        "__atomic_xor_varying_int32_global",
+        "__atomic_xor_varying_int64_global",
+        "__atomic_xor_varying_int32_global",
+        "__atomic_xor_varying_int64_global",
        "__broadcast_double",
        "__broadcast_float",
        "__broadcast_i16",
@@ -413,6 +445,7 @@ lSetInternalFunctions(llvm::Module *module) {
        "__do_assert_uniform",
        "__do_assert_varying",
        "__do_print",
+        "__do_print_nvptx",
        "__doublebits_uniform_int64",
        "__doublebits_varying_int64",
        "__exclusive_scan_add_double",
@@ -427,6 +460,8 @@ lSetInternalFunctions(llvm::Module *module) {
        "__extract_int32",
        "__extract_int64",
        "__extract_int8",
+        "__extract_float",
+        "__extract_double",
        "__fastmath",
        "__float_to_half_uniform",
        "__float_to_half_varying",
@@ -443,6 +478,8 @@ lSetInternalFunctions(llvm::Module *module) {
        "__insert_int32",
        "__insert_int64",
        "__insert_int8",
+        "__insert_float",
+        "__insert_double",
        "__intbits_uniform_double",
        "__intbits_uniform_float",
        "__intbits_varying_double",
@@ -479,6 +516,7 @@ lSetInternalFunctions(llvm::Module *module) {
        "__min_varying_uint32",
        "__min_varying_uint64",
        "__movmsk",
+        "__movmsk_ptx",
        "__new_uniform_32rt",
        "__new_uniform_64rt",
        "__new_varying32_32rt",
@@ -560,11 +598,13 @@ lSetInternalFunctions(llvm::Module *module) {
        "__shuffle_i64",
        "__shuffle_i8",
        "__soa_to_aos3_float",
+        "__soa_to_aos3_float1",
        "__soa_to_aos3_float16",
        "__soa_to_aos3_float4",
        "__soa_to_aos3_float8",
        "__soa_to_aos3_int32",
        "__soa_to_aos4_float",
+        "__soa_to_aos4_float1",
        "__soa_to_aos4_float16",
        "__soa_to_aos4_float4",
        "__soa_to_aos4_float8",
@@ -622,6 +662,24 @@ lSetInternalFunctions(llvm::Module *module) {
        "__vec4_add_int32",
        "__vselect_float",
        "__vselect_i32",
+        "__program_index",
+        "__program_count",
+        "__warp_index",
+        "__task_index0",
+        "__task_index1",
+        "__task_index2",
+        "__task_index",
+        "__task_count0",
+        "__task_count1",
+        "__task_count2",
+        "__task_count",
+        "__cvt_loc2gen",
+        "__cvt_loc2gen_var",
+        "__cvt_const2gen",
+        "__puts_nvptx",
+        "ISPCAlloc",
+        "ISPCLaunch",
+        "ISPCSync",
    };

    int count = sizeof(names) / sizeof(names[0]);
@@ -694,6 +752,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
            g->target->getISA() != Target::NEON16 &&
            g->target->getISA() != Target::NEON8)
 #endif // !__arm__
+        if (g->target->getISA() != Target::NVPTX)
        {
            Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
                   mTriple.getArch() == bcTriple.getArch());
@@ -855,7 +914,17 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    // Next, add the target's custom implementations of the various needed
    // builtin functions (e.g. __masked_store_32(), etc).
    switch (g->target->getISA()) {
-
+    case Target::NVPTX: 
+      {
+        if (runtime32) {
+            fprintf(stderr, "Unforetunatly 32bit targets are supported at the moment .. \n");
+            assert(0);
+        }
+        else {
+            EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
+        }
+        break;
+      };
 #ifdef ISPC_ARM_ENABLED
    case Target::NEON8: {
        if (runtime32) {
@@ -1125,7 +1194,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    }

    // define the 'programCount' builtin variable
-    lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
+    if (g->target->getISA() != Target::NVPTX)
+    {
+      lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
+    }
+    else
+    {
+      lDefineConstantInt("programCount", 32, module, symbolTable);
+    }

    // define the 'programIndex' builtin
    lDefineProgramIndex(module, symbolTable);
@@ -1155,6 +1231,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
                       module, symbolTable);

+    lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX),
+                       module, symbolTable);
+          
    if (g->forceAlignment != -1) {
        llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
        alignment->setInitializer(LLVMInt32(g->forceAlignment));
--- a/builtins/__do_print_nvptx.cu
+++ b/builtins/__do_print_nvptx.cu
@@ -0,0 +1,130 @@
+#include <cstdio>
+
+#define PRINT_BUF_SIZE 4096
+#define uint64_t unsigned long long
+
+static __device__ size_t d_strlen(const char *str)
+{
+  const char *s;
+
+  for (s = str; *s; ++s)
+    ;
+  return (s - str);
+}
+
+static __device__  char* d_strncat(char *dest, const char *src, size_t n)
+{
+  size_t dest_len = d_strlen(dest);
+  size_t i;
+
+  for (i = 0 ; i < n && src[i] != '\0' ; i++)
+    dest[dest_len + i] = src[i];
+  dest[dest_len + i] = '\0';
+
+  return dest;
+}
+ 
+#define APPEND(str)                                        \
+    do {                                                   \
+        int offset = bufp - &printString[0];               \
+        *bufp = '\0';                                      \
+        d_strncat(bufp, str, PRINT_BUF_SIZE-offset);         \
+        bufp += d_strlen(str);                               \
+        if (bufp >= &printString[PRINT_BUF_SIZE])          \
+            goto done;                                     \
+    } while (0) /* eat semicolon */
+
+
+#define PRINT_SCALAR(fmt, type)                  \
+    sprintf(tmpBuf, fmt, *((type *)ptr));        \
+    APPEND(tmpBuf);                              \
+    break
+
+#define PRINT_VECTOR(fmt, type)                                         \
+    *bufp++ = '[';                                                      \
+    if (bufp == &printString[PRINT_BUF_SIZE]) break;                    \
+    for (int i = 0; i < width; ++i) {                                   \
+        /* only print the value if the current lane is executing */     \
+        type val0 = *((type*)ptr);                                      \
+        type val = val0;                                                \
+        if (mask & (1ull<<i))                                           \
+            sprintf(tmpBuf, fmt, val);                                  \
+        else                                                            \
+            sprintf(tmpBuf, "(( * )) ");                                \
+        APPEND(tmpBuf);                                                 \
+        *bufp++ = (i != width-1 ? ',' : ']');                           \
+    }                                                                   \
+    break
+
+extern "C"
+__device__ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
+                void **args) {
+    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
+    char *bufp = &printString[0];
+    char tmpBuf[256];
+    const char  trueBuf[] = "true";
+    const char falseBuf[] = "false";
+
+    int argCount = 0;
+    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
+        // Format strings are just single percent signs.
+        if (*format != '%') {
+            *bufp++ = *format;
+        }
+        else {
+            if (*types) {
+                void *ptr = args[argCount++];
+                // Based on the encoding in the types string, cast the
+                // value appropriately and print it with a reasonable
+                // printf() formatting string.
+                switch (*types) {
+                case 'b': {
+                    const char *tmpBuf1 =  *((bool *)ptr) ? trueBuf : falseBuf;
+                    APPEND(tmpBuf1);
+                    break;
+                }
+                case 'B': {
+                    *bufp++ = '[';
+                    if (bufp == &printString[PRINT_BUF_SIZE])
+                        break;
+                    for (int i = 0; i < width; ++i) {
+                        bool val0 = *((bool*)ptr);                                     
+                        bool val = val0;                                                \
+                        if (mask & (1ull << i)) {
+                            const char *tmpBuf1 =  val ? trueBuf : falseBuf;
+                            APPEND(tmpBuf1);
+                        }
+                        else
+                            APPEND("_________");
+                        *bufp++ = (i != width-1) ? ',' : ']';
+                    }
+                    break;
+                }
+                case 'i': PRINT_SCALAR("%d", int);
+                case 'I': PRINT_VECTOR("%d", int);
+                case 'u': PRINT_SCALAR("%u", unsigned int);
+                case 'U': PRINT_VECTOR("%u", unsigned int);
+                case 'f': PRINT_SCALAR("%f", float);
+                case 'F': PRINT_VECTOR("%f", float);
+                case 'l': PRINT_SCALAR("%lld", long long);
+                case 'L': PRINT_VECTOR("%lld", long long);
+                case 'v': PRINT_SCALAR("%llu", unsigned long long);
+                case 'V': PRINT_VECTOR("%llu", unsigned long long);
+                case 'd': PRINT_SCALAR("%f", double);
+                case 'D': PRINT_VECTOR("%f", double);
+                case 'p': PRINT_SCALAR("%p", void *);
+                case 'P': PRINT_VECTOR("%p", void *);
+                default:
+                    APPEND("UNKNOWN TYPE ");
+                    *bufp++ = *types;
+                }
+                ++types;
+            }
+        }
+        ++format;
+    }
+
+ done:
+    *bufp = '\n'; bufp++;
+    *bufp = '\0';
+}
--- a/builtins/builtins.c
+++ b/builtins/builtins.c
@@ -185,6 +185,81 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
    fflush(stdout);
 }

+/* this is print for PTX target only */
+int __puts_nvptx(const char *);
+void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
+                void **args) {
+#if 0
+    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
+    char *bufp = &printString[0];
+    char tmpBuf[256];
+
+    int argCount = 0;
+    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
+        // Format strings are just single percent signs.
+        if (*format != '%') {
+            *bufp++ = *format;
+        }
+        else {
+            if (*types) {
+                void *ptr = args[argCount++];
+                // Based on the encoding in the types string, cast the
+                // value appropriately and print it with a reasonable
+                // printf() formatting string.
+                switch (*types) {
+                case 'b': {
+                    sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
+                    APPEND(tmpBuf);
+                    break;
+                }
+                case 'B': {
+                    *bufp++ = '[';
+                    if (bufp == &printString[PRINT_BUF_SIZE])
+                        break;
+                    for (int i = 0; i < width; ++i) {
+                        if (mask & (1ull << i)) {
+                            sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
+                            APPEND(tmpBuf);
+                        }
+                        else
+                            APPEND("_________");
+                        *bufp++ = (i != width-1) ? ',' : ']';
+                    }
+                    break;
+                }
+                case 'i': PRINT_SCALAR("%d", int);
+                case 'I': PRINT_VECTOR("%d", int);
+                case 'u': PRINT_SCALAR("%u", unsigned int);
+                case 'U': PRINT_VECTOR("%u", unsigned int);
+                case 'f': PRINT_SCALAR("%f", float);
+                case 'F': PRINT_VECTOR("%f", float);
+                case 'l': PRINT_SCALAR("%lld", long long);
+                case 'L': PRINT_VECTOR("%lld", long long);
+                case 'v': PRINT_SCALAR("%llu", unsigned long long);
+                case 'V': PRINT_VECTOR("%llu", unsigned long long);
+                case 'd': PRINT_SCALAR("%f", double);
+                case 'D': PRINT_VECTOR("%f", double);
+                case 'p': PRINT_SCALAR("%p", void *);
+                case 'P': PRINT_VECTOR("%p", void *);
+                default:
+                    APPEND("UNKNOWN TYPE ");
+                    *bufp++ = *types;
+                }
+                ++types;
+            }
+        }
+        ++format;
+    }
+
+ done:
+    *bufp = '\n'; bufp++;
+    *bufp = '\0';
+    __puts_nvptx(printString);
+#else
+    __puts_nvptx("---nvptx printing is not support---\n");
+#endif
+}
+

 int __num_cores() {
 #if defined(_MSC_VER) || defined(__MINGW32__)
--- a/builtins/target-avx-common.ll
+++ b/builtins/target-avx-common.ll
@@ -288,4 +288,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
 ;; int8/int16 builtins

 define_avgs()
+declare_nvptx()

--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -10,6 +10,7 @@ packed_load_and_store()
 scans()
 int64minmax()
 aossoa()
+declare_nvptx()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -392,4 +392,4 @@ declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
 ;; int8/int16 builtins

 define_avgs()
-
+declare_nvptx()
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -344,3 +344,4 @@ packed_load_and_store(4)
 ;; prefetch

 define_prefetches()
+declare_nvptx()
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
--- a/builtins/target-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {

 define_avgs()

+declare_nvptx()
--- a/builtins/target-sse4-common.ll
+++ b/builtins/target-sse4-common.ll
@@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %call = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %call
 }
+
+declare_nvptx()
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -4541,3 +4541,60 @@ define(`rcpd_decl', `
 declare  double @__rcp_uniform_double(double)
 declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
 ')
+
+define(`declare_nvptx',
+`
+declare i32 @__program_index()  nounwind readnone alwaysinline
+declare i32 @__program_count()  nounwind readnone alwaysinline
+declare i32 @__warp_index()  nounwind readnone alwaysinline
+declare i32 @__task_index0()  nounwind readnone alwaysinline
+declare i32 @__task_index1()  nounwind readnone alwaysinline
+declare i32 @__task_index2()  nounwind readnone alwaysinline
+declare i32 @__task_index()  nounwind readnone alwaysinline
+declare i32 @__task_count0()  nounwind readnone alwaysinline
+declare i32 @__task_count1()  nounwind readnone alwaysinline
+declare i32 @__task_count2()  nounwind readnone alwaysinline
+declare i32 @__task_count()  nounwind readnone alwaysinline
+declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
+declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
+declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline
+declare i64 @__movmsk_ptx(<WIDTH x i1>) nounwind readnone alwaysinline;
+')
+
+define(`global_atomic_varying',`
+declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
+')
+
+define(`global_atomic_cas_varying',`
+declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
+')
+
+global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32)
+global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64)
+global_atomic_cas_varying(WIDTH, compare_exchange, float, float)
+global_atomic_cas_varying(WIDTH, compare_exchange, double, double)
+
+global_atomic_varying(WIDTH, swap, i32, int32)
+global_atomic_varying(WIDTH, swap, i64, int64)
+global_atomic_varying(WIDTH, swap, float, float)
+global_atomic_varying(WIDTH, swap, double, double)
+
+global_atomic_varying(WIDTH, add, i32, int32)
+global_atomic_varying(WIDTH, sub, i32, int32)
+global_atomic_varying(WIDTH, and, i32, int32)
+global_atomic_varying(WIDTH, or, i32, int32)
+global_atomic_varying(WIDTH, xor, i32, int32)
+global_atomic_varying(WIDTH, min, i32, int32)
+global_atomic_varying(WIDTH, max, i32, int32)
+global_atomic_varying(WIDTH, umin, i32, uint32)
+global_atomic_varying(WIDTH, umax, i32, uint32)
+
+global_atomic_varying(WIDTH, add, i64, int64)
+global_atomic_varying(WIDTH, sub, i64, int64)
+global_atomic_varying(WIDTH, and, i64, int64)
+global_atomic_varying(WIDTH, or, i64, int64)
+global_atomic_varying(WIDTH, xor, i64, int64)
+global_atomic_varying(WIDTH, min, i64, int64)
+global_atomic_varying(WIDTH, max, i64, int64)
+global_atomic_varying(WIDTH, umin, i64, uint64)
+global_atomic_varying(WIDTH, umax, i64, uint64)
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -57,6 +57,8 @@
  #include <llvm/IR/Instructions.h>
  #include <llvm/IR/DerivedTypes.h>
 #endif
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Support/FormattedStream.h>

 /** This is a small utility structure that records information related to one
    level of nested control flow.  It's mostly used in correctly restoring
@@ -1371,29 +1373,97 @@ FunctionEmitContext::None(llvm::Value *mask) {


 llvm::Value *
-FunctionEmitContext::LaneMask(llvm::Value *v) {
-    // Call the target-dependent movmsk function to turn the vector mask
-    // into an i64 value
-    std::vector<Symbol *> mm;
-    m->symbolTable->LookupFunction("__movmsk", &mm);
-    if (g->target->getMaskBitCount() == 1)
-        AssertPos(currentPos, mm.size() == 1);
-    else
-        // There should be one with signed int signature, one unsigned int.
-        AssertPos(currentPos, mm.size() == 2);
-    // We can actually call either one, since both are i32s as far as
-    // LLVM's type system is concerned...
-    llvm::Function *fmm = mm[0]->function;
-    return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
+FunctionEmitContext::LaneMask(llvm::Value *v) 
+{
+#if 1 /* this makes mandelbrot example slower, why ?!? */
+  const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
+#else
+  const char *__movmsk = "__movmsk";
+#endif
+  // Call the target-dependent movmsk function to turn the vector mask
+  // into an i64 value
+  std::vector<Symbol *> mm;
+  m->symbolTable->LookupFunction(__movmsk, &mm);
+  if (g->target->getMaskBitCount() == 1)
+    AssertPos(currentPos, mm.size() == 1);
+  else
+    // There should be one with signed int signature, one unsigned int.
+    AssertPos(currentPos, mm.size() == 2);
+  // We can actually call either one, since both are i32s as far as
+  // LLVM's type system is concerned...
+  llvm::Function *fmm = mm[0]->function;
+  return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
+}
+
+bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
+{
+  llvm::Type *type = vector->getType();
+  if (type == LLVMTypes::Int8VectorType)
+    funcName += "_int8";
+  else if (type == LLVMTypes::Int16VectorType)
+    funcName += "_int16";
+  else if (type == LLVMTypes::Int32VectorType)
+    funcName += "_int32";
+  else if (type == LLVMTypes::Int64VectorType)
+    funcName += "_int64";
+  else if (type == LLVMTypes::FloatVectorType)
+    funcName += "_float";
+  else if (type == LLVMTypes::DoubleVectorType)
+    funcName += "_double";
+  else
+    return false;
+  return true;
+}
+
+llvm::Value*
+FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
+{
+  std::string funcName = "__insert";
+  assert(lAppendInsertExtractName(vector, funcName));
+  assert(lane->getType() == LLVMTypes::Int32Type);
+  
+  llvm::Function *func = m->module->getFunction(funcName.c_str());
+  assert(func != NULL);
+  std::vector<llvm::Value *> args;
+  args.push_back(vector);
+  args.push_back(lane);
+  args.push_back(scalar);
+  llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
+  return ret;
+}
+
+llvm::Value*
+FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
+{
+  std::string funcName = "__extract";
+  assert(lAppendInsertExtractName(vector, funcName));
+  assert(lane->getType() == LLVMTypes::Int32Type);
+  
+  llvm::Function *func = m->module->getFunction(funcName.c_str());
+  assert(func != NULL);
+  std::vector<llvm::Value *> args;
+  args.push_back(vector);
+  args.push_back(lane);
+  llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
+  return ret;
 }


 llvm::Value *
 FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
+  if (g->target->getISA() == Target::NVPTX)
+  {
+    // Compare the two masks to get a vector of i1s
+    llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
+        v1, v2, "v1==v2");
+    return ExtractInst(cmp, 0);  /* this works without calling All(..) in PTX. Why ?!? */
+  }
+  else
+  {
 #if 0
    // Compare the two masks to get a vector of i1s
    llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
-                               v1, v2, "v1==v2");
+        v1, v2, "v1==v2");
    // Turn that into a bool vector type (often i32s)
    cmp = I1VecToBoolVec(cmp);
    // And see if it's all on
@@ -1402,22 +1472,34 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
    llvm::Value *mm1 = LaneMask(v1);
    llvm::Value *mm2 = LaneMask(v2);
    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
-                   LLVMGetName("equal", v1, v2));
+        LLVMGetName("equal", v1, v2));
 #endif
+  }
 }

 llvm::Value *
 FunctionEmitContext::ProgramIndexVector(bool is32bits) {
    llvm::SmallVector<llvm::Constant*, 16> array;
    for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
-        llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
-        array.push_back(C);
+      llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
+      array.push_back(C);
    }

    llvm::Constant* index = llvm::ConstantVector::get(array);

    return index;
 }
+llvm::Value *
+FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
+    llvm::Function *func_program_index  = m->module->getFunction("__program_index");
+    llvm::Value *__program_index    = CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__program_indexS");
+    llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV");
+#if 0
+    if (!is32bits)
+      index = ZExtInst(index, LLVMTypes::Int64VectandType);
+#endif
+    return index;
+}


 llvm::Value *
@@ -1830,6 +1912,7 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {

    if (name == NULL)
        name = LLVMGetName(value, "_ptr2int");
+
    llvm::Type *type = LLVMTypes::PointerIntType;
    llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock);
    AddDebugPos(inst);
@@ -3523,98 +3606,199 @@ llvm::Value *
 FunctionEmitContext::LaunchInst(llvm::Value *callee,
                                std::vector<llvm::Value *> &argVals,
                                llvm::Value *launchCount[3]){
-    if (callee == NULL) {
+
+    if (g->target->getISA() != Target::NVPTX)
+    {
+      if (callee == NULL) {
        AssertPos(currentPos, m->errorCount > 0);
        return NULL;
-    }
+      }

-    launchedTasks = true;
+      launchedTasks = true;

-    AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
-    llvm::Type *argType =
+      AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
+      llvm::Type *argType =
        (llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
-    AssertPos(currentPos, llvm::PointerType::classof(argType));
-    llvm::PointerType *pt =
+      AssertPos(currentPos, llvm::PointerType::classof(argType));
+      llvm::PointerType *pt =
        llvm::dyn_cast<llvm::PointerType>(argType);
-    AssertPos(currentPos, llvm::StructType::classof(pt->getElementType()));
-    llvm::StructType *argStructType =
+      AssertPos(currentPos, llvm::StructType::classof(pt->getElementType()));
+      llvm::StructType *argStructType =
        static_cast<llvm::StructType *>(pt->getElementType());

-    llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
-    AssertPos(currentPos, falloc != NULL);
-    llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
-    if (structSize->getType() != LLVMTypes::Int64Type)
+      llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
+      AssertPos(currentPos, falloc != NULL);
+      llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
+      if (structSize->getType() != LLVMTypes::Int64Type)
        // ISPCAlloc expects the size as an uint64_t, but on 32-bit
        // targets, SizeOf returns a 32-bit value
        structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
-                              "struct_size_to_64");
-    int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth());
+            "struct_size_to_64");
+      int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth());

-    std::vector<llvm::Value *> allocArgs;
-    allocArgs.push_back(launchGroupHandlePtr);
-    allocArgs.push_back(structSize);
-    allocArgs.push_back(LLVMInt32(align));
-    llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
-    llvm::Value *argmem = BitCastInst(voidmem, pt);
+      std::vector<llvm::Value *> allocArgs;
+      allocArgs.push_back(launchGroupHandlePtr);
+      allocArgs.push_back(structSize);
+      allocArgs.push_back(LLVMInt32(align));
+      llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
+      llvm::Value *argmem = BitCastInst(voidmem, pt);

-    // Copy the values of the parameters into the appropriate place in
-    // the argument block
-    for (unsigned int i = 0; i < argVals.size(); ++i) {
+      // Copy the values of the parameters into the appropriate place in
+      // the argument block
+      for (unsigned int i = 0; i < argVals.size(); ++i) {
        llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
        // don't need to do masked store here, I think
        StoreInst(argVals[i], ptr);
-    }
+      }

-    if (argStructType->getNumElements() == argVals.size() + 1) {
+      if (argStructType->getNumElements() == argVals.size() + 1) {
        // copy in the mask
        llvm::Value *mask = GetFullMask();
        llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
-                                            "funarg_mask");
+            "funarg_mask");
        StoreInst(mask, ptr);
-    }
+      }

-    // And emit the call to the user-supplied task launch function, passing
-    // a pointer to the task function being called and a pointer to the
-    // argument block we just filled in
-    llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
-    llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
-    AssertPos(currentPos, flaunch != NULL);
-    std::vector<llvm::Value *> args;
-    args.push_back(launchGroupHandlePtr);
-    args.push_back(fptr);
-    args.push_back(voidmem);
-    args.push_back(launchCount[0]);
-    args.push_back(launchCount[1]);
-    args.push_back(launchCount[2]);
-    return CallInst(flaunch, NULL, args, "");
+      // And emit the call to the user-supplied task launch function, passing
+      // a pointer to the task function being called and a pointer to the
+      // argument block we just filled in
+      llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
+      llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
+      AssertPos(currentPos, flaunch != NULL);
+      std::vector<llvm::Value *> args;
+      args.push_back(launchGroupHandlePtr);
+      args.push_back(fptr);
+      args.push_back(voidmem);
+      args.push_back(launchCount[0]);
+      args.push_back(launchCount[1]);
+      args.push_back(launchCount[2]);
+      return CallInst(flaunch, NULL, args, "");
+    }
+    else /* NVPTX */
+    {
+      if (callee == NULL) {
+        AssertPos(currentPos, m->errorCount > 0);
+        return NULL;
+      }
+      launchedTasks = true;
+
+      AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
+      std::vector<llvm::Type*> argTypes;
+
+      llvm::Function *F = llvm::dyn_cast<llvm::Function>(callee);
+      const unsigned int nArgs = F->arg_size();
+      llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+      for (; I != E; ++I) 
+        argTypes.push_back(I->getType());
+      llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
+      llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
+      llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
+      if (structSize->getType() != LLVMTypes::Int64Type)
+        structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
+            "struct_size_to_64");
+
+      const int align = 8;
+      llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
+      AssertPos(currentPos, falloc != NULL);
+      std::vector<llvm::Value *> allocArgs;
+      allocArgs.push_back(launchGroupHandlePtr);
+      allocArgs.push_back(structSize);
+      allocArgs.push_back(LLVMInt32(align));
+      llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
+      llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
+      llvm::BasicBlock* if_true  = CreateBasicBlock("if_true");
+      llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
+
+      /* check if the pointer returned by ISPCAlloc is not NULL 
+       * --------------
+       * this is a workaround for not checking the value of programIndex 
+       * because ISPCAlloc will return NULL pointer for all programIndex > 0
+       * of course, if ISPAlloc fails to get parameter buffer, the pointer for programIndex = 0
+       * will also be NULL
+       * This check must be added, and also rewrite the code to make it less opaque 
+       */
+      llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
+      BranchInst(if_true, if_false, cmp1);
+
+      /**********************/
+      bblock = if_true;    
+
+      // label_if_then block:
+      llvm::Type *pt = llvm::PointerType::getUnqual(st);
+      llvm::Value *argmem = BitCastInst(voidmem, pt);
+      for (unsigned int i = 0; i < argVals.size(); ++i) 
+      {
+        llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
+        // don't need to do masked store here, I think
+        StoreInst(argVals[i], ptr);
+      }
+      if (nArgs == argVals.size() + 1) {
+        // copy in the mask
+        llvm::Value *mask = GetFullMask();
+        llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
+            "funarg_mask");
+        StoreInst(mask, ptr);
+      }
+      BranchInst(if_false);
+
+      /**********************/
+      bblock = if_false;
+
+      llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
+      llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
+      AssertPos(currentPos, flaunch != NULL);
+      std::vector<llvm::Value *> args;
+      args.push_back(launchGroupHandlePtr);
+      args.push_back(fptr);
+      args.push_back(voidmem);
+      args.push_back(launchCount[0]);
+      args.push_back(launchCount[1]);
+      args.push_back(launchCount[2]);
+      llvm::Value *ret =  CallInst(flaunch, NULL, args, "");
+      return ret;
+    }
 }


 void
 FunctionEmitContext::SyncInst() {
-    llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
-    llvm::Value *nullPtrValue =
+    if (g->target->getISA() != Target::NVPTX)
+    {
+      llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
+      llvm::Value *nullPtrValue =
        llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
-    llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
-                                   llvm::CmpInst::ICMP_NE,
-                                   launchGroupHandle, nullPtrValue);
-    llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
-    llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
-    BranchInst(bSync, bPostSync, nonNull);
+      llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
+          llvm::CmpInst::ICMP_NE,
+          launchGroupHandle, nullPtrValue);
+      llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
+      llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
+      BranchInst(bSync, bPostSync, nonNull);

-    SetCurrentBasicBlock(bSync);
-    llvm::Function *fsync = m->module->getFunction("ISPCSync");
-    if (fsync == NULL)
+      SetCurrentBasicBlock(bSync);
+      llvm::Function *fsync = m->module->getFunction("ISPCSync");
+      if (fsync == NULL)
        FATAL("Couldn't find ISPCSync declaration?!");
-    CallInst(fsync, NULL, launchGroupHandle, "");
+      CallInst(fsync, NULL, launchGroupHandle, "");

-    // zero out the handle so that if ISPCLaunch is called again in this
-    // function, it knows it's starting out from scratch
-    StoreInst(nullPtrValue, launchGroupHandlePtr);
+      // zero out the handle so that if ISPCLaunch is called again in this
+      // function, it knows it's starting out from scratch
+      StoreInst(nullPtrValue, launchGroupHandlePtr);

-    BranchInst(bPostSync);
+      BranchInst(bPostSync);

-    SetCurrentBasicBlock(bPostSync);
+      SetCurrentBasicBlock(bPostSync);
+    }
+    else /* NVPTX: don't do test, just call sync */
+    {
+      llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
+      llvm::Value *nullPtrValue =
+        llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
+      llvm::Function *fsync = m->module->getFunction("ISPCSync");
+      if (fsync == NULL)
+        FATAL("Couldn't find ISPCSync declaration?!");
+      CallInst(fsync, NULL, launchGroupHandle, "");
+      StoreInst(nullPtrValue, launchGroupHandlePtr);
+    }
 }


--- a/ctx.h
+++ b/ctx.h
@@ -291,6 +291,13 @@ public:
        of the mask is on. */
    llvm::Value *LaneMask(llvm::Value *mask);

+
+    /** Issues a call to __insert_int8/int16/int32/int64/float/double */
+    llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
+    /** Issues a call to __extract_int8/int16/int32/int64/float/double */
+    llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
+
+
    /** Given two masks of type LLVMTypes::MaskType, return an i1 value
        that indicates whether the two masks are equal. */
    llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
@@ -298,6 +305,7 @@ public:
    /** Generate ConstantVector, which contains ProgramIndex, i.e.
        < i32 0, i32 1, i32 2, i32 3> */
    llvm::Value *ProgramIndexVector(bool is32bits = true);
+    llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);

    /** Given a string, create an anonymous global variable to hold its
        value and return the pointer to the string. */
--- a/decl.cpp
+++ b/decl.cpp
@@ -168,6 +168,13 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
    retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);

    if (soaWidth > 0) {
+#if 0  /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */
+        if (g->target->getISA() == Target::NVPTX)
+        {
+            Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target.");
+            return NULL;
+        }
+#endif
        const StructType *st = CastType<StructType>(retType);

        if (st == NULL) {
@@ -402,6 +409,13 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
            return;
        }

+#if 0 /* NVPTX */
+        if (baseType->IsUniformType())
+        {
+          fprintf(stderr, " detected uniform array of size= %d  array= %s\n" ,arraySize,
+              baseType->IsArrayType() ? " true " : " false ");
+        }
+#endif
        const Type *arrayType = new ArrayType(baseType, arraySize);
        if (child != NULL) {
            child->InitFromType(arrayType, ds);
@@ -530,9 +544,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {

        returnType = returnType->ResolveUnboundVariability(Variability::Varying);

+        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
        bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);
        bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
-        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
        bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);

        if (isExported && isTask) {
@@ -541,9 +555,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
            return;
        }
        if (isExternC && isTask) {
-            Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
-                  "qualifiers");
-            return;
+          Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
+              "qualifiers");
+          return;
        }
        if (isExternC && isExported) {
            Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
--- a/expr.cpp
+++ b/expr.cpp
@@ -7867,6 +7867,12 @@ SizeOfExpr::TypeCheck() {
              "struct type \"%s\".", type->GetString().c_str());
        return NULL;
    }
+    if (type != NULL)
+      if (g->target->getISA() == Target::NVPTX && type->IsVaryingType())
+      {
+        Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target.");
+        return NULL;
+      }

    return this;
 }
@@ -8661,6 +8667,11 @@ NewExpr::TypeCheck() {
        AssertPos(pos, m->errorCount > 0);
        return NULL;
    }
+    if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType())
+    {
+      Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target.");
+      return NULL;
+    }
    if (CastType<UndefinedStructType>(allocType) != NULL) {
        Error(pos, "Can't dynamically allocate storage for declared "
              "but not defined type \"%s\".", allocType->GetString().c_str());
--- a/func.cpp
+++ b/func.cpp
@@ -47,6 +47,7 @@
 #include <stdio.h>

 #if defined(LLVM_3_1) || defined(LLVM_3_2)
+  #include <llvm/Metadata.h>
  #include <llvm/LLVMContext.h>
  #include <llvm/Module.h>
  #include <llvm/Type.h>
@@ -54,6 +55,7 @@
  #include <llvm/Intrinsics.h>
  #include <llvm/DerivedTypes.h>
 #else
+  #include <llvm/IR/Metadata.h>
  #include <llvm/IR/LLVMContext.h>
  #include <llvm/IR/Module.h>
  #include <llvm/IR/Type.h>
@@ -128,7 +130,7 @@ Function::Function(Symbol *s, Stmt *c) {
            sym->parentFunction = this;
    }

-    if (type->isTask) {
+    if (type->isTask && g->target->getISA() != Target::NVPTX) {
        threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
        Assert(threadIndexSym);
        threadCountSym = m->symbolTable->LookupVariable("threadCount");
@@ -239,7 +241,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
 #endif
    const FunctionType *type = CastType<FunctionType>(sym->type);
    Assert(type != NULL);
-    if (type->isTask == true) {
+    if (type->isTask == true && g->target->getISA() != Target::NVPTX)  {
        // For tasks, there should always be three parameters: the
        // pointer to the structure that holds all of the arguments, the
        // thread index, and the thread count variables.
@@ -337,6 +339,16 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
            ctx->SetFunctionMask(argIter);
            Assert(++argIter == function->arg_end());
        }
+        if (type->isTask == true && g->target->getISA() == Target::NVPTX)
+        {
+          llvm::NamedMDNode* annotations =
+            m->module->getOrInsertNamedMetadata("nvvm.annotations");
+          llvm::SmallVector<llvm::Value*, 3> av;
+          av.push_back(function);
+          av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
+          av.push_back(LLVMInt32(1));
+          annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
+        }
    }

    // Finally, we can generate code for the function
@@ -497,8 +509,23 @@ Function::GenerateIR() {
                llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true);
                llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
                std::string functionName = sym->name;
+
                if (g->mangleFunctionsWithTarget)
                    functionName += std::string("_") + g->target->GetISAString();
+
+                if (g->target->getISA() == Target::NVPTX)
+                {
+                  functionName += std::string("___export");  /* add ___export to the end, for ptxcc to recognize it is exported */
+#if 0
+                  llvm::NamedMDNode* annotations =
+                    m->module->getOrInsertNamedMetadata("nvvm.annotations");
+                  llvm::SmallVector<llvm::Value*, 3> av;
+                  av.push_back(function);
+                  av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
+                  av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
+                  annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); 
+#endif
+                }
                llvm::Function *appFunction =
                    llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
 #if defined(LLVM_3_1)
@@ -538,6 +565,16 @@ Function::GenerateIR() {
                            FATAL("Function verificication failed");
                        }
                    }
+                    if (g->target->getISA() == Target::NVPTX)
+                    {
+                      llvm::NamedMDNode* annotations =
+                        m->module->getOrInsertNamedMetadata("nvvm.annotations");
+                      llvm::SmallVector<llvm::Value*, 3> av;
+                      av.push_back(appFunction);
+                      av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
+                      av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
+                      annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); 
+                    }
                }
            }
        }
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -280,6 +280,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
            arch = "arm";
        else
 #endif
+         if(!strncmp(isa, "nvptx", 5))
+           arch = "nvptx64";
+         else
            arch = "x86-64";
    }

@@ -707,6 +710,19 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_maskBitCount = 32;
    }
 #endif
+    else if (!strcasecmp(isa, "nvptx")) 
+    {
+        this->m_isa = Target::NVPTX;
+        this->m_cpu = "sm_35";
+        this->m_nativeVectorWidth = 32;
+        this->m_nativeVectorAlignment = 32;
+        this->m_vectorWidth = 1;
+        this->m_hasHalf = true;
+        this->m_maskingIsFree = true;
+        this->m_maskBitCount = 1;
+        this->m_hasTranscendentals = false; 
+        this->m_hasGather = this->m_hasScatter = false;
+    }
    else {
        Error(SourcePos(), "Target \"%s\" is unknown.  Choices are: %s.",
                isa, SupportedTargets());
@@ -784,7 +800,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        // Initialize target-specific "target-feature" attribute.
        if (!m_attributes.empty()) {
            llvm::AttrBuilder attrBuilder;
-            attrBuilder.addAttribute("target-cpu", this->m_cpu);
+            if (m_isa != Target::NVPTX)
+              attrBuilder.addAttribute("target-cpu", this->m_cpu);
            attrBuilder.addAttribute("target-features", this->m_attributes);
            this->m_tf_attributes = new llvm::AttributeSet(
                llvm::AttributeSet::get(
@@ -839,7 +856,7 @@ Target::SupportedTargets() {
        "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 "
        "avx2-i32x8, avx2-i32x16, avx2-i64x4, "
        "generic-x1, generic-x4, generic-x8, generic-x16, "
-        "generic-x32, generic-x64";
+        "generic-x32, generic-x64, nvptx";
 }


@@ -866,6 +883,8 @@ Target::GetTripleString() const {
            triple.setArchName("i386");
        else if (m_arch == "x86-64")
            triple.setArchName("x86_64");
+        else if (m_arch == "nvptx64")
+          triple = llvm::Triple("nvptx64", "nvidia", "cuda");
        else
            triple.setArchName(m_arch);
    }
@@ -898,6 +917,8 @@ Target::ISAToString(ISA isa) {
        return "avx2";
    case Target::GENERIC:
        return "generic";
+    case Target::NVPTX:
+        return "nvptx";
    default:
        FATAL("Unhandled target in ISAToString()");
    }
@@ -936,6 +957,8 @@ Target::ISAToTargetString(ISA isa) {
        return "avx2-i32x8";
    case Target::GENERIC:
        return "generic-4";
+    case Target::NVPTX:
+        return "nvptx";
    default:
        FATAL("Unhandled target in ISAToTargetString()");
    }
--- a/ispc.h
+++ b/ispc.h
@@ -179,7 +179,7 @@ public:
        flexible/performant of them will apear last in the enumerant.  Note
        also that __best_available_isa() needs to be updated if ISAs are
        added or the enumerant values are reordered.  */
-    enum ISA {
+    enum ISA { NVPTX,
 #ifdef ISPC_ARM_ENABLED
               NEON32, NEON16, NEON8,
 #endif
@@ -606,6 +606,7 @@ struct Globals {
    /** Indicates that alignment in memory allocation routines should be
        forced to have given value. -1 value means natural alignment for the platforms. */
    int forceAlignment;
+    std::string PtxString;
 };

 enum {
--- a/main.cpp
+++ b/main.cpp
@@ -320,6 +320,11 @@ int main(int Argc, char *Argv[]) {
    LLVMInitializeARMTargetMC();
 #endif

+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXAsmPrinter();
+    LLVMInitializeNVPTXTargetMC();
+
    char *file = NULL;
    const char *headerFileName = NULL;
    const char *outFileName = NULL;
--- a/module.cpp
+++ b/module.cpp
@@ -444,6 +444,38 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
        return;
    }

+    if (g->target->getISA() == Target::NVPTX && 
+#if 0
+        !type->IsConstType()  &&
+#endif
+#if 1
+        at != NULL &&
+#endif
+        type->IsVaryingType())
+    {
+      Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
+      return;
+#if 0
+        int nel = 32;  /* warp-size */
+        if (type->IsArrayType())
+        {
+          const ArrayType *at = CastType<ArrayType>(type);
+          /* we must scale # elements by 4, because a thread-block will run 4 warps
+           * or 128 threads.
+           * ***note-to-me***:please define these value (128threads/4warps)
+           * in nvptx-target definition
+           * instead of compile-time constants 
+           */
+          nel *= at->GetElementCount();
+          assert (!type->IsSOAType());
+          type = new ArrayType(at->GetElementType()->GetAsUniformType(), nel);
+        }
+        else
+          type = new ArrayType(type->GetAsUniformType(), nel);
+#endif
+    }
+
+
    llvm::Type *llvmType = type->LLVMType(g->ctx);
    if (llvmType == NULL)
        return;
@@ -643,6 +675,21 @@ lCheckExportedParameterTypes(const Type *type, const std::string &name,
    }
 }

+static void
+lCheckTaskParameterTypes(const Type *type, const std::string &name,
+                             SourcePos pos) {
+  if (g->target->getISA() != Target::NVPTX) 
+    return;
+  if (lRecursiveCheckValidParamType(type, false) == false) {
+    if (CastType<VectorType>(type))
+      Error(pos, "Vector-typed parameter \"%s\" is illegal in a task "
+          "function with \"nvptx\" target.", name.c_str());
+    else
+      Error(pos, "Varying parameter \"%s\" is illegal in a task function with \"nvptx\" target.",
+          name.c_str());
+    }
+}
+

 /** Given a function type, loop through the function parameters and see if
    any are StructTypes.  If so, issue an error; this is currently broken
@@ -801,7 +848,8 @@ Module::AddFunctionDeclaration(const std::string &name,
 #else // LLVM 3.1 and 3.3+
        function->addFnAttr(llvm::Attribute::AlwaysInline);
 #endif
-    if (functionType->isTask)
+    /* evghenii: fails function verification when "if" executed in nvptx target */
+    if (functionType->isTask && g->target->getISA() != Target::NVPTX)
        // This also applies transitively to members I think?
 #if defined(LLVM_3_1)
        function->setDoesNotAlias(1, true);
@@ -822,6 +870,13 @@ Module::AddFunctionDeclaration(const std::string &name,
        Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false)
        Error(pos, "Task-qualified functions must have void return type.");

+    if (g->target->getISA() == Target::NVPTX &&
+        Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false &&
+        functionType->isExported)
+    {
+        Error(pos, "Export-qualified functions must have void return type with \"nvptx\" target.");
+    }
+
    if (functionType->isExported || functionType->isExternC)
        lCheckForStructParameters(functionType, pos);

@@ -841,6 +896,9 @@ Module::AddFunctionDeclaration(const std::string &name,
        if (functionType->isExported) {
          lCheckExportedParameterTypes(argType, argName, argPos);
        }
+        if (functionType->isTask) {
+          lCheckTaskParameterTypes(argType, argName, argPos);
+        }

        // ISPC assumes that no pointers alias.  (It should be possible to
        // specify when this is not the case, but this should be the
@@ -959,7 +1017,13 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
        const char *fileType = NULL;
        switch (outputType) {
        case Asm:
-            if (strcasecmp(suffix, "s"))
+            if (g->target->getISA() != Target::NVPTX)
+            {
+              if (strcasecmp(suffix, "s"))
+                fileType = "assembly";
+            }
+            else
+              if (strcasecmp(suffix, "ptx"))
                fileType = "assembly";
            break;
        case Bitcode:
@@ -1057,6 +1121,11 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) {
    }

    llvm::raw_fd_ostream fos(fd, (fd != 1), false);
+    if (g->target->getISA() == Target::NVPTX)
+    {
+      const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
+      module->setDataLayout(dl_string);
+    }
    llvm::WriteBitcodeToFile(module, fos);
    return true;
 }
@@ -2095,6 +2164,24 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
            opts.addMacroDef(g->cppArgs[i].substr(2));
        }
    }
+    if (g->target->getISA() == Target::NVPTX)
+    {
+      opts.addMacroDef("__NVPTX__");
+      opts.addMacroDef("programIndex=__programIndex()");
+      opts.addMacroDef("cif=if");
+      opts.addMacroDef("cfor=for");
+      opts.addMacroDef("cwhile=while");
+      opts.addMacroDef("ccontinue=continue");
+      opts.addMacroDef("cdo=do");
+      opts.addMacroDef("taskIndex0=__taskIndex0()");
+      opts.addMacroDef("taskIndex1=__taskIndex1()");
+      opts.addMacroDef("taskIndex2=__taskIndex2()");
+      opts.addMacroDef("taskIndex=__taskIndex()");
+      opts.addMacroDef("taskCount0=__taskCount0()");
+      opts.addMacroDef("taskCount1=__taskCount1()");
+      opts.addMacroDef("taskCount2=__taskCount2()");
+      opts.addMacroDef("taskCount=__taskCount()");
+    }

 #if defined(LLVM_3_1)
    inst.getLangOpts().BCPLComment = 1;
@@ -2540,6 +2627,29 @@ lCreateDispatchModule(std::map<std::string, FunctionTargetVariants> &functions)
    return module;
 }

+static std::string lCBEMangle(const std::string &S) {
+  std::string Result;
+
+  for (unsigned i = 0, e = S.size(); i != e; ++i) {
+    if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') ||
+                     (S[i] == '<' && S[i+1] == '<'))) {
+      Result += '_';
+      Result += 'A'+(S[i]&15);
+      Result += 'A'+((S[i]>>4)&15);
+      Result += '_';
+      i++;
+    } else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
+      Result += S[i];
+    } else {
+      Result += '_';
+      Result += 'A'+(S[i]&15);
+      Result += 'A'+((S[i]>>4)&15);
+      Result += '_';
+    }
+  }
+  return Result;
+}
+

 int
 Module::CompileAndOutput(const char *srcFile,
@@ -2555,7 +2665,7 @@ Module::CompileAndOutput(const char *srcFile,
                         const char *hostStubFileName,
                         const char *devStubFileName)
 {
-    if (target == NULL || strchr(target, ',') == NULL) {
+  if (target == NULL || strchr(target, ',') == NULL) {
        // We're only compiling to a single target
        g->target = new Target(arch, cpu, target, generatePIC);
        if (!g->target->isValid())
@@ -2563,6 +2673,32 @@ Module::CompileAndOutput(const char *srcFile,

        m = new Module(srcFile);
        if (m->CompileFile() == 0) {
+
+            /* NVPTX:
+             * for PTX target replace '.' with '_' in all global variables 
+             * a PTX identifier name must match [a-zA-Z$_][a-zA-Z$_0-9]*
+             */
+            if (g->target->getISA() == Target::NVPTX)
+            {
+              /* mangle global variables names */
+              {
+                llvm::Module::global_iterator I = m->module->global_begin(), E = m->module->global_end();
+                for (; I != E; I++)
+                  I->setName(lCBEMangle(I->getName()));
+              }
+
+              /* mangle functions names */
+              {
+                llvm::Module::iterator I = m->module->begin(), E = m->module->end();
+                for (; I != E; I++)
+                {
+                  std::string str = I->getName();
+                  if (str.find("operator") != std::string::npos)
+                    I->setName(lCBEMangle(str));
+                }
+              }
+            }
+
            if (outputType == CXX) {
                if (target == NULL || strncmp(target, "generic-", 8) != 0) {
                    Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" "
@@ -2765,4 +2901,5 @@ Module::CompileAndOutput(const char *srcFile,

        return errorCount > 0;
    }
+    return true;
 }
--- a/20
+++ b/20
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+PATH=$ISPC_HOME/examples_ptx/ptxcc:$ISPC_HOME/examples_ptx/ptxgen:$PATH
+PTXCC=ptxcc
+ARGS=${@:2}
+if [ "$NVVM" == "1" ];
+then
+  LLVM32=$HOME/usr/local/llvm/bin-3.2
+  LLVMDIS=$LLVM32/bin/llvm-dis
+  PTXGEN=$ISPC_HOME/examples_ptx/ptxgen/ptxgen
+  $($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll > $1.ptx) && \
+  $($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \
+  $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
+else
+  $($PTXCC $1 -o $1.o -Xnvcc="-G") && \
+  $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
+fi
+
+
+
--- a/opt.cpp
+++ b/opt.cpp
@@ -133,6 +133,7 @@ static llvm::Pass *CreateDebugPass(char * output);
 static llvm::Pass *CreateReplaceStdlibShiftPass();

 static llvm::Pass *CreateFixBooleanSelectPass();
+static llvm::Pass *CreatePromoteLocalToPrivatePass();

 #define DEBUG_START_PASS(NAME)                                 \
    if (g->debugPrint &&                                       \
@@ -496,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) {
        // run absolutely no optimizations, since the front-end needs us to
        // take the various __pseudo_* functions it has emitted and turn
        // them into something that can actually execute.
-        optPM.add(CreateImproveMemoryOpsPass(), 100);
+        
+        if (g->opt.disableGatherScatterOptimizations == false &&
+            g->target->getVectorWidth() > 1) 
+          optPM.add(CreateImproveMemoryOpsPass(), 100);
+
        if (g->opt.disableHandlePseudoMemoryOps == false)
            optPM.add(CreateReplacePseudoMemoryOpsPass());

@@ -519,6 +524,8 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::initializeInstrumentation(*registry);
        llvm::initializeTarget(*registry);

+        if (g->target->getISA() == Target::NVPTX)
+          optPM.add(CreatePromoteLocalToPrivatePass());
        optPM.add(llvm::createGlobalDCEPass(), 185);

        // Setup to use LLVM default AliasAnalysis
@@ -577,7 +584,10 @@ Optimize(llvm::Module *module, int optLevel) {
        optPM.add(llvm::createGlobalOptimizerPass());
        optPM.add(llvm::createReassociatePass());
        optPM.add(llvm::createIPConstantPropagationPass());
-        optPM.add(CreateReplaceStdlibShiftPass(),229);
+
+        if (g->target->getISA() != Target::NVPTX)
+          optPM.add(CreateReplaceStdlibShiftPass(),229);
+
        optPM.add(llvm::createDeadArgEliminationPass(),230);
        optPM.add(llvm::createInstructionCombiningPass());
        optPM.add(llvm::createCFGSimplificationPass());
@@ -689,6 +699,111 @@ Optimize(llvm::Module *module, int optLevel) {

        // Should be the last
        optPM.add(CreateFixBooleanSelectPass(), 400);
+
+        if (g->target->getISA() == Target::NVPTX)
+        {
+          optPM.add(llvm::createGlobalDCEPass());
+
+          optPM.add(llvm::createTypeBasedAliasAnalysisPass());
+          optPM.add(llvm::createBasicAliasAnalysisPass());
+          optPM.add(llvm::createCFGSimplificationPass());
+          // Here clang has an experimental pass SROAPass instead of
+          // ScalarReplAggregatesPass. We should add it in the future.
+          optPM.add(llvm::createScalarReplAggregatesPass());
+          optPM.add(llvm::createEarlyCSEPass());
+          optPM.add(llvm::createLowerExpectIntrinsicPass());
+          optPM.add(llvm::createTypeBasedAliasAnalysisPass());
+          optPM.add(llvm::createBasicAliasAnalysisPass());
+
+          // Early optimizations to try to reduce the total amount of code to
+          // work with if we can
+          optPM.add(llvm::createReassociatePass());
+          optPM.add(llvm::createConstantPropagationPass());
+          optPM.add(llvm::createDeadInstEliminationPass());
+          optPM.add(llvm::createCFGSimplificationPass());
+
+          optPM.add(llvm::createPromoteMemoryToRegisterPass());
+          optPM.add(llvm::createAggressiveDCEPass());
+
+
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createDeadInstEliminationPass());
+
+          // On to more serious optimizations
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createCFGSimplificationPass());
+          optPM.add(llvm::createPromoteMemoryToRegisterPass());
+          optPM.add(llvm::createGlobalOptimizerPass());
+          optPM.add(llvm::createReassociatePass());
+          optPM.add(llvm::createIPConstantPropagationPass());
+
+          optPM.add(llvm::createDeadArgEliminationPass());
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createCFGSimplificationPass());
+          optPM.add(llvm::createPruneEHPass());
+          optPM.add(llvm::createFunctionAttrsPass());
+          optPM.add(llvm::createFunctionInliningPass());
+          optPM.add(llvm::createConstantPropagationPass());
+          optPM.add(llvm::createDeadInstEliminationPass());
+          optPM.add(llvm::createCFGSimplificationPass());
+
+          optPM.add(llvm::createArgumentPromotionPass());
+#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3)
+          // Starting from 3.4 this functionality was moved to
+          // InstructionCombiningPass. See r184459 for details.
+          optPM.add(llvm::createSimplifyLibCallsPass());
+#endif
+          optPM.add(llvm::createAggressiveDCEPass());
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createJumpThreadingPass());
+          optPM.add(llvm::createCFGSimplificationPass());
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createTailCallEliminationPass());
+
+          optPM.add(llvm::createInstructionCombiningPass());
+
+          optPM.add(llvm::createFunctionInliningPass());
+          optPM.add(llvm::createConstantPropagationPass());
+
+          optPM.add(llvm::createInstructionCombiningPass());
+
+          optPM.add(llvm::createIPSCCPPass());
+          optPM.add(llvm::createDeadArgEliminationPass());
+          optPM.add(llvm::createAggressiveDCEPass());
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createCFGSimplificationPass());
+
+          optPM.add(llvm::createFunctionInliningPass());
+          optPM.add(llvm::createArgumentPromotionPass());
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createCFGSimplificationPass());
+          optPM.add(llvm::createReassociatePass());
+          optPM.add(llvm::createLoopRotatePass());
+          optPM.add(llvm::createLICMPass());
+//          optPM.add(llvm::createLoopUnswitchPass(false));
+#if 1
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createIndVarSimplifyPass());
+          optPM.add(llvm::createLoopIdiomPass());
+          optPM.add(llvm::createLoopDeletionPass());
+          optPM.add(llvm::createLoopUnrollPass());
+          optPM.add(llvm::createGVNPass());
+          optPM.add(llvm::createMemCpyOptPass());
+          optPM.add(llvm::createSCCPPass());
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createJumpThreadingPass());
+          optPM.add(llvm::createCorrelatedValuePropagationPass());
+          optPM.add(llvm::createDeadStoreEliminationPass());
+          optPM.add(llvm::createAggressiveDCEPass());
+          optPM.add(llvm::createCFGSimplificationPass());
+          optPM.add(llvm::createInstructionCombiningPass());
+          optPM.add(llvm::createFunctionInliningPass());
+          optPM.add(llvm::createAggressiveDCEPass());
+          optPM.add(llvm::createStripDeadPrototypesPass());
+          optPM.add(llvm::createGlobalDCEPass());
+          optPM.add(llvm::createConstantMergePass());
+#endif
+        }
    }

    // Finish up by making sure we didn't mess anything up in the IR along
@@ -5267,4 +5382,63 @@ CreateFixBooleanSelectPass() {
    return new FixBooleanSelectPass();
 }

+///////////////////////////////////////////////////////////////////////////////
+// Detect addrspace(3)
+///////////////////////////////////////////////////////////////////////////////
+
+class PromoteLocalToPrivatePass: public llvm::BasicBlockPass
+{
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
+
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+};
+
+char PromoteLocalToPrivatePass::ID = 0;
+
+bool 
+PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB)
+{
+  std::vector<llvm::AllocaInst*> Allocas;
+
+  bool modifiedAny  = false;
+
+  llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
+
+  // Find allocas that are safe to promote, by looking at all instructions in
+  // the entry node
+  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+  {
+    llvm::Instruction *inst = &*I;
+    if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
+    {
+      llvm::Function *func = ci->getCalledFunction();
+      if (cvtFunc && (cvtFunc == func))
+      {
+#if 0
+        fprintf(stderr , "--found cvt-- name= %s \n",
+            I->getName().str().c_str());
+#endif
+        llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
+        assert(alloca != NULL);
+#if 0
+        const int align = 8; // g->target->getNativeVectorAlignment();
+        alloca->setAlignment(align);
+#endif
+        ci->replaceAllUsesWith(alloca);
+        modifiedAny = true;
+      }
+    }
+  }
+  return modifiedAny;
+}
+
+static llvm::Pass *
+CreatePromoteLocalToPrivatePass() {
+    return new PromoteLocalToPrivatePass();
+}
+
+
+

--- a/ptxtestcc.sh
+++ b/ptxtestcc.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+LLC=$HOME/usr/local/llvm/bin-trunk/bin/llc
+DIS=$HOME/usr/local/llvm/bin-3.2/bin/llvm-dis
+
+ISPC=ispc
+PTXCC=ptxcc
+PTXGEN=~/ptxgen
+$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$LLC -march=nvptx64 -mcpu=sm_35 -o $1.ptx) && \
+#$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$DIS -o $1_32_ptx.ll && $PTXGEN $1_32_ptx.ll > $1.ptx) && \
+$($PTXCC $1.ptx  -Xptxas=-v -o $1.ptx.o) && \
+nvcc -o test_nvptx test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.ptx.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt -DTEST_SIG=$2
+
+
+
--- a/run_tests.py
+++ b/run_tests.py
@@ -204,6 +204,8 @@ def run_test(testname):
            return (1, 0)
        else:
            global is_generic_target
+            global is_nvptx_target
+            global is_nvptx_nvvm
            if is_windows:
                if is_generic_target:
                    obj_name = "%s.cpp" % os.path.basename(filename)
@@ -218,6 +220,13 @@ def run_test(testname):
            else:
                if is_generic_target:
                    obj_name = "%s.cpp" % testname
+                elif is_nvptx_target:
+                  if os.environ.get("NVVM") == "1":
+                    is_nvptx_nvvm = True
+                    obj_name = "%s.bc" % testname
+                  else:
+                    obj_name = "%s.ptx" % testname
+                    is_nvptx_nvvm = False
                else:
                    obj_name = "%s.o" % testname
                exe_name = "%s.run" % testname
@@ -248,13 +257,32 @@ def run_test(testname):
                    cc_cmd += ' -Wl,-no_pie'
                if should_fail:
                    cc_cmd += " -DEXPECT_FAILURE"
+                if is_nvptx_target:
+                  nvptxcc_exe = "nvptxcc"
+                  nvptxcc_exe_rel = add_prefix(nvptxcc_exe)
+                  cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \
+                      (nvptxcc_exe_rel, obj_name, match, exe_name)

-            ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \
+            ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \
                       (filename, obj_name, options.arch, options.target)
            if options.no_opt:
                ispc_cmd += " -O0" 
            if is_generic_target:
                ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file)
+            if is_nvptx_target:
+                filename4ptx = filename+".ptx.parsed_ispc"
+                grep_cmd = "grep -v 'export uniform int width' %s > %s " % \
+                    (filename, filename4ptx)
+                if options.verbose:
+                  print "Grepping: %s" % grep_cmd
+                sp = subprocess.Popen(grep_cmd, shell=True)
+                sp.communicate()
+                if is_nvptx_nvvm:
+                  ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-llvm --target=%s" % \
+                         (filename4ptx, obj_name, options.target)
+                else:
+                  ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \
+                         (filename4ptx, obj_name, options.target)

        # compile the ispc code, make the executable, and run it...
        (compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd], 
@@ -269,7 +297,7 @@ def run_test(testname):
                    basename = os.path.basename(filename)
                    os.unlink("%s.pdb" % basename)
                    os.unlink("%s.ilk" % basename)
-            os.unlink(obj_name)
+#            os.unlink(obj_name)
        except:
            None

@@ -290,6 +318,7 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test
    ispc_exe = glob_var[3]
    global is_generic_target
    is_generic_target = glob_var[4]
+    global is_nvptx_target
    global run_tests_log
    run_tests_log = glob_var[5]    

@@ -505,6 +534,8 @@ def run_tests(options1, args, print_version):
 
    if options.target == 'neon':
        options.arch = 'arm'
+    if options.target == "nvptx":
+        options.arch = "nvptx64"
 
    # use relative path to not depend on host directory, which may possibly
    # have white spaces and unicode characters.
@@ -531,8 +562,10 @@ def run_tests(options1, args, print_version):
    ispc_exe += " " + options.ispc_flags

    global is_generic_target
+    global is_nvptx_target
    is_generic_target = (options.target.find("generic-") != -1 and
                     options.target != "generic-1" and options.target != "generic-x1")
+    is_nvptx_target = (options.target.find("nvptx") != -1)
    if is_generic_target and options.include_file == None:
        if options.target == "generic-4" or options.target == "generic-x4":
            error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2)
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -57,6 +57,31 @@
  #error Unknown value of ISPC_MASK_BITS
 #endif

+
+
+///////////////////////////////////////////////////////////////////////////
+// CUDA Specific primitives
+//
+/***************/
+
+__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); }
+__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); }
+__declspec(safe,cost0) static inline uniform int __warpIndex()    { return __warp_index();    }
+
+/***************/
+
+__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); }
+__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); }
+__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); }
+__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); }
+
+/***************/
+
+__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); }
+__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); }
+__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); }
+__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); }
+
 ///////////////////////////////////////////////////////////////////////////
 // Low level primitives

@@ -464,7 +489,10 @@ __declspec(safe)
 static inline uniform int popcnt(bool v) {
    // As with any() and all(), only count across the active lanes
 #if (ISPC_MASK_BITS == 1)
-    return __popcnt_int64(__movmsk(v & __mask));
+    if (__is_nvptx_target)
+      return __popcnt_int64(__movmsk_ptx(v & __mask));
+    else
+      return __popcnt_int64(__movmsk(v & __mask));
 #else
    return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
 #endif
@@ -1226,6 +1254,11 @@ packed_store_active(uniform int a[], int vals) {
    return __packed_store_active(a, vals, (IntMaskType)__mask);
 }

+static inline uniform int 
+packed_store_active(bool active, uniform int a[], int vals) {
+    return __packed_store_active(a, vals, (IntMaskType)(-(int)active));
+}
+
 static inline uniform int
 packed_store_active2(uniform int a[], int vals) {
    return __packed_store_active2(a, vals, (IntMaskType)__mask);
@@ -1236,6 +1269,9 @@ packed_store_active2(uniform int a[], int vals) {
 // System information

 static inline uniform int num_cores() {
+  if (__is_nvptx_target)
+    return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx)
+  else
    return __num_cores();
 }

@@ -1783,7 +1819,7 @@ static inline void memory_barrier() {
    __memory_barrier();
 }

-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE)                        \
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC)                        \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    return ret;                                                         \
@@ -1794,6 +1830,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
    return ret;                                                         \
 }                                                                       \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
+  if (__is_nvptx_target) {                                            \
+    TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask);      \
+    return ret;                                                         \
+  } else {    \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
    TA ret;                                                             \
@@ -1804,10 +1844,15 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
    return ret;                                                         \
+  } \
 }                                                                       \

-#define DEFINE_ATOMIC_SWAP(TA,TB)                                       \
+#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC)                \
 static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
+  if (__is_nvptx_target) {                                            \
+    TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask);      \
+    return ret;                                                         \
+  } else {    \
    uniform int i = 0;                                                  \
    TA ret[programCount];                                               \
    TA memVal;                                                          \
@@ -1838,6 +1883,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
       originally got back from memory... */                            \
    ret[lastSwap] = memVal;                                             \
    return ret[programIndex];                                           \
+  }\
 }                                                                       \
 static inline uniform TA atomic_swap_global(uniform TA * uniform ptr,   \
                                            uniform TA value) {         \
@@ -1845,6 +1891,10 @@ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr,   \
    return ret;                                                         \
 }                                                                       \
 static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
+  if (__is_nvptx_target) {                                            \
+    TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask);      \
+    return ret;                                                         \
+  } else {    \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
    TA ret;                                                             \
@@ -1855,9 +1905,10 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
    return ret;                                                         \
+  }\
 }                                                                       \

-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB)                          \
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC)                          \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value);                            \
    TA ret;                                                             \
@@ -1872,6 +1923,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
 }                                                                       \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
                                       TA value) {                      \
+  if (__is_nvptx_target) {                                            \
+    TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask);      \
+    return ret;                                                         \
+  } else {    \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
    TA ret;                                                             \
@@ -1882,57 +1937,58 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
    return ret;                                                         \
+  } \
 }

-DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
-DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
-DEFINE_ATOMIC_SWAP(int32,int32)
+DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
+DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_SWAP(unsigned int32,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64)

-DEFINE_ATOMIC_SWAP(float,float)
+DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64)

-DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
-DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
-DEFINE_ATOMIC_SWAP(int64,int64)
+DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
+DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_SWAP(unsigned int64,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64)

-DEFINE_ATOMIC_SWAP(double,double)
+DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64)

 #undef DEFINE_ATOMIC_OP
 #undef DEFINE_ATOMIC_MINMAX_OP
 #undef DEFINE_ATOMIC_SWAP

-#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
+#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC)                           \
 static inline uniform TA atomic_compare_exchange_global(               \
         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
    uniform TA ret =                                                    \
@@ -1947,6 +2003,10 @@ static inline TA atomic_compare_exchange_global(                           \
 } \
 static inline TA atomic_compare_exchange_global(               \
         uniform TA * varying ptr, TA oldval, TA newval) { \
+  if (__is_nvptx_target) {                                            \
+    TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask);      \
+    return ret;                                                         \
+  } else {    \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
    TA ret;                                                             \
@@ -1958,14 +2018,15 @@ static inline TA atomic_compare_exchange_global(               \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
    return ret;                                                         \
+  } \
 }

-ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
-ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
-ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
-ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
-ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
-ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)
+ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64)
+ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64)
+ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64)
+ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64)
+ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64)
+ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64)

 #undef ATOMIC_DECL_CMPXCHG

@@ -2032,12 +2093,20 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value)
 }                                                                      \
 static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) {    \
    TYPE ret;                                                          \
+  if (__is_nvptx_target) {                                            \
+    foreach_active (i) {                                             \
+        uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \
+        ret  = insert(ret, i, *ptr);                                \
+        *ptr = OPFUNC(*ptr, extract(value, i));                \
+    }                                                                  \
+  } else {    \
    uniform TYPE * uniform ptrs[programCount];                         \
    ptrs[programIndex] = p;                                            \
    foreach_active (i) {                                             \
        ret = insert(ret, i, *ptrs[i]);                                \
        *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i));                \
    }                                                                  \
+  } \
    return ret;                                                        \
 }

--- a/stmt.cpp
+++ b/stmt.cpp
@@ -142,6 +142,62 @@ lHasUnsizedArrays(const Type *type) {
        return lHasUnsizedArrays(at->GetElementType());
 }

+static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos, const bool variable = false)
+{
+  if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) 
+    return value;
+  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
+  const int addressSpace = pt->getAddressSpace();
+  if (addressSpace != 3 && addressSpace != 4) 
+    return value;
+
+  llvm::Type *elTy = pt->getElementType();
+
+  /* convert elTy addrspace(3)* to i64* addrspace(3)* */
+  llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace);
+  value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1");
+
+  /* convert i64* addrspace(3) to i64* */
+  llvm::Function *__cvt2gen = m->module->getFunction(
+      addressSpace == 3 ? (variable ? "__cvt_loc2gen_var" : "__cvt_loc2gen") : "__cvt_const2gen");
+
+  std::vector<llvm::Value *> __cvt2gen_args;
+  __cvt2gen_args.push_back(value);
+  value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, variable ? "gep2gen_cvt_var" : "gep2gen_cvt", ctx->GetCurrentBasicBlock());
+
+  /* compute offset */
+  if (addressSpace == 3)
+  {
+    assert(elTy->isArrayTy());
+    const int numElTot = elTy->getArrayNumElements();
+    const int numEl    = numElTot/4;
+#if 0
+    fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl);
+#endif
+    llvm::ArrayType *arrTy = llvm::dyn_cast<llvm::ArrayType>(pt->getArrayElementType());
+    assert(arrTy != NULL);
+    llvm::Type *arrElTy = arrTy->getElementType();
+#if 0
+    if (arrElTy->isArrayTy())
+      Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array");
+#endif
+
+    /* convert i64* to errElTy* */
+    llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0);
+    value  = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2");
+
+    llvm::Function *func_warp_index    = m->module->getFunction("__warp_index");
+    llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector<llvm::Value*>(),  "gep2gen_warp_index");
+    llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset");
+    value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock());
+  }
+
+  /* convert arrElTy* to elTy* */
+  llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0);
+  value  = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3");
+
+  return value;
+}

 void
 DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
@@ -205,7 +261,22 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
            return;
        }

+
        if (sym->storageClass == SC_STATIC) {
+
+            if (g->target->getISA() == Target::NVPTX && !sym->type->IsConstType())
+                PerformanceWarning(sym->pos, 
+                    "Non-constant static variable ""\"%s\" is stored in __global address sace with ""\"nvptx\" target.",
+                    sym->name.c_str());
+            if (g->target->getISA() == Target::NVPTX && sym->type->IsVaryingType())
+                PerformanceWarning(sym->pos, 
+                    "\"const static varying\" variable ""\"%s\" is stored in __global address space with ""\"nvptx\" target.",
+                    sym->name.c_str());
+            if (g->target->getISA() == Target::NVPTX && sym->type->IsUniformType())
+                PerformanceWarning(sym->pos, 
+                    "\"const static uniform\" variable ""\"%s\" is stored in __constant address space with ""\"nvptx\" target.",
+                    sym->name.c_str());
+
            // For static variables, we need a compile-time constant value
            // for its initializer; if there's no initializer, we use a
            // zero value.
@@ -233,19 +304,97 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
            if (cinit == NULL)
                cinit = llvm::Constant::getNullValue(llvmType);

+            int addressSpace = 0;
+            if (g->target->getISA() == Target::NVPTX &&
+                sym->type->IsConstType() &&
+                sym->type->IsUniformType())
+              addressSpace = 4;
+
            // Allocate space for the static variable in global scope, so
            // that it persists across function calls
            sym->storagePtr =
                new llvm::GlobalVariable(*m->module, llvmType,
                                         sym->type->IsConstType(),
                                         llvm::GlobalValue::InternalLinkage, cinit,
-                                         llvm::Twine("static.") +
+                                         llvm::Twine("static_") +
                                         llvm::Twine(sym->pos.first_line) +
-                                         llvm::Twine(".") + sym->name.c_str());
+                                         llvm::Twine("_") + sym->name.c_str(),
+                                         NULL,
+                                         llvm::GlobalVariable::NotThreadLocal,
+                                         addressSpace);
+            sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos);
            // Tell the FunctionEmitContext about the variable
            ctx->EmitVariableDebugInfo(sym);
        }
-        else {
+        else if ((sym->type->IsUniformType() || sym->type->IsSOAType()) &&
+          /* NVPTX:
+           * only non-constant uniform data types are stored in shared memory 
+           * constant uniform are automatically promoted to varying 
+           */
+           !sym->type->IsConstType() &&
+#if 1     
+           sym->type->IsArrayType() &&
+#endif
+           g->target->getISA() == Target::NVPTX)
+          {
+              PerformanceWarning(sym->pos,
+                  "Non-constant \"uniform\" data types might be slow with \"nvptx\" target. "
+                  "Unless data sharing between program instances is desired, try \"const [static] uniform\", \"varying\" or \"uniform new uniform \"+\"delete\" if possible.");
+
+              /* with __shared__ memory everything must be an array */
+              int nel = 4;
+              ArrayType *nat;
+              bool variable = true;
+              if (sym->type->IsArrayType())
+              {
+                const ArrayType *at = CastType<ArrayType>(sym->type);
+                /* we must scale # elements by 4, because a thread-block will run 4 warps
+                 * or 128 threads.
+                 * ***note-to-me***:please define these value (128threads/4warps)
+                 * in nvptx-target definition
+                 * instead of compile-time constants 
+                 */
+                nel *= at->GetElementCount();
+                if (sym->type->IsSOAType())
+                  nel *= sym->type->GetSOAWidth();
+                nat = new ArrayType(at->GetElementType(), nel);
+                variable = false;
+              }
+              else
+                nat = new ArrayType(sym->type, nel);
+
+              llvm::Type *llvmTypeUn = nat->LLVMType(g->ctx);
+              llvm::Constant *cinit = llvm::UndefValue::get(llvmTypeUn);
+
+              sym->storagePtr =
+                new llvm::GlobalVariable(*m->module, llvmTypeUn,
+                    sym->type->IsConstType(),
+                    llvm::GlobalValue::InternalLinkage, 
+                    cinit,
+                    llvm::Twine("local_") + 
+                    llvm::Twine(sym->pos.first_line) +
+                    llvm::Twine("_") + sym->name.c_str(),
+                    NULL,
+                    llvm::GlobalVariable::NotThreadLocal,
+                    /*AddressSpace=*/3);
+              sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos, variable);
+              llvm::PointerType *ptrTy = llvm::PointerType::get(sym->type->LLVMType(g->ctx),0);
+              sym->storagePtr = ctx->BitCastInst(sym->storagePtr, ptrTy, "uniform_decl");
+
+              // Tell the FunctionEmitContext about the variable; must do
+              // this before the initializer stuff.
+              ctx->EmitVariableDebugInfo(sym);
+
+              if (initExpr == 0 && sym->type->IsConstType())
+                Error(sym->pos, "Missing initializer for const variable "
+                    "\"%s\".", sym->name.c_str());
+
+              // And then get it initialized...
+              sym->parentFunction = ctx->GetFunction();
+              InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
+          }
+          else
+          {
            // For non-static variables, allocate storage on the stack
            sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str());

@@ -253,10 +402,14 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
            // this before the initializer stuff.
            ctx->EmitVariableDebugInfo(sym);

+            if (initExpr == 0 && sym->type->IsConstType())
+              Error(sym->pos, "Missing initializer for const variable "
+                  "\"%s\".", sym->name.c_str());
+
            // And then get it initialized...
            sym->parentFunction = ctx->GetFunction();
            InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
-        }
+          }
    }
 }

@@ -415,6 +568,19 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
    if (testValue == NULL)
        return;

+
+#if 0
+    if (!isUniform && g->target->getISA() == Target::NVPTX)
+    {
+      /* With "nvptx" target, SIMT hardware takes care of non-uniform 
+       * control flow. We trick ISPC to generate uniform control flow.
+       */
+      testValue = ctx->ExtractInst(testValue, 0);
+      isUniform = true;
+    }
+#endif
+
+
    if (isUniform) {
        ctx->StartUniformIf();
        if (doAllCheck)
@@ -696,7 +862,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,

    // Do any of the program instances want to run the 'true'
    // block?  If not, jump ahead to bNext.
+#if 1
    llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask());
+#else
+    llvm::Value *maskAnyTrueQ = ctx->ExtractInst(ctx->GetFullMask(),0);
+#endif
    ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ);

    // Emit statements for true
@@ -713,7 +883,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,

    // Similarly, check to see if any of the instances want to
    // run the 'false' block...
+#if 1
    llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask());
+#else
+    llvm::Value *maskAnyFalseQ = ctx->ExtractInst(ctx->GetFullMask(),0);
+#endif
    ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ);

    // Emit code for false
@@ -1273,7 +1447,10 @@ static llvm::Value *
 lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
                      llvm::Value *uniformCounterPtr,
                      llvm::Value *varyingCounterPtr,
-                      const std::vector<int> &spans) {
+                      const std::vector<int> &spans) 
+{
+  if (g->target->getISA() != Target::NVPTX)
+  {
    // Smear the uniform counter value out to be varying
    llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
    llvm::Value *smearCounter = ctx->BroadcastValue(
@@ -1306,6 +1483,93 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
                            LLVMInt32Vector(delta), "iter_val");
    ctx->StoreInst(varyingCounter, varyingCounterPtr);
    return varyingCounter;
+  }
+  else /* NVPTX == true */
+  {
+    // Smear the uniform counter value out to be varying
+    llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
+    llvm::Value *smearCounter = ctx->BroadcastValue(
+        counter, LLVMTypes::Int32VectorType, "smear_counter");
+
+    // Figure out the offsets; this is a little bit tricky.  As an example,
+    // consider a 2D tiled foreach loop, where we're running 8-wide and
+    // where the inner dimension has a stride of 4 and the outer dimension
+    // has a stride of 2.  For the inner dimension, we want the offsets
+    // (0,1,2,3,0,1,2,3), and for the outer dimension we want
+    // (0,0,0,0,1,1,1,1).
+    int32_t delta[ISPC_MAX_NVEC];
+    const int vecWidth = 32; 
+    std::vector<llvm::Constant*> constDeltaList;
+    for (int i = 0; i < vecWidth; ++i) 
+    {
+      int d = i;
+      // First, account for the effect of any dimensions at deeper
+      // nesting levels than the current one.
+      int prevDimSpanCount = 1;
+      for (int j = dim; j < nDims-1; ++j)
+        prevDimSpanCount *= spans[j+1];
+      d /= prevDimSpanCount;
+
+      // And now with what's left, figure out our own offset
+      delta[i] = d % spans[dim];
+      constDeltaList.push_back(LLVMInt8(delta[i]));
+    }
+
+    llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32);
+//    llvm::PointerType::get(ArrayDelta, 4); /* constant memory */
+
+
+    llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable(
+        /*Module=*/*m->module,
+        /*Type=*/ArrayDelta,
+        /*isConstant=*/true,
+        /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
+        /*Initializer=*/0, // has initializer, specified below
+        /*Name=*/"constDeltaForeach");
+#if 0
+        /*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal,
+        /*unsigned AddressSpace=*/4 /*constant*/);
+#endif
+
+
+    llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList);
+
+    globalDelta->setInitializer(constDelta);
+    llvm::Function *func_program_index = m->module->getFunction("__program_index");
+    llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__programIndex");
+
+    std::vector<llvm::Value*> ptr_arrayidx_indices;
+    ptr_arrayidx_indices.push_back(LLVMInt32(0));
+    ptr_arrayidx_indices.push_back(laneIdx);
+#if 1
+    llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock());
+    llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock());
+    llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type);
+
+    llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1);
+    llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2);
+
+    llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create(
+//        llvm::UndefValue(LLVMInt32Vector),
+        const_packed_41,
+        int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock());
+#endif
+
+
+    // Add the deltas to compute the varying counter values; store the
+    // result to memory and then return it directly as well.
+#if 0
+    llvm::Value *varyingCounter =
+        ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
+                            LLVMInt32Vector(delta), "iter_val");
+#else
+    llvm::Value *varyingCounter =
+        ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
+                            packed_43, "iter_val");
+#endif
+    ctx->StoreInst(varyingCounter, varyingCounterPtr);
+    return varyingCounter;
+  }
 }


@@ -1383,7 +1647,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {

    // This should be caught during typechecking
    AssertPos(pos, startExprs.size() == dimVariables.size() &&
-              endExprs.size() == dimVariables.size());
+        endExprs.size() == dimVariables.size());
    int nDims = (int)dimVariables.size();

    ///////////////////////////////////////////////////////////////////////
@@ -1394,64 +1658,66 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;

    std::vector<int> span(nDims, 0);
-    lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]);
+    const int vectorWidth = 
+      g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth();
+    lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]);

    for (int i = 0; i < nDims; ++i) {
-        // Basic blocks that we'll fill in later with the looping logic for
-        // this dimension.
-        bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
-        if (i < nDims-1)
-            // stepping for the innermost dimension is handled specially
-            bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
-        bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
+      // Basic blocks that we'll fill in later with the looping logic for
+      // this dimension.
+      bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
+      if (i < nDims-1)
+        // stepping for the innermost dimension is handled specially
+        bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
+      bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));

-        // Start and end value for this loop dimension
-        llvm::Value *sv = startExprs[i]->GetValue(ctx);
-        llvm::Value *ev = endExprs[i]->GetValue(ctx);
-        if (sv == NULL || ev == NULL)
-            return;
-        startVals.push_back(sv);
-        endVals.push_back(ev);
+      // Start and end value for this loop dimension
+      llvm::Value *sv = startExprs[i]->GetValue(ctx);
+      llvm::Value *ev = endExprs[i]->GetValue(ctx);
+      if (sv == NULL || ev == NULL)
+        return;
+      startVals.push_back(sv);
+      endVals.push_back(ev);

-        // nItems = endVal - startVal
-        llvm::Value *nItems =
-            ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");
+      // nItems = endVal - startVal
+      llvm::Value *nItems =
+        ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");

-        // nExtras = nItems % (span for this dimension)
-        // This gives us the number of extra elements we need to deal with
-        // at the end of the loop for this dimension that don't fit cleanly
-        // into a vector width.
-        nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
-                                              LLVMInt32(span[i]), "nextras"));
+      // nExtras = nItems % (span for this dimension)
+      // This gives us the number of extra elements we need to deal with
+      // at the end of the loop for this dimension that don't fit cleanly
+      // into a vector width.
+      nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
+            LLVMInt32(span[i]), "nextras"));

-        // alignedEnd = endVal - nExtras
-        alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev,
-                                                 nExtras[i], "aligned_end"));
+      // alignedEnd = endVal - nExtras
+      alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev,
+            nExtras[i], "aligned_end"));

-        ///////////////////////////////////////////////////////////////////////
-        // Each dimension has a loop counter that is a uniform value that
-        // goes from startVal to endVal, in steps of the span for this
-        // dimension.  Its value is only used internally here for looping
-        // logic and isn't directly available in the user's program code.
-        uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type,
-                                                     "counter"));
-        ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
+      ///////////////////////////////////////////////////////////////////////
+      // Each dimension has a loop counter that is a uniform value that
+      // goes from startVal to endVal, in steps of the span for this
+      // dimension.  Its value is only used internally here for looping
+      // logic and isn't directly available in the user's program code.
+      uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type,
+            "counter"));
+      ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);

-        // There is also a varying variable that holds the set of index
-        // values for each dimension in the current loop iteration; this is
-        // the value that is program-visible.
-        dimVariables[i]->storagePtr =
-            ctx->AllocaInst(LLVMTypes::Int32VectorType,
-                            dimVariables[i]->name.c_str());
-        dimVariables[i]->parentFunction = ctx->GetFunction();
-        ctx->EmitVariableDebugInfo(dimVariables[i]);
+      // There is also a varying variable that holds the set of index
+      // values for each dimension in the current loop iteration; this is
+      // the value that is program-visible.
+      dimVariables[i]->storagePtr =
+        ctx->AllocaInst(LLVMTypes::Int32VectorType,
+            dimVariables[i]->name.c_str());
+      dimVariables[i]->parentFunction = ctx->GetFunction();
+      ctx->EmitVariableDebugInfo(dimVariables[i]);

-        // Each dimension also maintains a mask that represents which of
-        // the varying elements in the current iteration should be
-        // processed.  (i.e. this is used to disable the lanes that have
-        // out-of-bounds offsets.)
-        extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask"));
-        ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
+      // Each dimension also maintains a mask that represents which of
+      // the varying elements in the current iteration should be
+      // processed.  (i.e. this is used to disable the lanes that have
+      // out-of-bounds offsets.)
+      extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask"));
+      ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
    }

    ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR);
@@ -1464,14 +1730,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    // a given dimension in preparation for running through its loop again,
    // after the enclosing level advances its counter.
    for (int i = 0; i < nDims; ++i) {
-        ctx->SetCurrentBasicBlock(bbReset[i]);
-        if (i == 0)
-            ctx->BranchInst(bbExit);
-        else {
-            ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
-            ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
-            ctx->BranchInst(bbStep[i-1]);
-        }
+      ctx->SetCurrentBasicBlock(bbReset[i]);
+      if (i == 0)
+        ctx->BranchInst(bbExit);
+      else {
+        ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
+        ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
+        ctx->BranchInst(bbStep[i-1]);
+      }
    }

    ///////////////////////////////////////////////////////////////////////////
@@ -1481,67 +1747,67 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    // this for the innermost dimension, which has a more complex stepping
    // structure..
    for (int i = 0; i < nDims-1; ++i) {
-        ctx->SetCurrentBasicBlock(bbStep[i]);
-        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
-        llvm::Value *newCounter =
-            ctx->BinaryOperator(llvm::Instruction::Add, counter,
-                                LLVMInt32(span[i]), "new_counter");
-        ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
-        ctx->BranchInst(bbTest[i]);
+      ctx->SetCurrentBasicBlock(bbStep[i]);
+      llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
+      llvm::Value *newCounter =
+        ctx->BinaryOperator(llvm::Instruction::Add, counter,
+            LLVMInt32(span[i]), "new_counter");
+      ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
+      ctx->BranchInst(bbTest[i]);
    }

    ///////////////////////////////////////////////////////////////////////////
    // foreach_test (for all dimensions other than the innermost...)
    std::vector<llvm::Value *> inExtras;
    for (int i = 0; i < nDims-1; ++i) {
-        ctx->SetCurrentBasicBlock(bbTest[i]);
+      ctx->SetCurrentBasicBlock(bbTest[i]);

-        llvm::Value *haveExtras =
-            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT,
-                         endVals[i], alignedEnd[i], "have_extras");
+      llvm::Value *haveExtras =
+        ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT,
+            endVals[i], alignedEnd[i], "have_extras");

-        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter");
-        llvm::Value *atAlignedEnd =
-            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
-                         counter, alignedEnd[i], "at_aligned_end");
-        llvm::Value *inEx =
-            ctx->BinaryOperator(llvm::Instruction::And, haveExtras,
-                                atAlignedEnd, "in_extras");
+      llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter");
+      llvm::Value *atAlignedEnd =
+        ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
+            counter, alignedEnd[i], "at_aligned_end");
+      llvm::Value *inEx =
+        ctx->BinaryOperator(llvm::Instruction::And, haveExtras,
+            atAlignedEnd, "in_extras");

-        if (i == 0)
-            inExtras.push_back(inEx);
-        else
-            inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx,
-                                                   inExtras[i-1], "in_extras_all"));
+      if (i == 0)
+        inExtras.push_back(inEx);
+      else
+        inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx,
+              inExtras[i-1], "in_extras_all"));

-        llvm::Value *varyingCounter =
-            lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i],
-                                  dimVariables[i]->storagePtr, span);
+      llvm::Value *varyingCounter =
+        lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i],
+            dimVariables[i]->storagePtr, span);

-        llvm::Value *smearEnd = ctx->BroadcastValue(
-            endVals[i], LLVMTypes::Int32VectorType, "smear_end");
+      llvm::Value *smearEnd = ctx->BroadcastValue(
+          endVals[i], LLVMTypes::Int32VectorType, "smear_end");

-        // Do a vector compare of its value to the end value to generate a
-        // mask for this last bit of work.
-        llvm::Value *emask =
-            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
-                         varyingCounter, smearEnd);
-        emask = ctx->I1VecToBoolVec(emask);
+      // Do a vector compare of its value to the end value to generate a
+      // mask for this last bit of work.
+      llvm::Value *emask =
+        ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+            varyingCounter, smearEnd);
+      emask = ctx->I1VecToBoolVec(emask);

-        if (i == 0)
-            ctx->StoreInst(emask, extrasMaskPtrs[i]);
-        else {
-            llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
-            llvm::Value *newMask =
-                ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
-                                    "extras_mask");
-            ctx->StoreInst(newMask, extrasMaskPtrs[i]);
-        }
+      if (i == 0)
+        ctx->StoreInst(emask, extrasMaskPtrs[i]);
+      else {
+        llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
+        llvm::Value *newMask =
+          ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
+              "extras_mask");
+        ctx->StoreInst(newMask, extrasMaskPtrs[i]);
+      }

-        llvm::Value *notAtEnd =
-            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
-                         counter, endVals[i]);
-        ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
+      llvm::Value *notAtEnd =
+        ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+            counter, endVals[i]);
+      ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
    }

    ///////////////////////////////////////////////////////////////////////////
@@ -1578,18 +1844,18 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    // (i.e. processing extra elements that don't exactly fit into a
    // vector).
    llvm::BasicBlock *bbOuterInExtras =
-        ctx->CreateBasicBlock("outer_in_extras");
+      ctx->CreateBasicBlock("outer_in_extras");
    llvm::BasicBlock *bbOuterNotInExtras =
-        ctx->CreateBasicBlock("outer_not_in_extras");
+      ctx->CreateBasicBlock("outer_not_in_extras");

    ctx->SetCurrentBasicBlock(bbTest[nDims-1]);
    if (inExtras.size())
-        ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
-                        inExtras.back());
+      ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
+          inExtras.back());
    else
-        // for a 1D iteration domain, we certainly don't have any enclosing
-        // dimensions that are processing extra elements.
-        ctx->BranchInst(bbOuterNotInExtras);
+      // for a 1D iteration domain, we certainly don't have any enclosing
+      // dimensions that are processing extra elements.
+      ctx->BranchInst(bbOuterNotInExtras);

    ///////////////////////////////////////////////////////////////////////////
    // One or more outer dimensions in extras, so we need to mask for the loop
@@ -1604,21 +1870,21 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    //   // run loop body with mask
    // }
    llvm::BasicBlock *bbAllInnerPartialOuter =
-        ctx->CreateBasicBlock("all_inner_partial_outer");
+      ctx->CreateBasicBlock("all_inner_partial_outer");
    llvm::BasicBlock *bbPartial =
-        ctx->CreateBasicBlock("both_partial");
+      ctx->CreateBasicBlock("both_partial");
    ctx->SetCurrentBasicBlock(bbOuterInExtras); {
-        // Update the varying counter value here, since all subsequent
-        // blocks along this path need it.
-        lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
-                              dimVariables[nDims-1]->storagePtr, span);
+      // Update the varying counter value here, since all subsequent
+      // blocks along this path need it.
+      lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
+          dimVariables[nDims-1]->storagePtr, span);

-        // here we just check to see if counter < alignedEnd
-        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
-        llvm::Value *beforeAlignedEnd =
-            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
-                         counter, alignedEnd[nDims-1], "before_aligned_end");
-        ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
+      // here we just check to see if counter < alignedEnd
+      llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
+      llvm::Value *beforeAlignedEnd =
+        ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+            counter, alignedEnd[nDims-1], "before_aligned_end");
+      ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
    }

    // Below we have a basic block that runs the loop body code for the
@@ -1637,53 +1903,53 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    // should step the loop counter for the next enclosing dimension
    // instead.
    llvm::Value *stepIndexAfterMaskedBodyPtr =
-        ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
+      ctx->AllocaInst(LLVMTypes::BoolType, "step_index");

    ///////////////////////////////////////////////////////////////////////////
    // We're in the inner loop part where the only masking is due to outer
    // dimensions but the innermost dimension fits fully into a vector's
    // width.  Set the mask and jump to the masked loop body.
    ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); {
-        llvm::Value *mask;
-        if (nDims == 1)
-            // 1D loop; we shouldn't ever get here anyway
-            mask = LLVMMaskAllOff;
-        else
-            mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
+      llvm::Value *mask;
+      if (nDims == 1)
+        // 1D loop; we shouldn't ever get here anyway
+        mask = LLVMMaskAllOff;
+      else
+        mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);

-        ctx->SetInternalMask(mask);
+      ctx->SetInternalMask(mask);

-        ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
-        ctx->BranchInst(bbMaskedBody);
+      ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
+      ctx->BranchInst(bbMaskedBody);
    }

    ///////////////////////////////////////////////////////////////////////////
    // We need to include the effect of the innermost dimension in the mask
    // for the final bits here
    ctx->SetCurrentBasicBlock(bbPartial); {
-        llvm::Value *varyingCounter =
-            ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
-        llvm::Value *smearEnd = ctx->BroadcastValue(
-            endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
+      llvm::Value *varyingCounter =
+        ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
+      llvm::Value *smearEnd = ctx->BroadcastValue(
+          endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");

-        llvm::Value *emask =
-            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
-                         varyingCounter, smearEnd);
-        emask = ctx->I1VecToBoolVec(emask);
+      llvm::Value *emask =
+        ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+            varyingCounter, smearEnd);
+      emask = ctx->I1VecToBoolVec(emask);

-        if (nDims == 1) {
-            ctx->SetInternalMask(emask);
-        }
-        else {
-            llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
-            llvm::Value *newMask =
-                ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
-                                    "extras_mask");
-            ctx->SetInternalMask(newMask);
-        }
+      if (nDims == 1) {
+        ctx->SetInternalMask(emask);
+      }
+      else {
+        llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
+        llvm::Value *newMask =
+          ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
+              "extras_mask");
+        ctx->SetInternalMask(newMask);
+      }

-        ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
-        ctx->BranchInst(bbMaskedBody);
+      ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
+      ctx->BranchInst(bbMaskedBody);
    }

    ///////////////////////////////////////////////////////////////////////////
@@ -1699,14 +1965,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    //   // run loop body with mask
    // }
    llvm::BasicBlock *bbPartialInnerAllOuter =
-        ctx->CreateBasicBlock("partial_inner_all_outer");
+      ctx->CreateBasicBlock("partial_inner_all_outer");
    ctx->SetCurrentBasicBlock(bbOuterNotInExtras); {
-        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
-        llvm::Value *beforeAlignedEnd =
-            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
-                         counter, alignedEnd[nDims-1], "before_aligned_end");
-        ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
-                        beforeAlignedEnd);
+      llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
+      llvm::Value *beforeAlignedEnd =
+        ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+            counter, alignedEnd[nDims-1], "before_aligned_end");
+      ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
+          beforeAlignedEnd);
    }

    ///////////////////////////////////////////////////////////////////////////
@@ -1716,26 +1982,26 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    // value of the varying loop counter and have the statements in the
    // loop body emit their code.
    llvm::BasicBlock *bbFullBodyContinue =
-        ctx->CreateBasicBlock("foreach_full_continue");
+      ctx->CreateBasicBlock("foreach_full_continue");
    ctx->SetCurrentBasicBlock(bbFullBody); {
-        ctx->SetInternalMask(LLVMMaskAllOn);
-        ctx->SetBlockEntryMask(LLVMMaskAllOn);
-        lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
-                              dimVariables[nDims-1]->storagePtr, span);
-        ctx->SetContinueTarget(bbFullBodyContinue);
-        ctx->AddInstrumentationPoint("foreach loop body (all on)");
-        stmts->EmitCode(ctx);
-        AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
-        ctx->BranchInst(bbFullBodyContinue);
+      ctx->SetInternalMask(LLVMMaskAllOn);
+      ctx->SetBlockEntryMask(LLVMMaskAllOn);
+      lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
+          dimVariables[nDims-1]->storagePtr, span);
+      ctx->SetContinueTarget(bbFullBodyContinue);
+      ctx->AddInstrumentationPoint("foreach loop body (all on)");
+      stmts->EmitCode(ctx);
+      AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
+      ctx->BranchInst(bbFullBodyContinue);
    }
    ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
-        ctx->RestoreContinuedLanes();
-        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
-        llvm::Value *newCounter =
-            ctx->BinaryOperator(llvm::Instruction::Add, counter,
-                                LLVMInt32(span[nDims-1]), "new_counter");
-        ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
-        ctx->BranchInst(bbOuterNotInExtras);
+      ctx->RestoreContinuedLanes();
+      llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
+      llvm::Value *newCounter =
+        ctx->BinaryOperator(llvm::Instruction::Add, counter,
+            LLVMInt32(span[nDims-1]), "new_counter");
+      ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
+      ctx->BranchInst(bbOuterNotInExtras);
    }

    ///////////////////////////////////////////////////////////////////////////
@@ -1743,33 +2009,33 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    // less than the end value, in which case we need to run the body one
    // more time to get the extra bits.
    llvm::BasicBlock *bbSetInnerMask =
-        ctx->CreateBasicBlock("partial_inner_only");
+      ctx->CreateBasicBlock("partial_inner_only");
    ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); {
-        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
-        llvm::Value *beforeFullEnd =
-            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
-                         counter, endVals[nDims-1], "before_full_end");
-        ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
+      llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
+      llvm::Value *beforeFullEnd =
+        ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+            counter, endVals[nDims-1], "before_full_end");
+      ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
    }

    ///////////////////////////////////////////////////////////////////////////
    // The outer dimensions are all on, so the mask is just given by the
    // mask for the innermost dimension
    ctx->SetCurrentBasicBlock(bbSetInnerMask); {
-        llvm::Value *varyingCounter =
-            lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
-                                  dimVariables[nDims-1]->storagePtr, span);
-        llvm::Value *smearEnd = ctx->BroadcastValue(
-            endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
-        llvm::Value *emask =
-            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
-                         varyingCounter, smearEnd);
-        emask = ctx->I1VecToBoolVec(emask);
-        ctx->SetInternalMask(emask);
-        ctx->SetBlockEntryMask(emask);
+      llvm::Value *varyingCounter =
+        lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
+            dimVariables[nDims-1]->storagePtr, span);
+      llvm::Value *smearEnd = ctx->BroadcastValue(
+          endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
+      llvm::Value *emask =
+        ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+            varyingCounter, smearEnd);
+      emask = ctx->I1VecToBoolVec(emask);
+      ctx->SetInternalMask(emask);
+      ctx->SetBlockEntryMask(emask);

-        ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
-        ctx->BranchInst(bbMaskedBody);
+      ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
+      ctx->BranchInst(bbMaskedBody);
    }

    ///////////////////////////////////////////////////////////////////////////
@@ -1779,34 +2045,34 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    // mask known to be all-on, which in turn leads to more efficient code
    // for that case.
    llvm::BasicBlock *bbStepInnerIndex =
-        ctx->CreateBasicBlock("step_inner_index");
+      ctx->CreateBasicBlock("step_inner_index");
    llvm::BasicBlock *bbMaskedBodyContinue =
-        ctx->CreateBasicBlock("foreach_masked_continue");
+      ctx->CreateBasicBlock("foreach_masked_continue");
    ctx->SetCurrentBasicBlock(bbMaskedBody); {
-        ctx->AddInstrumentationPoint("foreach loop body (masked)");
-        ctx->SetContinueTarget(bbMaskedBodyContinue);
-        ctx->DisableGatherScatterWarnings();
-        ctx->SetBlockEntryMask(ctx->GetFullMask());
-        stmts->EmitCode(ctx);
-        ctx->EnableGatherScatterWarnings();
-        ctx->BranchInst(bbMaskedBodyContinue);
+      ctx->AddInstrumentationPoint("foreach loop body (masked)");
+      ctx->SetContinueTarget(bbMaskedBodyContinue);
+      ctx->DisableGatherScatterWarnings();
+      ctx->SetBlockEntryMask(ctx->GetFullMask());
+      stmts->EmitCode(ctx);
+      ctx->EnableGatherScatterWarnings();
+      ctx->BranchInst(bbMaskedBodyContinue);
    }
    ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); {
-        ctx->RestoreContinuedLanes();
-        llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
-        ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
+      ctx->RestoreContinuedLanes();
+      llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
+      ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
    }

    ///////////////////////////////////////////////////////////////////////////
    // step the innermost index, for the case where we're doing the
    // innermost for loop over full vectors.
    ctx->SetCurrentBasicBlock(bbStepInnerIndex); {
-        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
-        llvm::Value *newCounter =
-            ctx->BinaryOperator(llvm::Instruction::Add, counter,
-                                LLVMInt32(span[nDims-1]), "new_counter");
-        ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
-        ctx->BranchInst(bbOuterInExtras);
+      llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
+      llvm::Value *newCounter =
+        ctx->BinaryOperator(llvm::Instruction::Add, counter,
+            LLVMInt32(span[nDims-1]), "new_counter");
+      ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
+      ctx->BranchInst(bbOuterInExtras);
    }

    ///////////////////////////////////////////////////////////////////////////
@@ -1993,7 +2259,8 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
        // math...)

        // Get the "program index" vector value
-        llvm::Value *programIndex = ctx->ProgramIndexVector();
+        llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ?
+          ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector();

        // And smear the current lane out to a vector
        llvm::Value *firstSet32 =
@@ -2189,10 +2456,19 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const {

        // And load the corresponding element value from the temporary
        // memory storing the value of the varying expr.
-        llvm::Value *uniqueValuePtr =
+        llvm::Value *uniqueValue;
+        if (g->target->getISA() != Target::NVPTX)
+        {
+          llvm::Value *uniqueValuePtr =
            ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType,
-                                   "unique_index_ptr");
-        llvm::Value *uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
+                "unique_index_ptr");
+          uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
+        }
+        else  /* in case of PTX target, use __shfl PTX intrinsics via __insert/__extract function */
+        {
+          llvm::Value *firstSet32 = ctx->TruncInst(firstSet, LLVMTypes::Int32Type);
+          uniqueValue = ctx->Extract(exprValue, firstSet32);
+        }

        // If it's a varying pointer type, need to convert from the int
        // type we store in the vector to the actual pointer type
@@ -3100,7 +3376,8 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
    }

    // Now we can emit code to call __do_print()
-    llvm::Function *printFunc = m->module->getFunction("__do_print");
+    llvm::Function *printFunc = g->target->getISA() != Target::NVPTX ?
+      m->module->getFunction("__do_print") : m->module->getFunction("__do_print_nvptx");
    AssertPos(pos, printFunc);

    llvm::Value *mask = ctx->GetFullMask();
--- a/test_static_cuda.cpp
+++ b/test_static_cuda.cpp
@@ -0,0 +1,440 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_WINDOWS
+#include <windows.h>
+#endif // ISPC_IS_WINDOWS
+
+#include <cassert>
+#include <cstring>
+#include <cstdio>
+#include <cstdint>
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif
+
+/******************************/
+
+#include <cassert>
+#include <iostream>
+#include <cuda.h>
+#include "drvapi_error_string.h"
+#include "ispc_malloc.h"
+
+#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)
+// These are the inline versions for all of the SDK helper functions
+void __checkCudaErrors(CUresult err, const char *file, const int line) {
+  if(CUDA_SUCCESS != err) {
+    std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
+           << getCudaDrvErrorString(err) << "\" from file <" << file
+           << ", line " << line << "\n";
+    exit(-1);
+  }
+}
+
+
+/******************************/
+/****  Basic CUDriver API  ****/
+/******************************/
+
+CUcontext context;
+
+static void createContext(const int deviceId = 0, const bool verbose = true)
+{
+  CUdevice device;
+  int devCount;
+  checkCudaErrors(cuInit(0));
+  checkCudaErrors(cuDeviceGetCount(&devCount));
+  assert(devCount > 0);
+  checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
+
+  char name[128];
+  checkCudaErrors(cuDeviceGetName(name, 128, device));
+  if (verbose)
+    std::cout << "Using CUDA Device [0]: " << name << "\n";
+
+  int devMajor, devMinor;
+  checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
+  if (verbose)
+    std::cout << "Device Compute Capability: " 
+      << devMajor << "." << devMinor << "\n";
+  if (devMajor < 2) {
+    if (verbose)
+      std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
+    exit(1); 
+  }
+
+  // Create driver context
+  checkCudaErrors(cuCtxCreate(&context, 0, device));
+}
+static void destroyContext()
+{
+  checkCudaErrors(cuCtxDestroy(context));
+}
+
+static CUmodule loadModule(
+    const char * module,
+    const int maxrregcount = 64,
+    const char cudadevrt_lib[] = "libcudadevrt.a",
+    const size_t log_size = 32768,
+    const bool print_log = true
+    )
+{
+  CUmodule cudaModule;
+  // in this branch we use compilation with parameters
+
+  CUlinkState  CUState;
+  CUlinkState *lState = &CUState;
+  const int nOptions = 8;
+  CUjit_option options[nOptions];
+  void* optionVals[nOptions];
+  float walltime;
+  size_t logSize = log_size;
+  char error_log[logSize],
+       info_log[logSize];
+  void *cuOut;
+  size_t outSize;
+  int myErr = 0;
+
+  // Setup linker options
+  // Return walltime from JIT compilation
+  options[0] = CU_JIT_WALL_TIME;
+  optionVals[0] = (void*) &walltime;
+  // Pass a buffer for info messages
+  options[1] = CU_JIT_INFO_LOG_BUFFER;
+  optionVals[1] = (void*) info_log;
+  // Pass the size of the info buffer
+  options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+  optionVals[2] = (void*) logSize;
+  // Pass a buffer for error message
+  options[3] = CU_JIT_ERROR_LOG_BUFFER;
+  optionVals[3] = (void*) error_log;
+  // Pass the size of the error buffer
+  options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+  optionVals[4] = (void*) logSize;
+  // Make the linker verbose
+  options[5] = CU_JIT_LOG_VERBOSE;
+  optionVals[5] = (void*) 1;
+  // Max # of registers/pthread
+  options[6] = CU_JIT_MAX_REGISTERS;
+  int jitRegCount = maxrregcount;
+  optionVals[6] = (void *)(size_t)jitRegCount;
+  // Caching
+  options[7] = CU_JIT_CACHE_MODE;
+  optionVals[7] = (void *)CU_JIT_CACHE_OPTION_CA;
+  // Create a pending linker invocation
+
+  // Create a pending linker invocation
+  checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
+
+#if 0
+  if (sizeof(void *)==4)
+  {
+    // Load the PTX from the string myPtx32
+    printf("Loading myPtx32[] program\n");
+    // PTX May also be loaded from file, as per below.
+    myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
+  }
+  else
+#endif
+  {
+    // Load the PTX from the string myPtx (64-bit)
+    if (print_log)
+      fprintf(stderr, "Loading ptx..\n");
+    myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
+    myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0); 
+    // PTX May also be loaded from file, as per below.
+    // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
+  }
+
+  // Complete the linker step
+  myErr = cuLinkComplete(*lState, &cuOut, &outSize);
+
+  if ( myErr != CUDA_SUCCESS )
+  {
+    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. 
+    fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
+    assert(0);
+  }    
+
+  // Linker walltime and info_log were requested in options above.
+ if (print_log)
+   fprintf(stderr, "CUDA Link Completed in %fms. Linker Output:\n%s\n",walltime,info_log);
+
+ // Load resulting cuBin into module
+ checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
+
+ // Destroy the linker invocation
+ checkCudaErrors(cuLinkDestroy(*lState));
+ return cudaModule;
+}
+static void unloadModule(CUmodule &cudaModule)
+{
+  checkCudaErrors(cuModuleUnload(cudaModule));
+}
+
+static CUfunction getFunction(CUmodule &cudaModule, const char * function)
+{
+  CUfunction cudaFunction;
+  checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
+  return cudaFunction;
+}
+
+static CUdeviceptr deviceMalloc(const size_t size)
+{
+  CUdeviceptr d_buf;
+  checkCudaErrors(cuMemAlloc(&d_buf, size));
+  return d_buf;
+}
+static void deviceFree(CUdeviceptr d_buf)
+{
+  checkCudaErrors(cuMemFree(d_buf));
+}
+static void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
+{
+  checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
+}
+static void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
+{
+  checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
+}
+#define deviceLaunch(func,params) \
+  checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
+checkCudaErrors( \
+    cuLaunchKernel( \
+      (func), \
+      1,1,1, \
+      32, 1, 1, \
+      0, NULL, (params), NULL \
+      ));
+
+typedef CUdeviceptr devicePtr;
+
+
+/**************/
+#include <vector>
+static std::vector<char> readBinary(const char * filename, const bool print_size = false)
+{
+  std::vector<char> buffer;
+  FILE *fp = fopen(filename, "rb");
+  if (!fp )
+  {
+    fprintf(stderr, "file %s not found\n", filename);
+    assert(0);
+  }
+  fseek(fp, 0, SEEK_END); 
+  const unsigned long long size = ftell(fp);         /*calc the size needed*/
+  fseek(fp, 0, SEEK_SET); 
+  buffer.resize(size);
+
+  if (fp == NULL){ /*ERROR detection if file == empty*/
+    fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);           
+    exit(1);
+  }
+  else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
+    fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
+    exit(1);
+  }
+  if (print_size)
+    fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
+  return buffer;
+}
+
+static double CUDALaunch(
+    void **handlePtr, 
+    const char * func_name,
+    void **func_args,
+    const bool print_log = true,
+    const int maxrregcount = 64,
+    const char kernel_file[] = "__kernels.ptx",
+    const char cudadevrt_lib[] = "libcudadevrt.a",
+    const int log_size = 32768)
+{
+  fprintf(stderr, " launching kernel: %s \n", func_name);
+  const std::vector<char> module_str = readBinary(kernel_file, print_log);
+  const char *  module = &module_str[0];
+  CUmodule   cudaModule   = loadModule(module, maxrregcount, cudadevrt_lib, log_size, print_log);
+  CUfunction cudaFunction = getFunction(cudaModule, func_name);
+  deviceLaunch(cudaFunction, func_args);
+  checkCudaErrors(cuStreamSynchronize(0));
+  unloadModule(cudaModule);
+  return 0.0;
+}
+/******************************/
+
+
+extern "C" {
+//    extern int width();
+    int width() { return 32; }
+    extern void f_v(float *result);
+    extern void f_f(float *result, float *a);
+    extern void f_fu(float *result, float *a, float b);
+    extern void f_fi(float *result, float *a, int *b);
+    extern void f_du(float *result, double *a, double b);
+    extern void f_duf(float *result, double *a, float b);
+    extern void f_di(float *result, double *a, int *b);
+    extern void result(float *val);
+}
+
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ALIGN
+#else
+#define ALIGN __attribute__((aligned(64)))
+#endif
+
+int main(int argc, char *argv[]) {
+    int w = width();
+    assert(w <= 64);
+
+    float returned_result[64] ALIGN;
+    float vfloat[64] ALIGN;
+    double vdouble[64] ALIGN;
+    int vint[64] ALIGN;
+    int vint2[64] ALIGN;
+
+    const int device = 0;
+#if 0
+    const bool verbose = true;
+#else
+    const bool verbose = false;
+#endif
+
+    /*******************/
+    createContext(device, verbose);
+    /*******************/
+
+    devicePtr d_returned_result = deviceMalloc(64*sizeof(float));
+    devicePtr d_vfloat          = deviceMalloc(64*sizeof(float));
+    devicePtr d_vdouble         = deviceMalloc(64*sizeof(double));
+    devicePtr d_vint            = deviceMalloc(64*sizeof(int));
+    devicePtr d_vint2           = deviceMalloc(64*sizeof(int));
+
+
+    for (int i = 0; i < 64; ++i) {
+        returned_result[i] = -1e20;
+        vfloat[i] = i+1;
+        vdouble[i] = i+1;
+        vint[i] = 2*(i+1);
+        vint2[i] = i+5;
+    }
+
+    memcpyH2D(d_returned_result, returned_result, 64*sizeof(float));
+    memcpyH2D(d_vfloat         , vfloat,          64*sizeof(float));
+    memcpyH2D(d_vdouble        , vdouble,         64*sizeof(double));
+    memcpyH2D(d_vint        , vint,         64*sizeof(int));
+    memcpyH2D(d_vint2       , vint2,         64*sizeof(int));
+
+
+    float b = 5.;
+
+    const bool print_log = false;
+    const int  nreg = 64;
+#if (TEST_SIG == 0)
+    void *args[] = {&d_returned_result};
+    CUDALaunch(NULL, "f_v", args, print_log, nreg);
+#elif (TEST_SIG == 1)
+    void *args[] = {&d_returned_result, &d_vfloat};
+    CUDALaunch(NULL, "f_f", args, print_log, nreg);
+#elif (TEST_SIG == 2)
+    void *args[] = {&d_returned_result, &d_vfloat, &b};
+    CUDALaunch(NULL, "f_fu", args, print_log, nreg);
+#elif (TEST_SIG == 3)
+    void *args[] = {&d_returned_result, &d_vfloat, &vint};
+    CUDALaunch(NULL, "f_fi", args, print_log, nreg);
+#elif (TEST_SIG == 4)
+    int num = 5;
+    void *args[] = {&d_returned_result, &d_vdouble, &num};
+    CUDALaunch(NULL, "f_du", args, print_log, nreg);
+#elif (TEST_SIG == 5)
+    float num = 5.0f;
+    void *args[] = {&d_returned_result, &d_vdouble, &num};
+    CUDALaunch(NULL, "f_duf", args, print_log, nreg);
+#elif (TEST_SIG == 6)
+    void *args[] = {&d_returned_result, &d_vdouble, &v_int2};
+    CUDALaunch(NULL, "f_di", args, print_log, nreg);
+#else
+#error "Unknown or unset TEST_SIG value"
+#endif
+
+    float expected_result[64];
+    
+    memset(expected_result, 0, 64*sizeof(float));
+    devicePtr d_expected_result = deviceMalloc(64*sizeof(float));
+    memcpyH2D(d_expected_result, expected_result, 64*sizeof(float));
+    void *res_args[] = {&d_expected_result};
+    CUDALaunch(NULL, "result", res_args, print_log, nreg);
+    memcpyD2H(expected_result, d_expected_result, 64*sizeof(float));
+    memcpyD2H(returned_result, d_returned_result, 64*sizeof(float));
+
+    deviceFree(d_returned_result);
+    deviceFree(d_vfloat);
+    deviceFree(d_vdouble);
+    deviceFree(d_vint);
+    deviceFree(d_vint2);
+    deviceFree(d_expected_result);
+
+    /*******************/
+    destroyContext();
+    /*******************/
+
+    int errors = 0;
+    for (int i = 0; i < w; ++i) {
+        if (returned_result[i] != expected_result[i]) {
+#ifdef EXPECT_FAILURE
+            // bingo, failed
+            return 1;
+#else
+            printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
+                   argv[0], i, returned_result[i], returned_result[i],
+                   expected_result[i], expected_result[i]);
+            ++errors;
+#endif // EXPECT_FAILURE
+        }
+    }
+
+#ifdef EXPECT_FAILURE
+    // Don't expect to get here
+    return 0;
+#else
+    return errors > 0;
+#endif
+}
--- a/test_static_nvptx.cpp
+++ b/test_static_nvptx.cpp
@@ -0,0 +1,133 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_WINDOWS
+#include <windows.h>
+#endif // ISPC_IS_WINDOWS
+
+#include <cassert>
+#include <cstring>
+#include <cstdio>
+#include <stdint.h>
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif
+
+#include "ispc_malloc.h"
+
+#define N 32
+extern "C" {
+    int width() { return N; }
+    extern void f_v(float *result);
+    extern void f_f(float *result, float *a);
+    extern void f_fu(float *result, float *a, float b);
+    extern void f_fi(float *result, float *a, int *b);
+    extern void f_du(float *result, double *a, double b);
+    extern void f_duf(float *result, double *a, float b);
+    extern void f_di(float *result, double *a, int *b);
+    extern void result(float *val);
+}
+
+int main(int argc, char *argv[]) {
+    int w = width();
+    assert(w <= N);
+
+    float *returned_result = new float[N*4];
+    float *vfloat = new float[N*4];
+    double *vdouble = new double[N*4];
+    int *vint = new int[N*4];
+    int *vint2 = new int[N*4];
+
+    for (int i = 0; i < N*4; ++i) {
+        returned_result[i] = -1e20;
+        vfloat[i] = i+1;
+        vdouble[i] = i+1;
+        vint[i] = 2*(i+1);
+        vint2[i] = i+5;
+    }
+
+    float b = 5.;
+
+#if (TEST_SIG == 0)
+    f_v(returned_result);
+#elif (TEST_SIG == 1)
+    f_f(returned_result, vfloat);
+#elif (TEST_SIG == 2)
+    f_fu(returned_result, vfloat, b);
+#elif (TEST_SIG == 3)
+    f_fi(returned_result, vfloat, vint);
+#elif (TEST_SIG == 4)
+    f_du(returned_result, vdouble, 5.);
+#elif (TEST_SIG == 5)
+    f_duf(returned_result, vdouble, 5.f);
+#elif (TEST_SIG == 6)
+    f_di(returned_result, vdouble, vint2);
+#else
+#error "Unknown or unset TEST_SIG value"
+#endif
+
+    float *expected_result = new float[N];
+    memset(expected_result, 0, N*sizeof(float));
+    result(expected_result);
+
+    int errors = 0;
+    for (int i = 0; i < w; ++i) {
+        if (returned_result[i] != expected_result[i]) 
+        {
+#ifdef EXPECT_FAILURE
+            // bingo, failed
+            return 1;
+#else
+            printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
+                   argv[0], i, returned_result[i], returned_result[i],
+                   expected_result[i], expected_result[i]);
+            ++errors;
+#endif // EXPECT_FAILURE
+        }
+    }
+
+#ifdef EXPECT_FAILURE
+    // Don't expect to get here
+    return 0;
+#else
+    return errors > 0;
+#endif
+}
--- a/tests/array-mixed-unif-vary-indexing-3.ispc
+++ b/tests/array-mixed-unif-vary-indexing-3.ispc
@@ -5,7 +5,13 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    assert(programCount <= 64);
+#ifdef __NVPTX__
+    uniform float  * uniform xarr   =  uniform new uniform float[70*70];
+    uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr;
+#define _SHMALLOC
+#else
    uniform float x[70][70];
+#endif
    for (uniform int i = 0; i < 70; ++i)
        for (uniform int j = 0; j < 70; ++j)
            x[i][j] = 2+b-5;
@@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    else
        x[b-1][a-1] = 1;
    RET[programIndex] = x[4][a];
+
+#ifdef _SHMALLOC
+    delete xarr;
+#endif
 }

 export void result(uniform float RET[]) { 
--- a/tests/broadcast.ispc
+++ b/tests/broadcast.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = (programCount == 1) ? 3 : broadcast(a, 2);
+    float b = (programCount == 1) ? 4 : broadcast(a, 2);
    RET[programIndex] = b;
 }

--- a/tests/c-test-64.ispc
+++ b/tests/c-test-64.ispc
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 2;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 5;
-    RET[3] = RET[7] = RET[11] = RET[15] = 6;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 2;
+    RET[i+1] = 3;
+    RET[i+2] = 5;
+    RET[i+3] = 6;
+  }
 }
--- a/tests/c-test-65.ispc
+++ b/tests/c-test-65.ispc
@@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) {

 export void result(uniform float RET[]) {
    RET[programIndex] = 3;
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    for (int i = 0; i < programCount; i += 4)
+    {
+      RET[i+0] = 1;
+      RET[i+3] = 29;
+    }
 }
--- a/tests/c-test-66.ispc
+++ b/tests/c-test-66.ispc
@@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {

 export void result(uniform float RET[]) {
    RET[programIndex] = 32;
-    RET[2] = RET[6] = RET[10] = RET[14] = 38;
-    RET[3] = RET[7] = RET[11] = RET[15] = 39;
+    for (int i = 0; i < programCount; i += 4)
+    {
+      RET[i+2] = 38;
+      RET[i+3] = 39;
+    }
 }
--- a/tests/cfor-array-struct-gather.ispc
+++ b/tests/cfor-array-struct-gather.ispc
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }


 struct Foo {
-    uniform float x[17];
+    uniform float x[programCount+1];
 };

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    uniform Foo foo;
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo.x[i] = i;

    if ((int)a & 1)
--- a/tests/cfor-gs-double-improve-multidim-1.ispc
+++ b/tests/cfor-gs-double-improve-multidim-1.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 
-    uniform double udx[25][25];
-    cfor (uniform int i = 0; i < 25; ++i)
-        cfor (uniform int j = 0; j < 25; ++j)
+    uniform double udx[programCount+1][programCount+1];
+    cfor (uniform int i = 0; i < programCount+1; ++i)
+        cfor (uniform int j = 0; j < programCount+1; ++j)
            udx[i][j] = 10*i+j;

    int x = 1;
--- a/tests/cfor-gs-improve-multidim-1.ispc
+++ b/tests/cfor-gs-improve-multidim-1.ispc
@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 
-    uniform float udx[20][20];
-    cfor (uniform int i = 0; i < 20; ++i)
-        cfor (uniform int j = 0; j < 20; ++j)
+    uniform float udx[programCount+1][programCount+1];
+    cfor (uniform int i = 0; i < programCount+1; ++i)
+        cfor (uniform int j = 0; j < programCount+1; ++j)
            udx[i][j] = 100*i+j;

    int x = 1;
--- a/tests/cfor-gs-improve-multidim-struct-1.ispc
+++ b/tests/cfor-gs-improve-multidim-struct-1.ispc
@@ -4,19 +4,27 @@ export uniform int width() { return programCount; }


 struct Foo {
-    uniform float udx[25][25];
+    uniform float udx[32][32];
 };

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 
+#ifndef __NVPTX__ 
    uniform Foo f[5];
+#else     /* too much shared memory allocated, nvcc fails to link */
+    uniform Foo * uniform f = uniform new uniform Foo[5];
+#define _UNMALLOC
+#endif
    cfor (uniform int i = 0; i < 5; ++i)
-        cfor (uniform int j = 0; j < 25; ++j)
-            cfor (uniform int k = 0; k < 25; ++k)
+        cfor (uniform int j = 0; j < 32; ++j)
+            cfor (uniform int k = 0; k < 32; ++k)
                f[i].udx[j][k] = 1000*i+100*j+k;

    int x = 1;
    RET[programIndex] = f[x+1].udx[b-4][programIndex];
+#ifdef _UNMALLOC
+    delete f;
+#endif
 }

 export void result(uniform float RET[]) { RET[programIndex] = 2100 +programIndex; }
--- a/tests/cfor-struct-gather-2.ispc
+++ b/tests/cfor-struct-gather-2.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo[i].f = i*a;
    RET[programIndex] = func(foo, (int)a);
 }
--- a/tests/cfor-struct-gather-3.ispc
+++ b/tests/cfor-struct-gather-3.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo[i].f = i*a;
    RET[programIndex] = func(foo, (int)a);
 }
--- a/tests/cfor-struct-gather.ispc
+++ b/tests/cfor-struct-gather.ispc
@@ -9,9 +9,9 @@ struct Foo {

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo[i].f = i*a;
    RET[programIndex] = foo[(int)a].f;
 }
--- a/tests/cfor-struct-test-114.ispc
+++ b/tests/cfor-struct-test-114.ispc
@@ -10,9 +10,9 @@ struct Foo {
 export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) {
    float a = aFOO[programIndex];
    int b = bFOO[programIndex];
-    varying Foo myFoo[17];
+    varying Foo myFoo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i) {
+    cfor (i = 0; i < programCount+1; ++i) {
        myFoo[i].x = i;
        myFoo[i].f = 2*i;
    }
--- a/tests/cfor-test-134.ispc
+++ b/tests/cfor-test-134.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/cfor-test-135.ispc
+++ b/tests/cfor-test-135.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/cfor-test-136.ispc
+++ b/tests/cfor-test-136.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/cfor-test-64.ispc
+++ b/tests/cfor-test-64.ispc
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 2;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 5;
-    RET[3] = RET[7] = RET[11] = RET[15] = 6;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 2;
+    RET[i+1] = 3;
+    RET[i+2] = 5;
+    RET[i+3] = 6;
+  }
 }
--- a/tests/cfor-test-65.ispc
+++ b/tests/cfor-test-65.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/cfor-test-66.ispc
+++ b/tests/cfor-test-66.ispc
@@ -18,8 +18,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 32;
-    RET[1] = RET[5] = RET[9] = RET[13] = 32;
-    RET[2] = RET[6] = RET[10] = RET[14] = 38;
-    RET[3] = RET[7] = RET[11] = RET[15] = 39;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 32;
+    RET[i+1] = 32;
+    RET[i+2] = 38;
+    RET[i+3] = 39;
+  }
 }
--- a/tests/cfor-unif-struct-test-114.ispc
+++ b/tests/cfor-unif-struct-test-114.ispc
@@ -8,9 +8,9 @@ struct Foo {
 };
 export void f_fi(uniform float RET[], uniform float a[], uniform int bFOO[]) {
    int b = bFOO[programIndex];
-    uniform struct Foo myFoo[17];
+    uniform struct Foo myFoo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i) {
+    cfor (i = 0; i < programCount+1; ++i) {
        myFoo[i].x = i;
        myFoo[i].f = 2*i;
    }
--- a/tests/const-fold-1.ispc
+++ b/tests/const-fold-1.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform int x = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
-    static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
+    const static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
    RET[programIndex] = (x == y) ? 1. : 0.;
 }

--- a/tests/const-fold-2.ispc
+++ b/tests/const-fold-2.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform int x = (170 >> 4) % 5;
-    static uniform int y = (170 >> 4) % 5;
+    const static uniform int y = (170 >> 4) % 5;
    RET[programIndex] = (x == y) ? 1. : 0.;
 }

--- a/tests/const-fold-3.ispc
+++ b/tests/const-fold-3.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform int x = (17 < 2) || (6 >= 5) && (20 >= 20);
-    static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
+    const static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
    RET[programIndex] = ((x!=0) == (y!=0)) ? 1. : 0.;
 }

--- a/tests/launch-8.ispc
+++ b/tests/launch-8.ispc
@@ -2,22 +2,23 @@
 export uniform int width() { return programCount; }


-#define N0 10
+#define N0 12
 #define N1 20
 #define N2 50
 static uniform float array[N2][N1][N0];

-task void x(const float f) {
+task void x(const uniform float farray[]) {
+    const float f = farray[programIndex];
    uniform int j;

-    assert(taskCount  == (int32)N0*N1*N2);
-    assert(taskCount0 == (int32)N0);
-    assert(taskCount1 == (int32)N1);
-    assert(taskCount2 == (int32)N2);
-    assert(taskIndex  == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2));
-    assert(taskIndex0 < (int32)N0);
-    assert(taskIndex1 < (int32)N1);
-    assert(taskIndex2 < (int32)N2);
+    assert(taskCount  == (uniform int32)N0*N1*N2);
+    assert(taskCount0 == (uniform int32)N0);
+    assert(taskCount1 == (uniform int32)N1);
+    assert(taskCount2 == (uniform int32)N2);
+    assert(taskIndex  == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2));
+    assert(taskIndex0 < (uniform int32)N0);
+    assert(taskIndex1 < (uniform int32)N1);
+    assert(taskIndex2 < (uniform int32)N2);

    const uniform int i0 = taskIndex0;
    const uniform int i1 = taskIndex1;
@@ -30,7 +31,7 @@ task void x(const float f) {
        array[i2][i1][i0] = i;
 }
 export void f_f(uniform float RET[], uniform float fFOO[]) { 
-    float f = fFOO[programIndex];
+    uniform float * uniform f = fFOO;
    launch[N2][N1][N0] x(f);
    sync;
    RET[programIndex] = array[N2-1][N1-1][N0-1];
@@ -38,5 +39,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) {


 export void result(uniform float RET[]) {
-    RET[programIndex] = 9999.000000;
+    RET[programIndex] = 11999.000000;
 }
--- a/tests/launch-9.ispc
+++ b/tests/launch-9.ispc
@@ -2,12 +2,13 @@
 export uniform int width() { return programCount; }


-#define N0 10
+#define N0 12
 #define N1 20
 #define N2 50
 static uniform float array[N2][N1][N0];

-task void x(const float f) {
+task void x(const uniform float farray[]) {
+    const float f = farray[programIndex];
    uniform int j;

    assert(taskCount  == (int32)N0*N1*N2);
@@ -30,13 +31,13 @@ task void x(const float f) {
        array[i2][i1][i0] = i;
 }
 export void f_f(uniform float RET[], uniform float fFOO[]) { 
-    float f = fFOO[programIndex];
-    launch[N0,N1,N2] x(f);
+    uniform float * uniform f = fFOO;
+    launch[N2][N1][N0] x(f);
    sync;
    RET[programIndex] = array[N2-1][N1-1][N0-1];
 }


 export void result(uniform float RET[]) {
-    RET[programIndex] = 9999.000000;
+    RET[programIndex] = 11999.000000;
 }
--- a/tests/operators2.ispc
+++ b/tests/operators2.ispc
@@ -1,4 +1,9 @@
+#ifdef __NVPTX__
+uniform int _off[programCount];
+#define off _off[programIndex]
+#else  /* global varying data types are not yet supported with "nvptx" target */
 int off;
+#endif

 export uniform int width() { return programCount; }

@@ -22,11 +27,11 @@ struct S operator/(struct S rr, struct S rv) {
    return c;
 }

-struct S a;
-struct S b;
-struct S d;

 export void f_f(uniform float RET[], uniform float aFOO[]) {
+    struct S a;
+    struct S b;
+    struct S d;
    int T = programIndex;
    a.a = aFOO[programIndex];
    b.a = -aFOO[programIndex];
--- a/tests/soa-16.ispc
+++ b/tests/soa-16.ispc
@@ -15,6 +15,16 @@ static void p(uniform float *uniform ptr) {
 }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+#ifdef __NVPTX__  /* soa is converted to shared memory story for now, use smaller amount to check the test */
+    soa<4> Point pts[10];
+    for (uniform int i = 0; i < 40; ++i) {
+        pts[i].x = b*i;
+        pts[i].y[0] = 2*b*i;
+        pts[i].y[1] = 2*b*i+1;
+        pts[i].y[2] = 2*b*i+2;
+        pts[i].z = 3*b*i;
+    }
+#else
    soa<4> Point pts[30];
    for (uniform int i = 0; i < 120; ++i) {
        pts[i].x = b*i;
@@ -23,6 +33,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
        pts[i].y[2] = 2*b*i+2;
        pts[i].z = 3*b*i;
    }
+#endif

    float a = aFOO[programIndex]; 
    a *= -1;
--- a/tests/soa-17.ispc
+++ b/tests/soa-17.ispc
@@ -16,6 +16,16 @@ static void p(uniform float *uniform ptr) {
 }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+#ifdef __NVPTX__  /* soa is converted to shared memory story for now, use smaller amount to check the test */
+    soa<4> Point pts[15];
+    for (uniform int i = 0; i < 60; ++i) {
+        pts[i].x = b*i;
+        pts[i].y[0] = 2*b*i;
+        pts[i].y[1] = 2*b*i+1;
+        pts[i].y[2] = 2*b*i+2;
+        pts[i].z = 3*b*i;
+    }
+#else
    soa<4> Point pts[40];
    for (uniform int i = 0; i < 160; ++i) {
        pts[i].x = b*i;
@@ -24,6 +34,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
        pts[i].y[2] = 2*b*i+2;
        pts[i].z = 3*b*i;
    }
+#endif

    float a = aFOO[programIndex]; 
    a *= -1;
--- a/tests/soa-3.ispc
+++ b/tests/soa-3.ispc
@@ -6,6 +6,17 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 

+#ifdef __NVPTX__  /* soa is converted to shared memory story for now, use smaller amount to check the test */
+    soa<8> Point pts[4];
+//CO    uniform Point pts[80];
+    foreach (i = 0 ... 40) {
+        pts[i].x = b*i;
+        pts[i].y[0] = 2*b*i;
+        pts[i].y[1] = 2*b*i+1;
+        pts[i].y[2] = 2*b*i+2;
+        pts[i].z = 3*b*i;
+    }
+#else
    soa<8> Point pts[10];
 //CO    uniform Point pts[80];
    foreach (i = 0 ... 80) {
@@ -15,6 +26,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
        pts[i].y[2] = 2*b*i+2;
        pts[i].z = 3*b*i;
    }
+#endif

    assert(programCount < 80);
    RET[programIndex] = pts[programIndex].y[2];
--- a/tests/test-134.ispc
+++ b/tests/test-134.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/test-135.ispc
+++ b/tests/test-135.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/test-136.ispc
+++ b/tests/test-136.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 1;
+    RET[i+1] = 3;
+    RET[i+2] = 3;
+    RET[i+3] = 29;
+  }
 }
--- a/tests/test-140.ispc
+++ b/tests/test-140.ispc
@@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }

 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0;
-    RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1;
-    RET[2] = RET[6] = RET[10] = RET[14] =  0x1.193ea8p+0;
-    RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0;
+  for (int i = 0; i < programCount; i += 4)
+  {
+    RET[i+0] = 0x0.0p+0;
+    RET[i+1] = 0x1.62e43p-1;
+    RET[i+2] = 0x1.193ea8p+0;
+    RET[i+3] = 0x1.62e43p+0;
+  }
 }
--- a/tests/test-141.ispc
+++ b/tests/test-141.ispc
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    // calculation error 1e-6 is the same as in icc
-    RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0;
+    RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 1 : 0;
 }

 export void result(uniform float RET[4]) {
--- a/tests/test-142.ispc
+++ b/tests/test-142.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    RET[programIndex] = round(a+.499999); 
+    RET[programIndex] = round(a+.49999); 
 }

 export void result(uniform float RET[]) {
--- a/tests/test-144.ispc
+++ b/tests/test-144.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    RET[programIndex] = floor(a+.999999); 
+    RET[programIndex] = floor(a+.99999); 
 }

 export void result(uniform float RET[]) {
--- a/tests/uniform-1.ispc
+++ b/tests/uniform-1.ispc
@@ -0,0 +1,34 @@
+
+export uniform int width() { return programCount; }
+
+
+task void f_f_task(uniform float RET[], uniform float aFOO[]) {
+    uniform float val[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        val[i] = 0;
+
+    foreach (i = 0 ... programCount)
+        val[i] += aFOO[programCount*taskIndex + i] - 1;
+
+    uniform float sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        sum += val[i];
+
+    if (programIndex < 32/4)
+      RET[programCount/4*taskIndex + programIndex] = sum;
+}
+
+export void f_f(uniform float RET[], uniform float aFOO[]) 
+{
+  launch[4] f_f_task(RET,  aFOO);
+}
+task void result_task(uniform float RET[])
+{
+  const uniform float ret = reduce_add(programIndex + programCount*taskIndex);
+  if (programIndex < 32/4)
+    RET[programCount/4*taskIndex + programIndex] = ret;
+}
+
+export void result(uniform float RET[]) {
+  launch[4] result_task(RET);
+}
--- a/type.cpp
+++ b/type.cpp
@@ -749,7 +749,7 @@ EnumType::Mangle() const {
    std::string ret;
    if (isConst) ret += "C";
    ret += variability.MangleString();
-    ret += std::string("enum[") + name + std::string("]");
+    ret += std::string("enum_5B_") + name + std::string("_5C_");
    return ret;
 }

@@ -1420,7 +1420,7 @@ ArrayType::Mangle() const {
        sprintf(buf, "%d", numElements);
    else
        buf[0] = '\0';
-    return s + "[" + buf + "]";
+    return s + "_5B_" + buf + "_5C_";
 }


@@ -2058,12 +2058,12 @@ lMangleStruct(Variability variability, bool isConst, const std::string &name) {
    Assert(variability != Variability::Unbound);

    std::string ret;
-    ret += "s[";
+    ret += "s_5B_";
    if (isConst)
        ret += "_c_";
    ret += variability.MangleString();

-    ret += name + std::string("]");
+    ret += name + std::string("_5C_");
    return ret;
 }

@@ -3009,7 +3009,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
        llvmArgTypes.push_back(LLVMTypes::MaskType);

    std::vector<llvm::Type *> callTypes;
-    if (isTask) {
+    if (isTask && g->target->getISA() != Target::NVPTX) {
        // Tasks take three arguments: a pointer to a struct that holds the
        // actual task arguments, the thread index, and the total number of
        // threads the tasks system has running.  (Task arguments are