merged with nvptx

2014-02-20 11:01:58 +01:00
parent c325c0e085 11612a24ee
commit 4196c723eb
68 changed files with 8181 additions and 470 deletions
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -338,11 +338,13 @@ lSetInternalFunctions(llvm::Module *module) {
        "__all",
        "__any",
        "__aos_to_soa3_float",
        "__aos_to_soa3_float1",
        "__aos_to_soa3_float16",
        "__aos_to_soa3_float4",
        "__aos_to_soa3_float8",
        "__aos_to_soa3_int32",
        "__aos_to_soa4_float",
        "__aos_to_soa4_float1",
        "__aos_to_soa4_float16",
        "__aos_to_soa4_float4",
        "__aos_to_soa4_float8",
@@ -351,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
        "__atomic_add_int64_global",
        "__atomic_add_uniform_int32_global",
        "__atomic_add_uniform_int64_global",
        "__atomic_add_varying_int32_global",
        "__atomic_add_varying_int64_global",
        "__atomic_and_int32_global",
        "__atomic_and_int64_global",
        "__atomic_and_uniform_int32_global",
        "__atomic_and_uniform_int64_global",
        "__atomic_and_varying_int32_global",
        "__atomic_and_varying_int64_global",
        "__atomic_compare_exchange_double_global",
        "__atomic_compare_exchange_float_global",
        "__atomic_compare_exchange_int32_global",
@@ -363,18 +369,30 @@ lSetInternalFunctions(llvm::Module *module) {
        "__atomic_compare_exchange_uniform_float_global",
        "__atomic_compare_exchange_uniform_int32_global",
        "__atomic_compare_exchange_uniform_int64_global",
        "__atomic_compare_exchange_varying_double_global",
        "__atomic_compare_exchange_varying_float_global",
        "__atomic_compare_exchange_varying_int32_global",
        "__atomic_compare_exchange_varying_int64_global",
        "__atomic_max_uniform_int32_global",
        "__atomic_max_uniform_int64_global",
        "__atomic_min_uniform_int32_global",
        "__atomic_min_uniform_int64_global",
        "__atomic_max_varying_int32_global",
        "__atomic_max_varying_int64_global",
        "__atomic_min_varying_int32_global",
        "__atomic_min_varying_int64_global",
        "__atomic_or_int32_global",
        "__atomic_or_int64_global",
        "__atomic_or_uniform_int32_global",
        "__atomic_or_uniform_int64_global",
        "__atomic_or_varying_int32_global",
        "__atomic_or_varying_int64_global",
        "__atomic_sub_int32_global",
        "__atomic_sub_int64_global",
        "__atomic_sub_uniform_int32_global",
        "__atomic_sub_uniform_int64_global",
        "__atomic_sub_varying_int32_global",
        "__atomic_sub_varying_int64_global",
        "__atomic_swap_double_global",
        "__atomic_swap_float_global",
        "__atomic_swap_int32_global",
@@ -383,14 +401,28 @@ lSetInternalFunctions(llvm::Module *module) {
        "__atomic_swap_uniform_float_global",
        "__atomic_swap_uniform_int32_global",
        "__atomic_swap_uniform_int64_global",
        "__atomic_swap_varying_double_global",
        "__atomic_swap_varying_float_global",
        "__atomic_swap_varying_int32_global",
        "__atomic_swap_varying_int64_global",
        "__atomic_umax_uniform_uint32_global",
        "__atomic_umax_uniform_uint64_global",
        "__atomic_umin_uniform_uint32_global",
        "__atomic_umin_uniform_uint64_global",
        "__atomic_umax_varying_uint32_global",
        "__atomic_umax_varying_uint64_global",
        "__atomic_umin_varying_uint32_global",
        "__atomic_umin_varying_uint64_global",
        "__atomic_xor_int32_global",
        "__atomic_xor_int64_global",
        "__atomic_xor_uniform_int32_global",
        "__atomic_xor_uniform_int64_global",
        "__atomic_xor_uniform_int32_global",
        "__atomic_xor_uniform_int64_global",
        "__atomic_xor_varying_int32_global",
        "__atomic_xor_varying_int64_global",
        "__atomic_xor_varying_int32_global",
        "__atomic_xor_varying_int64_global",
        "__broadcast_double",
        "__broadcast_float",
        "__broadcast_i16",
@@ -413,6 +445,7 @@ lSetInternalFunctions(llvm::Module *module) {
        "__do_assert_uniform",
        "__do_assert_varying",
        "__do_print",
        "__do_print_nvptx",
        "__doublebits_uniform_int64",
        "__doublebits_varying_int64",
        "__exclusive_scan_add_double",
@@ -427,6 +460,8 @@ lSetInternalFunctions(llvm::Module *module) {
        "__extract_int32",
        "__extract_int64",
        "__extract_int8",
        "__extract_float",
        "__extract_double",
        "__fastmath",
        "__float_to_half_uniform",
        "__float_to_half_varying",
@@ -443,6 +478,8 @@ lSetInternalFunctions(llvm::Module *module) {
        "__insert_int32",
        "__insert_int64",
        "__insert_int8",
        "__insert_float",
        "__insert_double",
        "__intbits_uniform_double",
        "__intbits_uniform_float",
        "__intbits_varying_double",
@@ -479,6 +516,7 @@ lSetInternalFunctions(llvm::Module *module) {
        "__min_varying_uint32",
        "__min_varying_uint64",
        "__movmsk",
        "__movmsk_ptx",
        "__new_uniform_32rt",
        "__new_uniform_64rt",
        "__new_varying32_32rt",
@@ -560,11 +598,13 @@ lSetInternalFunctions(llvm::Module *module) {
        "__shuffle_i64",
        "__shuffle_i8",
        "__soa_to_aos3_float",
        "__soa_to_aos3_float1",
        "__soa_to_aos3_float16",
        "__soa_to_aos3_float4",
        "__soa_to_aos3_float8",
        "__soa_to_aos3_int32",
        "__soa_to_aos4_float",
        "__soa_to_aos4_float1",
        "__soa_to_aos4_float16",
        "__soa_to_aos4_float4",
        "__soa_to_aos4_float8",
@@ -622,6 +662,24 @@ lSetInternalFunctions(llvm::Module *module) {
        "__vec4_add_int32",
        "__vselect_float",
        "__vselect_i32",
        "__program_index",
        "__program_count",
        "__warp_index",
        "__task_index0",
        "__task_index1",
        "__task_index2",
        "__task_index",
        "__task_count0",
        "__task_count1",
        "__task_count2",
        "__task_count",
        "__cvt_loc2gen",
        "__cvt_loc2gen_var",
        "__cvt_const2gen",
        "__puts_nvptx",
        "ISPCAlloc",
        "ISPCLaunch",
        "ISPCSync",
    };
    int count = sizeof(names) / sizeof(names[0]);
@@ -694,6 +752,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
            g->target->getISA() != Target::NEON16 &&
            g->target->getISA() != Target::NEON8)
 #endif // !__arm__
        if (g->target->getISA() != Target::NVPTX)
        {
            Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
                   mTriple.getArch() == bcTriple.getArch());
@@ -855,7 +914,17 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    // Next, add the target's custom implementations of the various needed
    // builtin functions (e.g. __masked_store_32(), etc).
    switch (g->target->getISA()) {
-
+    case Target::NVPTX: 
      {
        if (runtime32) {
            fprintf(stderr, "Unforetunatly 32bit targets are supported at the moment .. \n");
            assert(0);
        }
        else {
            EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
        }
        break;
      };
 #ifdef ISPC_ARM_ENABLED
    case Target::NEON8: {
        if (runtime32) {
@@ -1125,7 +1194,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    }
    // define the 'programCount' builtin variable
    if (g->target->getISA() != Target::NVPTX)
    {
      lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
    }
    else
    {
      lDefineConstantInt("programCount", 32, module, symbolTable);
    }
    // define the 'programIndex' builtin
    lDefineProgramIndex(module, symbolTable);
@@ -1155,6 +1231,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
                       module, symbolTable);
    lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX),
                       module, symbolTable);
    if (g->forceAlignment != -1) {
        llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
        alignment->setInitializer(LLVMInt32(g->forceAlignment));
--- a/builtins/__do_print_nvptx.cu
+++ b/builtins/__do_print_nvptx.cu
@@ -0,0 +1,130 @@
 #include <cstdio>
 #define PRINT_BUF_SIZE 4096
 #define uint64_t unsigned long long
 static __device__ size_t d_strlen(const char *str)
 {
  const char *s;
  for (s = str; *s; ++s)
    ;
  return (s - str);
 }
 static __device__  char* d_strncat(char *dest, const char *src, size_t n)
 {
  size_t dest_len = d_strlen(dest);
  size_t i;
  for (i = 0 ; i < n && src[i] != '\0' ; i++)
    dest[dest_len + i] = src[i];
  dest[dest_len + i] = '\0';
  return dest;
 }
 #define APPEND(str)                                        \
    do {                                                   \
        int offset = bufp - &printString[0];               \
        *bufp = '\0';                                      \
        d_strncat(bufp, str, PRINT_BUF_SIZE-offset);         \
        bufp += d_strlen(str);                               \
        if (bufp >= &printString[PRINT_BUF_SIZE])          \
            goto done;                                     \
    } while (0) /* eat semicolon */
 #define PRINT_SCALAR(fmt, type)                  \
    sprintf(tmpBuf, fmt, *((type *)ptr));        \
    APPEND(tmpBuf);                              \
    break
 #define PRINT_VECTOR(fmt, type)                                         \
    *bufp++ = '[';                                                      \
    if (bufp == &printString[PRINT_BUF_SIZE]) break;                    \
    for (int i = 0; i < width; ++i) {                                   \
        /* only print the value if the current lane is executing */     \
        type val0 = *((type*)ptr);                                      \
        type val = val0;                                                \
        if (mask & (1ull<<i))                                           \
            sprintf(tmpBuf, fmt, val);                                  \
        else                                                            \
            sprintf(tmpBuf, "(( * )) ");                                \
        APPEND(tmpBuf);                                                 \
        *bufp++ = (i != width-1 ? ',' : ']');                           \
    }                                                                   \
    break
 extern "C"
 __device__ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
                void **args) {
    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
    char *bufp = &printString[0];
    char tmpBuf[256];
    const char  trueBuf[] = "true";
    const char falseBuf[] = "false";
    int argCount = 0;
    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
        // Format strings are just single percent signs.
        if (*format != '%') {
            *bufp++ = *format;
        }
        else {
            if (*types) {
                void *ptr = args[argCount++];
                // Based on the encoding in the types string, cast the
                // value appropriately and print it with a reasonable
                // printf() formatting string.
                switch (*types) {
                case 'b': {
                    const char *tmpBuf1 =  *((bool *)ptr) ? trueBuf : falseBuf;
                    APPEND(tmpBuf1);
                    break;
                }
                case 'B': {
                    *bufp++ = '[';
                    if (bufp == &printString[PRINT_BUF_SIZE])
                        break;
                    for (int i = 0; i < width; ++i) {
                        bool val0 = *((bool*)ptr);                                     
                        bool val = val0;                                                \
                        if (mask & (1ull << i)) {
                            const char *tmpBuf1 =  val ? trueBuf : falseBuf;
                            APPEND(tmpBuf1);
                        }
                        else
                            APPEND("_________");
                        *bufp++ = (i != width-1) ? ',' : ']';
                    }
                    break;
                }
                case 'i': PRINT_SCALAR("%d", int);
                case 'I': PRINT_VECTOR("%d", int);
                case 'u': PRINT_SCALAR("%u", unsigned int);
                case 'U': PRINT_VECTOR("%u", unsigned int);
                case 'f': PRINT_SCALAR("%f", float);
                case 'F': PRINT_VECTOR("%f", float);
                case 'l': PRINT_SCALAR("%lld", long long);
                case 'L': PRINT_VECTOR("%lld", long long);
                case 'v': PRINT_SCALAR("%llu", unsigned long long);
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
                case 'p': PRINT_SCALAR("%p", void *);
                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    APPEND("UNKNOWN TYPE ");
                    *bufp++ = *types;
                }
                ++types;
            }
        }
        ++format;
    }
 done:
    *bufp = '\n'; bufp++;
    *bufp = '\0';
 }
--- a/builtins/builtins.c
+++ b/builtins/builtins.c
@@ -185,6 +185,81 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
    fflush(stdout);
 }
 /* this is print for PTX target only */
 int __puts_nvptx(const char *);
 void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
                void **args) {
 #if 0
    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
    char *bufp = &printString[0];
    char tmpBuf[256];
    int argCount = 0;
    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
        // Format strings are just single percent signs.
        if (*format != '%') {
            *bufp++ = *format;
        }
        else {
            if (*types) {
                void *ptr = args[argCount++];
                // Based on the encoding in the types string, cast the
                // value appropriately and print it with a reasonable
                // printf() formatting string.
                switch (*types) {
                case 'b': {
                    sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
                    APPEND(tmpBuf);
                    break;
                }
                case 'B': {
                    *bufp++ = '[';
                    if (bufp == &printString[PRINT_BUF_SIZE])
                        break;
                    for (int i = 0; i < width; ++i) {
                        if (mask & (1ull << i)) {
                            sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
                            APPEND(tmpBuf);
                        }
                        else
                            APPEND("_________");
                        *bufp++ = (i != width-1) ? ',' : ']';
                    }
                    break;
                }
                case 'i': PRINT_SCALAR("%d", int);
                case 'I': PRINT_VECTOR("%d", int);
                case 'u': PRINT_SCALAR("%u", unsigned int);
                case 'U': PRINT_VECTOR("%u", unsigned int);
                case 'f': PRINT_SCALAR("%f", float);
                case 'F': PRINT_VECTOR("%f", float);
                case 'l': PRINT_SCALAR("%lld", long long);
                case 'L': PRINT_VECTOR("%lld", long long);
                case 'v': PRINT_SCALAR("%llu", unsigned long long);
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
                case 'p': PRINT_SCALAR("%p", void *);
                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    APPEND("UNKNOWN TYPE ");
                    *bufp++ = *types;
                }
                ++types;
            }
        }
        ++format;
    }
 done:
    *bufp = '\n'; bufp++;
    *bufp = '\0';
    __puts_nvptx(printString);
 #else
    __puts_nvptx("---nvptx printing is not support---\n");
 #endif
 }
 int __num_cores() {
 #if defined(_MSC_VER) || defined(__MINGW32__)
--- a/builtins/target-avx-common.ll
+++ b/builtins/target-avx-common.ll
@@ -288,4 +288,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
 ;; int8/int16 builtins
 define_avgs()
 declare_nvptx()
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -10,6 +10,7 @@ packed_load_and_store()
 scans()
 int64minmax()
 aossoa()
 declare_nvptx()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -392,4 +392,4 @@ declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
 ;; int8/int16 builtins
 define_avgs()
-
+declare_nvptx()
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -344,3 +344,4 @@ packed_load_and_store(4)
 ;; prefetch
 define_prefetches()
 declare_nvptx()
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
--- a/builtins/target-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
 define_avgs()
 declare_nvptx()
--- a/builtins/target-sse4-common.ll
+++ b/builtins/target-sse4-common.ll
@@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %call = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %call
 }
 declare_nvptx()
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -4541,3 +4541,60 @@ define(`rcpd_decl', `
 declare  double @__rcp_uniform_double(double)
 declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
 ')
 define(`declare_nvptx',
 `
 declare i32 @__program_index()  nounwind readnone alwaysinline
 declare i32 @__program_count()  nounwind readnone alwaysinline
 declare i32 @__warp_index()  nounwind readnone alwaysinline
 declare i32 @__task_index0()  nounwind readnone alwaysinline
 declare i32 @__task_index1()  nounwind readnone alwaysinline
 declare i32 @__task_index2()  nounwind readnone alwaysinline
 declare i32 @__task_index()  nounwind readnone alwaysinline
 declare i32 @__task_count0()  nounwind readnone alwaysinline
 declare i32 @__task_count1()  nounwind readnone alwaysinline
 declare i32 @__task_count2()  nounwind readnone alwaysinline
 declare i32 @__task_count()  nounwind readnone alwaysinline
 declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
 declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
 declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline
 declare i64 @__movmsk_ptx(<WIDTH x i1>) nounwind readnone alwaysinline;
 ')
 define(`global_atomic_varying',`
 declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
 ')
 define(`global_atomic_cas_varying',`
 declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
 ')
 global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32)
 global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64)
 global_atomic_cas_varying(WIDTH, compare_exchange, float, float)
 global_atomic_cas_varying(WIDTH, compare_exchange, double, double)
 global_atomic_varying(WIDTH, swap, i32, int32)
 global_atomic_varying(WIDTH, swap, i64, int64)
 global_atomic_varying(WIDTH, swap, float, float)
 global_atomic_varying(WIDTH, swap, double, double)
 global_atomic_varying(WIDTH, add, i32, int32)
 global_atomic_varying(WIDTH, sub, i32, int32)
 global_atomic_varying(WIDTH, and, i32, int32)
 global_atomic_varying(WIDTH, or, i32, int32)
 global_atomic_varying(WIDTH, xor, i32, int32)
 global_atomic_varying(WIDTH, min, i32, int32)
 global_atomic_varying(WIDTH, max, i32, int32)
 global_atomic_varying(WIDTH, umin, i32, uint32)
 global_atomic_varying(WIDTH, umax, i32, uint32)
 global_atomic_varying(WIDTH, add, i64, int64)
 global_atomic_varying(WIDTH, sub, i64, int64)
 global_atomic_varying(WIDTH, and, i64, int64)
 global_atomic_varying(WIDTH, or, i64, int64)
 global_atomic_varying(WIDTH, xor, i64, int64)
 global_atomic_varying(WIDTH, min, i64, int64)
 global_atomic_varying(WIDTH, max, i64, int64)
 global_atomic_varying(WIDTH, umin, i64, uint64)
 global_atomic_varying(WIDTH, umax, i64, uint64)
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -57,6 +57,8 @@
  #include <llvm/IR/Instructions.h>
  #include <llvm/IR/DerivedTypes.h>
 #endif
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Support/FormattedStream.h>
 /** This is a small utility structure that records information related to one
    level of nested control flow.  It's mostly used in correctly restoring
@@ -1371,11 +1373,17 @@ FunctionEmitContext::None(llvm::Value *mask) {
 llvm::Value *
-FunctionEmitContext::LaneMask(llvm::Value *v) {
+FunctionEmitContext::LaneMask(llvm::Value *v) 
 {
 #if 1 /* this makes mandelbrot example slower, why ?!? */
  const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
 #else
  const char *__movmsk = "__movmsk";
 #endif
  // Call the target-dependent movmsk function to turn the vector mask
  // into an i64 value
  std::vector<Symbol *> mm;
-    m->symbolTable->LookupFunction("__movmsk", &mm);
+  m->symbolTable->LookupFunction(__movmsk, &mm);
  if (g->target->getMaskBitCount() == 1)
    AssertPos(currentPos, mm.size() == 1);
  else
@@ -1387,9 +1395,71 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
  return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
 }
 bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
 {
  llvm::Type *type = vector->getType();
  if (type == LLVMTypes::Int8VectorType)
    funcName += "_int8";
  else if (type == LLVMTypes::Int16VectorType)
    funcName += "_int16";
  else if (type == LLVMTypes::Int32VectorType)
    funcName += "_int32";
  else if (type == LLVMTypes::Int64VectorType)
    funcName += "_int64";
  else if (type == LLVMTypes::FloatVectorType)
    funcName += "_float";
  else if (type == LLVMTypes::DoubleVectorType)
    funcName += "_double";
  else
    return false;
  return true;
 }
 llvm::Value*
 FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
 {
  std::string funcName = "__insert";
  assert(lAppendInsertExtractName(vector, funcName));
  assert(lane->getType() == LLVMTypes::Int32Type);
  llvm::Function *func = m->module->getFunction(funcName.c_str());
  assert(func != NULL);
  std::vector<llvm::Value *> args;
  args.push_back(vector);
  args.push_back(lane);
  args.push_back(scalar);
  llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
  return ret;
 }
 llvm::Value*
 FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
 {
  std::string funcName = "__extract";
  assert(lAppendInsertExtractName(vector, funcName));
  assert(lane->getType() == LLVMTypes::Int32Type);
  llvm::Function *func = m->module->getFunction(funcName.c_str());
  assert(func != NULL);
  std::vector<llvm::Value *> args;
  args.push_back(vector);
  args.push_back(lane);
  llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
  return ret;
 }
 llvm::Value *
 FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
  if (g->target->getISA() == Target::NVPTX)
  {
    // Compare the two masks to get a vector of i1s
    llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
        v1, v2, "v1==v2");
    return ExtractInst(cmp, 0);  /* this works without calling All(..) in PTX. Why ?!? */
  }
  else
  {
 #if 0
    // Compare the two masks to get a vector of i1s
    llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
@@ -1405,6 +1475,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
        LLVMGetName("equal", v1, v2));
 #endif
  }
 }
 llvm::Value *
 FunctionEmitContext::ProgramIndexVector(bool is32bits) {
@@ -1418,6 +1489,17 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
    return index;
 }
 llvm::Value *
 FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
    llvm::Function *func_program_index  = m->module->getFunction("__program_index");
    llvm::Value *__program_index    = CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__program_indexS");
    llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV");
 #if 0
    if (!is32bits)
      index = ZExtInst(index, LLVMTypes::Int64VectandType);
 #endif
    return index;
 }
 llvm::Value *
@@ -1830,6 +1912,7 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
    if (name == NULL)
        name = LLVMGetName(value, "_ptr2int");
    llvm::Type *type = LLVMTypes::PointerIntType;
    llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock);
    AddDebugPos(inst);
@@ -3523,6 +3606,9 @@ llvm::Value *
 FunctionEmitContext::LaunchInst(llvm::Value *callee,
                                std::vector<llvm::Value *> &argVals,
                                llvm::Value *launchCount[3]){
    if (g->target->getISA() != Target::NVPTX)
    {
      if (callee == NULL) {
        AssertPos(currentPos, m->errorCount > 0);
        return NULL;
@@ -3588,10 +3674,96 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
      args.push_back(launchCount[2]);
      return CallInst(flaunch, NULL, args, "");
    }
    else /* NVPTX */
    {
      if (callee == NULL) {
        AssertPos(currentPos, m->errorCount > 0);
        return NULL;
      }
      launchedTasks = true;
      AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
      std::vector<llvm::Type*> argTypes;
      llvm::Function *F = llvm::dyn_cast<llvm::Function>(callee);
      const unsigned int nArgs = F->arg_size();
      llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
      for (; I != E; ++I) 
        argTypes.push_back(I->getType());
      llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
      llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
      llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
      if (structSize->getType() != LLVMTypes::Int64Type)
        structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
            "struct_size_to_64");
      const int align = 8;
      llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
      AssertPos(currentPos, falloc != NULL);
      std::vector<llvm::Value *> allocArgs;
      allocArgs.push_back(launchGroupHandlePtr);
      allocArgs.push_back(structSize);
      allocArgs.push_back(LLVMInt32(align));
      llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
      llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
      llvm::BasicBlock* if_true  = CreateBasicBlock("if_true");
      llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
      /* check if the pointer returned by ISPCAlloc is not NULL 
       * --------------
       * this is a workaround for not checking the value of programIndex 
       * because ISPCAlloc will return NULL pointer for all programIndex > 0
       * of course, if ISPAlloc fails to get parameter buffer, the pointer for programIndex = 0
       * will also be NULL
       * This check must be added, and also rewrite the code to make it less opaque 
       */
      llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
      BranchInst(if_true, if_false, cmp1);
      /**********************/
      bblock = if_true;    
      // label_if_then block:
      llvm::Type *pt = llvm::PointerType::getUnqual(st);
      llvm::Value *argmem = BitCastInst(voidmem, pt);
      for (unsigned int i = 0; i < argVals.size(); ++i) 
      {
        llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
        // don't need to do masked store here, I think
        StoreInst(argVals[i], ptr);
      }
      if (nArgs == argVals.size() + 1) {
        // copy in the mask
        llvm::Value *mask = GetFullMask();
        llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
            "funarg_mask");
        StoreInst(mask, ptr);
      }
      BranchInst(if_false);
      /**********************/
      bblock = if_false;
      llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
      llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
      AssertPos(currentPos, flaunch != NULL);
      std::vector<llvm::Value *> args;
      args.push_back(launchGroupHandlePtr);
      args.push_back(fptr);
      args.push_back(voidmem);
      args.push_back(launchCount[0]);
      args.push_back(launchCount[1]);
      args.push_back(launchCount[2]);
      llvm::Value *ret =  CallInst(flaunch, NULL, args, "");
      return ret;
    }
 }
 void
 FunctionEmitContext::SyncInst() {
    if (g->target->getISA() != Target::NVPTX)
    {
      llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
      llvm::Value *nullPtrValue =
        llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
@@ -3616,6 +3788,18 @@ FunctionEmitContext::SyncInst() {
      SetCurrentBasicBlock(bPostSync);
    }
    else /* NVPTX: don't do test, just call sync */
    {
      llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
      llvm::Value *nullPtrValue =
        llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
      llvm::Function *fsync = m->module->getFunction("ISPCSync");
      if (fsync == NULL)
        FATAL("Couldn't find ISPCSync declaration?!");
      CallInst(fsync, NULL, launchGroupHandle, "");
      StoreInst(nullPtrValue, launchGroupHandlePtr);
    }
 }
 /** When we gathering from or scattering to a varying atomic type, we need
--- a/ctx.h
+++ b/ctx.h
@@ -291,6 +291,13 @@ public:
        of the mask is on. */
    llvm::Value *LaneMask(llvm::Value *mask);
    /** Issues a call to __insert_int8/int16/int32/int64/float/double */
    llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
    /** Issues a call to __extract_int8/int16/int32/int64/float/double */
    llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
    /** Given two masks of type LLVMTypes::MaskType, return an i1 value
        that indicates whether the two masks are equal. */
    llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
@@ -298,6 +305,7 @@ public:
    /** Generate ConstantVector, which contains ProgramIndex, i.e.
        < i32 0, i32 1, i32 2, i32 3> */
    llvm::Value *ProgramIndexVector(bool is32bits = true);
    llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);
    /** Given a string, create an anonymous global variable to hold its
        value and return the pointer to the string. */
--- a/decl.cpp
+++ b/decl.cpp
@@ -168,6 +168,13 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
    retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
    if (soaWidth > 0) {
 #if 0  /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */
        if (g->target->getISA() == Target::NVPTX)
        {
            Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target.");
            return NULL;
        }
 #endif
        const StructType *st = CastType<StructType>(retType);
        if (st == NULL) {
@@ -402,6 +409,13 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
            return;
        }
 #if 0 /* NVPTX */
        if (baseType->IsUniformType())
        {
          fprintf(stderr, " detected uniform array of size= %d  array= %s\n" ,arraySize,
              baseType->IsArrayType() ? " true " : " false ");
        }
 #endif
        const Type *arrayType = new ArrayType(baseType, arraySize);
        if (child != NULL) {
            child->InitFromType(arrayType, ds);
@@ -530,9 +544,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
        returnType = returnType->ResolveUnboundVariability(Variability::Varying);
        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
        bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);
        bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
        bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
        if (isExported && isTask) {
--- a/expr.cpp
+++ b/expr.cpp
@@ -7867,6 +7867,12 @@ SizeOfExpr::TypeCheck() {
              "struct type \"%s\".", type->GetString().c_str());
        return NULL;
    }
    if (type != NULL)
      if (g->target->getISA() == Target::NVPTX && type->IsVaryingType())
      {
        Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target.");
        return NULL;
      }
    return this;
 }
@@ -8661,6 +8667,11 @@ NewExpr::TypeCheck() {
        AssertPos(pos, m->errorCount > 0);
        return NULL;
    }
    if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType())
    {
      Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target.");
      return NULL;
    }
    if (CastType<UndefinedStructType>(allocType) != NULL) {
        Error(pos, "Can't dynamically allocate storage for declared "
              "but not defined type \"%s\".", allocType->GetString().c_str());
--- a/func.cpp
+++ b/func.cpp
@@ -47,6 +47,7 @@
 #include <stdio.h>
 #if defined(LLVM_3_1) || defined(LLVM_3_2)
  #include <llvm/Metadata.h>
  #include <llvm/LLVMContext.h>
  #include <llvm/Module.h>
  #include <llvm/Type.h>
@@ -54,6 +55,7 @@
  #include <llvm/Intrinsics.h>
  #include <llvm/DerivedTypes.h>
 #else
  #include <llvm/IR/Metadata.h>
  #include <llvm/IR/LLVMContext.h>
  #include <llvm/IR/Module.h>
  #include <llvm/IR/Type.h>
@@ -128,7 +130,7 @@ Function::Function(Symbol *s, Stmt *c) {
            sym->parentFunction = this;
    }
-    if (type->isTask) {
+    if (type->isTask && g->target->getISA() != Target::NVPTX) {
        threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
        Assert(threadIndexSym);
        threadCountSym = m->symbolTable->LookupVariable("threadCount");
@@ -239,7 +241,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
 #endif
    const FunctionType *type = CastType<FunctionType>(sym->type);
    Assert(type != NULL);
-    if (type->isTask == true) {
+    if (type->isTask == true && g->target->getISA() != Target::NVPTX)  {
        // For tasks, there should always be three parameters: the
        // pointer to the structure that holds all of the arguments, the
        // thread index, and the thread count variables.
@@ -337,6 +339,16 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
            ctx->SetFunctionMask(argIter);
            Assert(++argIter == function->arg_end());
        }
        if (type->isTask == true && g->target->getISA() == Target::NVPTX)
        {
          llvm::NamedMDNode* annotations =
            m->module->getOrInsertNamedMetadata("nvvm.annotations");
          llvm::SmallVector<llvm::Value*, 3> av;
          av.push_back(function);
          av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
          av.push_back(LLVMInt32(1));
          annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
        }
    }
    // Finally, we can generate code for the function
@@ -497,8 +509,23 @@ Function::GenerateIR() {
                llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true);
                llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
                std::string functionName = sym->name;
                if (g->mangleFunctionsWithTarget)
                    functionName += std::string("_") + g->target->GetISAString();
                if (g->target->getISA() == Target::NVPTX)
                {
                  functionName += std::string("___export");  /* add ___export to the end, for ptxcc to recognize it is exported */
 #if 0
                  llvm::NamedMDNode* annotations =
                    m->module->getOrInsertNamedMetadata("nvvm.annotations");
                  llvm::SmallVector<llvm::Value*, 3> av;
                  av.push_back(function);
                  av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
                  av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
                  annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); 
 #endif
                }
                llvm::Function *appFunction =
                    llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
 #if defined(LLVM_3_1)
@@ -538,6 +565,16 @@ Function::GenerateIR() {
                            FATAL("Function verificication failed");
                        }
                    }
                    if (g->target->getISA() == Target::NVPTX)
                    {
                      llvm::NamedMDNode* annotations =
                        m->module->getOrInsertNamedMetadata("nvvm.annotations");
                      llvm::SmallVector<llvm::Value*, 3> av;
                      av.push_back(appFunction);
                      av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
                      av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
                      annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); 
                    }
                }
            }
        }
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -280,6 +280,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
            arch = "arm";
        else
 #endif
         if(!strncmp(isa, "nvptx", 5))
           arch = "nvptx64";
         else
            arch = "x86-64";
    }
@@ -707,6 +710,19 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_maskBitCount = 32;
    }
 #endif
    else if (!strcasecmp(isa, "nvptx")) 
    {
        this->m_isa = Target::NVPTX;
        this->m_cpu = "sm_35";
        this->m_nativeVectorWidth = 32;
        this->m_nativeVectorAlignment = 32;
        this->m_vectorWidth = 1;
        this->m_hasHalf = true;
        this->m_maskingIsFree = true;
        this->m_maskBitCount = 1;
        this->m_hasTranscendentals = false; 
        this->m_hasGather = this->m_hasScatter = false;
    }
    else {
        Error(SourcePos(), "Target \"%s\" is unknown.  Choices are: %s.",
                isa, SupportedTargets());
@@ -784,6 +800,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        // Initialize target-specific "target-feature" attribute.
        if (!m_attributes.empty()) {
            llvm::AttrBuilder attrBuilder;
            if (m_isa != Target::NVPTX)
              attrBuilder.addAttribute("target-cpu", this->m_cpu);
            attrBuilder.addAttribute("target-features", this->m_attributes);
            this->m_tf_attributes = new llvm::AttributeSet(
@@ -839,7 +856,7 @@ Target::SupportedTargets() {
        "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 "
        "avx2-i32x8, avx2-i32x16, avx2-i64x4, "
        "generic-x1, generic-x4, generic-x8, generic-x16, "
-        "generic-x32, generic-x64";
+        "generic-x32, generic-x64, nvptx";
 }
@@ -866,6 +883,8 @@ Target::GetTripleString() const {
            triple.setArchName("i386");
        else if (m_arch == "x86-64")
            triple.setArchName("x86_64");
        else if (m_arch == "nvptx64")
          triple = llvm::Triple("nvptx64", "nvidia", "cuda");
        else
            triple.setArchName(m_arch);
    }
@@ -898,6 +917,8 @@ Target::ISAToString(ISA isa) {
        return "avx2";
    case Target::GENERIC:
        return "generic";
    case Target::NVPTX:
        return "nvptx";
    default:
        FATAL("Unhandled target in ISAToString()");
    }
@@ -936,6 +957,8 @@ Target::ISAToTargetString(ISA isa) {
        return "avx2-i32x8";
    case Target::GENERIC:
        return "generic-4";
    case Target::NVPTX:
        return "nvptx";
    default:
        FATAL("Unhandled target in ISAToTargetString()");
    }
--- a/ispc.h
+++ b/ispc.h
@@ -179,7 +179,7 @@ public:
        flexible/performant of them will apear last in the enumerant.  Note
        also that __best_available_isa() needs to be updated if ISAs are
        added or the enumerant values are reordered.  */
-    enum ISA {
+    enum ISA { NVPTX,
 #ifdef ISPC_ARM_ENABLED
               NEON32, NEON16, NEON8,
 #endif
@@ -606,6 +606,7 @@ struct Globals {
    /** Indicates that alignment in memory allocation routines should be
        forced to have given value. -1 value means natural alignment for the platforms. */
    int forceAlignment;
    std::string PtxString;
 };
 enum {
--- a/main.cpp
+++ b/main.cpp
@@ -320,6 +320,11 @@ int main(int Argc, char *Argv[]) {
    LLVMInitializeARMTargetMC();
 #endif
    LLVMInitializeNVPTXTargetInfo();
    LLVMInitializeNVPTXTarget();
    LLVMInitializeNVPTXAsmPrinter();
    LLVMInitializeNVPTXTargetMC();
    char *file = NULL;
    const char *headerFileName = NULL;
    const char *outFileName = NULL;
--- a/module.cpp
+++ b/module.cpp
@@ -444,6 +444,38 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
        return;
    }
    if (g->target->getISA() == Target::NVPTX && 
 #if 0
        !type->IsConstType()  &&
 #endif
 #if 1
        at != NULL &&
 #endif
        type->IsVaryingType())
    {
      Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
      return;
 #if 0
        int nel = 32;  /* warp-size */
        if (type->IsArrayType())
        {
          const ArrayType *at = CastType<ArrayType>(type);
          /* we must scale # elements by 4, because a thread-block will run 4 warps
           * or 128 threads.
           * ***note-to-me***:please define these value (128threads/4warps)
           * in nvptx-target definition
           * instead of compile-time constants 
           */
          nel *= at->GetElementCount();
          assert (!type->IsSOAType());
          type = new ArrayType(at->GetElementType()->GetAsUniformType(), nel);
        }
        else
          type = new ArrayType(type->GetAsUniformType(), nel);
 #endif
    }
    llvm::Type *llvmType = type->LLVMType(g->ctx);
    if (llvmType == NULL)
        return;
@@ -643,6 +675,21 @@ lCheckExportedParameterTypes(const Type *type, const std::string &name,
    }
 }
 static void
 lCheckTaskParameterTypes(const Type *type, const std::string &name,
                             SourcePos pos) {
  if (g->target->getISA() != Target::NVPTX) 
    return;
  if (lRecursiveCheckValidParamType(type, false) == false) {
    if (CastType<VectorType>(type))
      Error(pos, "Vector-typed parameter \"%s\" is illegal in a task "
          "function with \"nvptx\" target.", name.c_str());
    else
      Error(pos, "Varying parameter \"%s\" is illegal in a task function with \"nvptx\" target.",
          name.c_str());
    }
 }
 /** Given a function type, loop through the function parameters and see if
    any are StructTypes.  If so, issue an error; this is currently broken
@@ -801,7 +848,8 @@ Module::AddFunctionDeclaration(const std::string &name,
 #else // LLVM 3.1 and 3.3+
        function->addFnAttr(llvm::Attribute::AlwaysInline);
 #endif
-    if (functionType->isTask)
+    /* evghenii: fails function verification when "if" executed in nvptx target */
    if (functionType->isTask && g->target->getISA() != Target::NVPTX)
        // This also applies transitively to members I think?
 #if defined(LLVM_3_1)
        function->setDoesNotAlias(1, true);
@@ -822,6 +870,13 @@ Module::AddFunctionDeclaration(const std::string &name,
        Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false)
        Error(pos, "Task-qualified functions must have void return type.");
    if (g->target->getISA() == Target::NVPTX &&
        Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false &&
        functionType->isExported)
    {
        Error(pos, "Export-qualified functions must have void return type with \"nvptx\" target.");
    }
    if (functionType->isExported || functionType->isExternC)
        lCheckForStructParameters(functionType, pos);
@@ -841,6 +896,9 @@ Module::AddFunctionDeclaration(const std::string &name,
        if (functionType->isExported) {
          lCheckExportedParameterTypes(argType, argName, argPos);
        }
        if (functionType->isTask) {
          lCheckTaskParameterTypes(argType, argName, argPos);
        }
        // ISPC assumes that no pointers alias.  (It should be possible to
        // specify when this is not the case, but this should be the
@@ -959,8 +1017,14 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
        const char *fileType = NULL;
        switch (outputType) {
        case Asm:
            if (g->target->getISA() != Target::NVPTX)
            {
              if (strcasecmp(suffix, "s"))
                fileType = "assembly";
            }
            else
              if (strcasecmp(suffix, "ptx"))
                fileType = "assembly";
            break;
        case Bitcode:
            if (strcasecmp(suffix, "bc"))
@@ -1057,6 +1121,11 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) {
    }
    llvm::raw_fd_ostream fos(fd, (fd != 1), false);
    if (g->target->getISA() == Target::NVPTX)
    {
      const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
      module->setDataLayout(dl_string);
    }
    llvm::WriteBitcodeToFile(module, fos);
    return true;
 }
@@ -2095,6 +2164,24 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
            opts.addMacroDef(g->cppArgs[i].substr(2));
        }
    }
    if (g->target->getISA() == Target::NVPTX)
    {
      opts.addMacroDef("__NVPTX__");
      opts.addMacroDef("programIndex=__programIndex()");
      opts.addMacroDef("cif=if");
      opts.addMacroDef("cfor=for");
      opts.addMacroDef("cwhile=while");
      opts.addMacroDef("ccontinue=continue");
      opts.addMacroDef("cdo=do");
      opts.addMacroDef("taskIndex0=__taskIndex0()");
      opts.addMacroDef("taskIndex1=__taskIndex1()");
      opts.addMacroDef("taskIndex2=__taskIndex2()");
      opts.addMacroDef("taskIndex=__taskIndex()");
      opts.addMacroDef("taskCount0=__taskCount0()");
      opts.addMacroDef("taskCount1=__taskCount1()");
      opts.addMacroDef("taskCount2=__taskCount2()");
      opts.addMacroDef("taskCount=__taskCount()");
    }
 #if defined(LLVM_3_1)
    inst.getLangOpts().BCPLComment = 1;
@@ -2540,6 +2627,29 @@ lCreateDispatchModule(std::map<std::string, FunctionTargetVariants> &functions)
    return module;
 }
 static std::string lCBEMangle(const std::string &S) {
  std::string Result;
  for (unsigned i = 0, e = S.size(); i != e; ++i) {
    if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') ||
                     (S[i] == '<' && S[i+1] == '<'))) {
      Result += '_';
      Result += 'A'+(S[i]&15);
      Result += 'A'+((S[i]>>4)&15);
      Result += '_';
      i++;
    } else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
      Result += S[i];
    } else {
      Result += '_';
      Result += 'A'+(S[i]&15);
      Result += 'A'+((S[i]>>4)&15);
      Result += '_';
    }
  }
  return Result;
 }
 int
 Module::CompileAndOutput(const char *srcFile,
@@ -2563,6 +2673,32 @@ Module::CompileAndOutput(const char *srcFile,
        m = new Module(srcFile);
        if (m->CompileFile() == 0) {
            /* NVPTX:
             * for PTX target replace '.' with '_' in all global variables 
             * a PTX identifier name must match [a-zA-Z$_][a-zA-Z$_0-9]*
             */
            if (g->target->getISA() == Target::NVPTX)
            {
              /* mangle global variables names */
              {
                llvm::Module::global_iterator I = m->module->global_begin(), E = m->module->global_end();
                for (; I != E; I++)
                  I->setName(lCBEMangle(I->getName()));
              }
              /* mangle functions names */
              {
                llvm::Module::iterator I = m->module->begin(), E = m->module->end();
                for (; I != E; I++)
                {
                  std::string str = I->getName();
                  if (str.find("operator") != std::string::npos)
                    I->setName(lCBEMangle(str));
                }
              }
            }
            if (outputType == CXX) {
                if (target == NULL || strncmp(target, "generic-", 8) != 0) {
                    Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" "
@@ -2765,4 +2901,5 @@ Module::CompileAndOutput(const char *srcFile,
        return errorCount > 0;
    }
    return true;
 }
--- a/20
+++ b/20
@@ -0,0 +1,20 @@
 #!/bin/sh
 PATH=$ISPC_HOME/examples_ptx/ptxcc:$ISPC_HOME/examples_ptx/ptxgen:$PATH
 PTXCC=ptxcc
 ARGS=${@:2}
 if [ "$NVVM" == "1" ];
 then
  LLVM32=$HOME/usr/local/llvm/bin-3.2
  LLVMDIS=$LLVM32/bin/llvm-dis
  PTXGEN=$ISPC_HOME/examples_ptx/ptxgen/ptxgen
  $($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll > $1.ptx) && \
  $($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \
  $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
 else
  $($PTXCC $1 -o $1.o -Xnvcc="-G") && \
  $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
 fi
--- a/opt.cpp
+++ b/opt.cpp
@@ -133,6 +133,7 @@ static llvm::Pass *CreateDebugPass(char * output);
 static llvm::Pass *CreateReplaceStdlibShiftPass();
 static llvm::Pass *CreateFixBooleanSelectPass();
 static llvm::Pass *CreatePromoteLocalToPrivatePass();
 #define DEBUG_START_PASS(NAME)                                 \
    if (g->debugPrint &&                                       \
@@ -496,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) {
        // run absolutely no optimizations, since the front-end needs us to
        // take the various __pseudo_* functions it has emitted and turn
        // them into something that can actually execute.
        if (g->opt.disableGatherScatterOptimizations == false &&
            g->target->getVectorWidth() > 1) 
          optPM.add(CreateImproveMemoryOpsPass(), 100);
        if (g->opt.disableHandlePseudoMemoryOps == false)
            optPM.add(CreateReplacePseudoMemoryOpsPass());
@@ -519,6 +524,8 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::initializeInstrumentation(*registry);
        llvm::initializeTarget(*registry);
        if (g->target->getISA() == Target::NVPTX)
          optPM.add(CreatePromoteLocalToPrivatePass());
        optPM.add(llvm::createGlobalDCEPass(), 185);
        // Setup to use LLVM default AliasAnalysis
@@ -577,7 +584,10 @@ Optimize(llvm::Module *module, int optLevel) {
        optPM.add(llvm::createGlobalOptimizerPass());
        optPM.add(llvm::createReassociatePass());
        optPM.add(llvm::createIPConstantPropagationPass());
        if (g->target->getISA() != Target::NVPTX)
          optPM.add(CreateReplaceStdlibShiftPass(),229);
        optPM.add(llvm::createDeadArgEliminationPass(),230);
        optPM.add(llvm::createInstructionCombiningPass());
        optPM.add(llvm::createCFGSimplificationPass());
@@ -689,6 +699,111 @@ Optimize(llvm::Module *module, int optLevel) {
        // Should be the last
        optPM.add(CreateFixBooleanSelectPass(), 400);
        if (g->target->getISA() == Target::NVPTX)
        {
          optPM.add(llvm::createGlobalDCEPass());
          optPM.add(llvm::createTypeBasedAliasAnalysisPass());
          optPM.add(llvm::createBasicAliasAnalysisPass());
          optPM.add(llvm::createCFGSimplificationPass());
          // Here clang has an experimental pass SROAPass instead of
          // ScalarReplAggregatesPass. We should add it in the future.
          optPM.add(llvm::createScalarReplAggregatesPass());
          optPM.add(llvm::createEarlyCSEPass());
          optPM.add(llvm::createLowerExpectIntrinsicPass());
          optPM.add(llvm::createTypeBasedAliasAnalysisPass());
          optPM.add(llvm::createBasicAliasAnalysisPass());
          // Early optimizations to try to reduce the total amount of code to
          // work with if we can
          optPM.add(llvm::createReassociatePass());
          optPM.add(llvm::createConstantPropagationPass());
          optPM.add(llvm::createDeadInstEliminationPass());
          optPM.add(llvm::createCFGSimplificationPass());
          optPM.add(llvm::createPromoteMemoryToRegisterPass());
          optPM.add(llvm::createAggressiveDCEPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createDeadInstEliminationPass());
          // On to more serious optimizations
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createCFGSimplificationPass());
          optPM.add(llvm::createPromoteMemoryToRegisterPass());
          optPM.add(llvm::createGlobalOptimizerPass());
          optPM.add(llvm::createReassociatePass());
          optPM.add(llvm::createIPConstantPropagationPass());
          optPM.add(llvm::createDeadArgEliminationPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createCFGSimplificationPass());
          optPM.add(llvm::createPruneEHPass());
          optPM.add(llvm::createFunctionAttrsPass());
          optPM.add(llvm::createFunctionInliningPass());
          optPM.add(llvm::createConstantPropagationPass());
          optPM.add(llvm::createDeadInstEliminationPass());
          optPM.add(llvm::createCFGSimplificationPass());
          optPM.add(llvm::createArgumentPromotionPass());
 #if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3)
          // Starting from 3.4 this functionality was moved to
          // InstructionCombiningPass. See r184459 for details.
          optPM.add(llvm::createSimplifyLibCallsPass());
 #endif
          optPM.add(llvm::createAggressiveDCEPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createJumpThreadingPass());
          optPM.add(llvm::createCFGSimplificationPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createTailCallEliminationPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createFunctionInliningPass());
          optPM.add(llvm::createConstantPropagationPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createIPSCCPPass());
          optPM.add(llvm::createDeadArgEliminationPass());
          optPM.add(llvm::createAggressiveDCEPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createCFGSimplificationPass());
          optPM.add(llvm::createFunctionInliningPass());
          optPM.add(llvm::createArgumentPromotionPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createCFGSimplificationPass());
          optPM.add(llvm::createReassociatePass());
          optPM.add(llvm::createLoopRotatePass());
          optPM.add(llvm::createLICMPass());
 //          optPM.add(llvm::createLoopUnswitchPass(false));
 #if 1
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createIndVarSimplifyPass());
          optPM.add(llvm::createLoopIdiomPass());
          optPM.add(llvm::createLoopDeletionPass());
          optPM.add(llvm::createLoopUnrollPass());
          optPM.add(llvm::createGVNPass());
          optPM.add(llvm::createMemCpyOptPass());
          optPM.add(llvm::createSCCPPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createJumpThreadingPass());
          optPM.add(llvm::createCorrelatedValuePropagationPass());
          optPM.add(llvm::createDeadStoreEliminationPass());
          optPM.add(llvm::createAggressiveDCEPass());
          optPM.add(llvm::createCFGSimplificationPass());
          optPM.add(llvm::createInstructionCombiningPass());
          optPM.add(llvm::createFunctionInliningPass());
          optPM.add(llvm::createAggressiveDCEPass());
          optPM.add(llvm::createStripDeadPrototypesPass());
          optPM.add(llvm::createGlobalDCEPass());
          optPM.add(llvm::createConstantMergePass());
 #endif
        }
    }
    // Finish up by making sure we didn't mess anything up in the IR along
@@ -5267,4 +5382,63 @@ CreateFixBooleanSelectPass() {
    return new FixBooleanSelectPass();
 }
 ///////////////////////////////////////////////////////////////////////////////
 // Detect addrspace(3)
 ///////////////////////////////////////////////////////////////////////////////
 class PromoteLocalToPrivatePass: public llvm::BasicBlockPass
 {
  public:
    static char ID; // Pass identification, replacement for typeid
    PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
    bool runOnBasicBlock(llvm::BasicBlock &BB);
 };
 char PromoteLocalToPrivatePass::ID = 0;
 bool 
 PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB)
 {
  std::vector<llvm::AllocaInst*> Allocas;
  bool modifiedAny  = false;
  llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
  // Find allocas that are safe to promote, by looking at all instructions in
  // the entry node
  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
  {
    llvm::Instruction *inst = &*I;
    if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
    {
      llvm::Function *func = ci->getCalledFunction();
      if (cvtFunc && (cvtFunc == func))
      {
 #if 0
        fprintf(stderr , "--found cvt-- name= %s \n",
            I->getName().str().c_str());
 #endif
        llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
        assert(alloca != NULL);
 #if 0
        const int align = 8; // g->target->getNativeVectorAlignment();
        alloca->setAlignment(align);
 #endif
        ci->replaceAllUsesWith(alloca);
        modifiedAny = true;
      }
    }
  }
  return modifiedAny;
 }
 static llvm::Pass *
 CreatePromoteLocalToPrivatePass() {
    return new PromoteLocalToPrivatePass();
 }
--- a/ptxtestcc.sh
+++ b/ptxtestcc.sh
@@ -0,0 +1,14 @@
 #!/bin/sh
 LLC=$HOME/usr/local/llvm/bin-trunk/bin/llc
 DIS=$HOME/usr/local/llvm/bin-3.2/bin/llvm-dis
 ISPC=ispc
 PTXCC=ptxcc
 PTXGEN=~/ptxgen
 $(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$LLC -march=nvptx64 -mcpu=sm_35 -o $1.ptx) && \
 #$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$DIS -o $1_32_ptx.ll && $PTXGEN $1_32_ptx.ll > $1.ptx) && \
 $($PTXCC $1.ptx  -Xptxas=-v -o $1.ptx.o) && \
 nvcc -o test_nvptx test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.ptx.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt -DTEST_SIG=$2
--- a/run_tests.py
+++ b/run_tests.py
@@ -204,6 +204,8 @@ def run_test(testname):
            return (1, 0)
        else:
            global is_generic_target
            global is_nvptx_target
            global is_nvptx_nvvm
            if is_windows:
                if is_generic_target:
                    obj_name = "%s.cpp" % os.path.basename(filename)
@@ -218,6 +220,13 @@ def run_test(testname):
            else:
                if is_generic_target:
                    obj_name = "%s.cpp" % testname
                elif is_nvptx_target:
                  if os.environ.get("NVVM") == "1":
                    is_nvptx_nvvm = True
                    obj_name = "%s.bc" % testname
                  else:
                    obj_name = "%s.ptx" % testname
                    is_nvptx_nvvm = False
                else:
                    obj_name = "%s.o" % testname
                exe_name = "%s.run" % testname
@@ -248,13 +257,32 @@ def run_test(testname):
                    cc_cmd += ' -Wl,-no_pie'
                if should_fail:
                    cc_cmd += " -DEXPECT_FAILURE"
                if is_nvptx_target:
                  nvptxcc_exe = "nvptxcc"
                  nvptxcc_exe_rel = add_prefix(nvptxcc_exe)
                  cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \
                      (nvptxcc_exe_rel, obj_name, match, exe_name)
-            ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \
+            ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \
                       (filename, obj_name, options.arch, options.target)
            if options.no_opt:
                ispc_cmd += " -O0" 
            if is_generic_target:
                ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file)
            if is_nvptx_target:
                filename4ptx = filename+".ptx.parsed_ispc"
                grep_cmd = "grep -v 'export uniform int width' %s > %s " % \
                    (filename, filename4ptx)
                if options.verbose:
                  print "Grepping: %s" % grep_cmd
                sp = subprocess.Popen(grep_cmd, shell=True)
                sp.communicate()
                if is_nvptx_nvvm:
                  ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-llvm --target=%s" % \
                         (filename4ptx, obj_name, options.target)
                else:
                  ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \
                         (filename4ptx, obj_name, options.target)
        # compile the ispc code, make the executable, and run it...
        (compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd], 
@@ -269,7 +297,7 @@ def run_test(testname):
                    basename = os.path.basename(filename)
                    os.unlink("%s.pdb" % basename)
                    os.unlink("%s.ilk" % basename)
-            os.unlink(obj_name)
+#            os.unlink(obj_name)
        except:
            None
@@ -290,6 +318,7 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test
    ispc_exe = glob_var[3]
    global is_generic_target
    is_generic_target = glob_var[4]
    global is_nvptx_target
    global run_tests_log
    run_tests_log = glob_var[5]    
@@ -505,6 +534,8 @@ def run_tests(options1, args, print_version):
    if options.target == 'neon':
        options.arch = 'arm'
    if options.target == "nvptx":
        options.arch = "nvptx64"
    # use relative path to not depend on host directory, which may possibly
    # have white spaces and unicode characters.
@@ -531,8 +562,10 @@ def run_tests(options1, args, print_version):
    ispc_exe += " " + options.ispc_flags
    global is_generic_target
    global is_nvptx_target
    is_generic_target = (options.target.find("generic-") != -1 and
                     options.target != "generic-1" and options.target != "generic-x1")
    is_nvptx_target = (options.target.find("nvptx") != -1)
    if is_generic_target and options.include_file == None:
        if options.target == "generic-4" or options.target == "generic-x4":
            error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2)
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -57,6 +57,31 @@
  #error Unknown value of ISPC_MASK_BITS
 #endif
 ///////////////////////////////////////////////////////////////////////////
 // CUDA Specific primitives
 //
 /***************/
 __declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); }
 __declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); }
 __declspec(safe,cost0) static inline uniform int __warpIndex()    { return __warp_index();    }
 /***************/
 __declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); }
 __declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); }
 __declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); }
 __declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); }
 /***************/
 __declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); }
 __declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); }
 __declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); }
 __declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); }
 ///////////////////////////////////////////////////////////////////////////
 // Low level primitives
@@ -464,6 +489,9 @@ __declspec(safe)
 static inline uniform int popcnt(bool v) {
    // As with any() and all(), only count across the active lanes
 #if (ISPC_MASK_BITS == 1)
    if (__is_nvptx_target)
      return __popcnt_int64(__movmsk_ptx(v & __mask));
    else
      return __popcnt_int64(__movmsk(v & __mask));
 #else
    return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
@@ -1226,6 +1254,11 @@ packed_store_active(uniform int a[], int vals) {
    return __packed_store_active(a, vals, (IntMaskType)__mask);
 }
 static inline uniform int 
 packed_store_active(bool active, uniform int a[], int vals) {
    return __packed_store_active(a, vals, (IntMaskType)(-(int)active));
 }
 static inline uniform int
 packed_store_active2(uniform int a[], int vals) {
    return __packed_store_active2(a, vals, (IntMaskType)__mask);
@@ -1236,6 +1269,9 @@ packed_store_active2(uniform int a[], int vals) {
 // System information
 static inline uniform int num_cores() {
  if (__is_nvptx_target)
    return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx)
  else
    return __num_cores();
 }
@@ -1783,7 +1819,7 @@ static inline void memory_barrier() {
    __memory_barrier();
 }
-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE)                        \
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC)                        \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    return ret;                                                         \
@@ -1794,6 +1830,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
    return ret;                                                         \
 }                                                                       \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
  if (__is_nvptx_target) {                                            \
    TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask);      \
    return ret;                                                         \
  } else {    \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
    TA ret;                                                             \
@@ -1805,9 +1845,14 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
    }                                                                   \
    return ret;                                                         \
  } \
 }                                                                       \
-#define DEFINE_ATOMIC_SWAP(TA,TB)                                       \
+#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC)                \
 static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
  if (__is_nvptx_target) {                                            \
    TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask);      \
    return ret;                                                         \
  } else {    \
    uniform int i = 0;                                                  \
    TA ret[programCount];                                               \
    TA memVal;                                                          \
@@ -1839,12 +1884,17 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
    ret[lastSwap] = memVal;                                             \
    return ret[programIndex];                                           \
  }\
 }                                                                       \
 static inline uniform TA atomic_swap_global(uniform TA * uniform ptr,   \
                                            uniform TA value) {         \
    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value);   \
    return ret;                                                         \
 }                                                                       \
 static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
  if (__is_nvptx_target) {                                            \
    TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask);      \
    return ret;                                                         \
  } else {    \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
    TA ret;                                                             \
@@ -1856,8 +1906,9 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
    }                                                                   \
    return ret;                                                         \
  }\
 }                                                                       \
-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB)                          \
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC)                          \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value);                            \
    TA ret;                                                             \
@@ -1872,6 +1923,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
 }                                                                       \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
                                       TA value) {                      \
  if (__is_nvptx_target) {                                            \
    TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask);      \
    return ret;                                                         \
  } else {    \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
    TA ret;                                                             \
@@ -1882,57 +1937,58 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
    return ret;                                                         \
  } \
 }
-DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
-DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
-DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
-DEFINE_ATOMIC_SWAP(int32,int32)
+DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64)
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
-DEFINE_ATOMIC_SWAP(unsigned int32,int32)
+DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64)
-DEFINE_ATOMIC_SWAP(float,float)
+DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64)
-DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
-DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
-DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
-DEFINE_ATOMIC_SWAP(int64,int64)
+DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64)
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
-DEFINE_ATOMIC_SWAP(unsigned int64,int64)
+DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64)
-DEFINE_ATOMIC_SWAP(double,double)
+DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64)
 #undef DEFINE_ATOMIC_OP
 #undef DEFINE_ATOMIC_MINMAX_OP
 #undef DEFINE_ATOMIC_SWAP
-#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
+#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC)                           \
 static inline uniform TA atomic_compare_exchange_global(               \
         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
    uniform TA ret =                                                    \
@@ -1947,6 +2003,10 @@ static inline TA atomic_compare_exchange_global(                           \
 } \
 static inline TA atomic_compare_exchange_global(               \
         uniform TA * varying ptr, TA oldval, TA newval) { \
  if (__is_nvptx_target) {                                            \
    TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask);      \
    return ret;                                                         \
  } else {    \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
    TA ret;                                                             \
@@ -1958,14 +2018,15 @@ static inline TA atomic_compare_exchange_global(               \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
    return ret;                                                         \
  } \
 }
-ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
+ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64)
-ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
+ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64)
-ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
+ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64)
-ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
+ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64)
-ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
+ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64)
-ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)
+ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64)
 #undef ATOMIC_DECL_CMPXCHG
@@ -2032,12 +2093,20 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value)
 }                                                                      \
 static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) {    \
    TYPE ret;                                                          \
  if (__is_nvptx_target) {                                            \
    foreach_active (i) {                                             \
        uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \
        ret  = insert(ret, i, *ptr);                                \
        *ptr = OPFUNC(*ptr, extract(value, i));                \
    }                                                                  \
  } else {    \
    uniform TYPE * uniform ptrs[programCount];                         \
    ptrs[programIndex] = p;                                            \
    foreach_active (i) {                                             \
        ret = insert(ret, i, *ptrs[i]);                                \
        *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i));                \
    }                                                                  \
  } \
    return ret;                                                        \
 }
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -142,6 +142,62 @@ lHasUnsizedArrays(const Type *type) {
        return lHasUnsizedArrays(at->GetElementType());
 }
 static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos, const bool variable = false)
 {
  if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) 
    return value;
  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
  const int addressSpace = pt->getAddressSpace();
  if (addressSpace != 3 && addressSpace != 4) 
    return value;
  llvm::Type *elTy = pt->getElementType();
  /* convert elTy addrspace(3)* to i64* addrspace(3)* */
  llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace);
  value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1");
  /* convert i64* addrspace(3) to i64* */
  llvm::Function *__cvt2gen = m->module->getFunction(
      addressSpace == 3 ? (variable ? "__cvt_loc2gen_var" : "__cvt_loc2gen") : "__cvt_const2gen");
  std::vector<llvm::Value *> __cvt2gen_args;
  __cvt2gen_args.push_back(value);
  value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, variable ? "gep2gen_cvt_var" : "gep2gen_cvt", ctx->GetCurrentBasicBlock());
  /* compute offset */
  if (addressSpace == 3)
  {
    assert(elTy->isArrayTy());
    const int numElTot = elTy->getArrayNumElements();
    const int numEl    = numElTot/4;
 #if 0
    fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl);
 #endif
    llvm::ArrayType *arrTy = llvm::dyn_cast<llvm::ArrayType>(pt->getArrayElementType());
    assert(arrTy != NULL);
    llvm::Type *arrElTy = arrTy->getElementType();
 #if 0
    if (arrElTy->isArrayTy())
      Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array");
 #endif
    /* convert i64* to errElTy* */
    llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0);
    value  = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2");
    llvm::Function *func_warp_index    = m->module->getFunction("__warp_index");
    llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector<llvm::Value*>(),  "gep2gen_warp_index");
    llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset");
    value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock());
  }
  /* convert arrElTy* to elTy* */
  llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0);
  value  = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3");
  return value;
 }
 void
 DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
@@ -205,7 +261,22 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
            return;
        }
        if (sym->storageClass == SC_STATIC) {
            if (g->target->getISA() == Target::NVPTX && !sym->type->IsConstType())
                PerformanceWarning(sym->pos, 
                    "Non-constant static variable ""\"%s\" is stored in __global address sace with ""\"nvptx\" target.",
                    sym->name.c_str());
            if (g->target->getISA() == Target::NVPTX && sym->type->IsVaryingType())
                PerformanceWarning(sym->pos, 
                    "\"const static varying\" variable ""\"%s\" is stored in __global address space with ""\"nvptx\" target.",
                    sym->name.c_str());
            if (g->target->getISA() == Target::NVPTX && sym->type->IsUniformType())
                PerformanceWarning(sym->pos, 
                    "\"const static uniform\" variable ""\"%s\" is stored in __constant address space with ""\"nvptx\" target.",
                    sym->name.c_str());
            // For static variables, we need a compile-time constant value
            // for its initializer; if there's no initializer, we use a
            // zero value.
@@ -233,19 +304,97 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
            if (cinit == NULL)
                cinit = llvm::Constant::getNullValue(llvmType);
            int addressSpace = 0;
            if (g->target->getISA() == Target::NVPTX &&
                sym->type->IsConstType() &&
                sym->type->IsUniformType())
              addressSpace = 4;
            // Allocate space for the static variable in global scope, so
            // that it persists across function calls
            sym->storagePtr =
                new llvm::GlobalVariable(*m->module, llvmType,
                                         sym->type->IsConstType(),
                                         llvm::GlobalValue::InternalLinkage, cinit,
-                                         llvm::Twine("static.") +
+                                         llvm::Twine("static_") +
                                         llvm::Twine(sym->pos.first_line) +
-                                         llvm::Twine(".") + sym->name.c_str());
+                                         llvm::Twine("_") + sym->name.c_str(),
                                         NULL,
                                         llvm::GlobalVariable::NotThreadLocal,
                                         addressSpace);
            sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos);
            // Tell the FunctionEmitContext about the variable
            ctx->EmitVariableDebugInfo(sym);
        }
-        else {
+        else if ((sym->type->IsUniformType() || sym->type->IsSOAType()) &&
          /* NVPTX:
           * only non-constant uniform data types are stored in shared memory 
           * constant uniform are automatically promoted to varying 
           */
           !sym->type->IsConstType() &&
 #if 1     
           sym->type->IsArrayType() &&
 #endif
           g->target->getISA() == Target::NVPTX)
          {
              PerformanceWarning(sym->pos,
                  "Non-constant \"uniform\" data types might be slow with \"nvptx\" target. "
                  "Unless data sharing between program instances is desired, try \"const [static] uniform\", \"varying\" or \"uniform new uniform \"+\"delete\" if possible.");
              /* with __shared__ memory everything must be an array */
              int nel = 4;
              ArrayType *nat;
              bool variable = true;
              if (sym->type->IsArrayType())
              {
                const ArrayType *at = CastType<ArrayType>(sym->type);
                /* we must scale # elements by 4, because a thread-block will run 4 warps
                 * or 128 threads.
                 * ***note-to-me***:please define these value (128threads/4warps)
                 * in nvptx-target definition
                 * instead of compile-time constants 
                 */
                nel *= at->GetElementCount();
                if (sym->type->IsSOAType())
                  nel *= sym->type->GetSOAWidth();
                nat = new ArrayType(at->GetElementType(), nel);
                variable = false;
              }
              else
                nat = new ArrayType(sym->type, nel);
              llvm::Type *llvmTypeUn = nat->LLVMType(g->ctx);
              llvm::Constant *cinit = llvm::UndefValue::get(llvmTypeUn);
              sym->storagePtr =
                new llvm::GlobalVariable(*m->module, llvmTypeUn,
                    sym->type->IsConstType(),
                    llvm::GlobalValue::InternalLinkage, 
                    cinit,
                    llvm::Twine("local_") + 
                    llvm::Twine(sym->pos.first_line) +
                    llvm::Twine("_") + sym->name.c_str(),
                    NULL,
                    llvm::GlobalVariable::NotThreadLocal,
                    /*AddressSpace=*/3);
              sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos, variable);
              llvm::PointerType *ptrTy = llvm::PointerType::get(sym->type->LLVMType(g->ctx),0);
              sym->storagePtr = ctx->BitCastInst(sym->storagePtr, ptrTy, "uniform_decl");
              // Tell the FunctionEmitContext about the variable; must do
              // this before the initializer stuff.
              ctx->EmitVariableDebugInfo(sym);
              if (initExpr == 0 && sym->type->IsConstType())
                Error(sym->pos, "Missing initializer for const variable "
                    "\"%s\".", sym->name.c_str());
              // And then get it initialized...
              sym->parentFunction = ctx->GetFunction();
              InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
          }
          else
          {
            // For non-static variables, allocate storage on the stack
            sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str());
@@ -253,6 +402,10 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
            // this before the initializer stuff.
            ctx->EmitVariableDebugInfo(sym);
            if (initExpr == 0 && sym->type->IsConstType())
              Error(sym->pos, "Missing initializer for const variable "
                  "\"%s\".", sym->name.c_str());
            // And then get it initialized...
            sym->parentFunction = ctx->GetFunction();
            InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
@@ -415,6 +568,19 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
    if (testValue == NULL)
        return;
 #if 0
    if (!isUniform && g->target->getISA() == Target::NVPTX)
    {
      /* With "nvptx" target, SIMT hardware takes care of non-uniform 
       * control flow. We trick ISPC to generate uniform control flow.
       */
      testValue = ctx->ExtractInst(testValue, 0);
      isUniform = true;
    }
 #endif
    if (isUniform) {
        ctx->StartUniformIf();
        if (doAllCheck)
@@ -696,7 +862,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
    // Do any of the program instances want to run the 'true'
    // block?  If not, jump ahead to bNext.
 #if 1
    llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask());
 #else
    llvm::Value *maskAnyTrueQ = ctx->ExtractInst(ctx->GetFullMask(),0);
 #endif
    ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ);
    // Emit statements for true
@@ -713,7 +883,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
    // Similarly, check to see if any of the instances want to
    // run the 'false' block...
 #if 1
    llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask());
 #else
    llvm::Value *maskAnyFalseQ = ctx->ExtractInst(ctx->GetFullMask(),0);
 #endif
    ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ);
    // Emit code for false
@@ -1273,7 +1447,10 @@ static llvm::Value *
 lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
                      llvm::Value *uniformCounterPtr,
                      llvm::Value *varyingCounterPtr,
-                      const std::vector<int> &spans) {
+                      const std::vector<int> &spans) 
 {
  if (g->target->getISA() != Target::NVPTX)
  {
    // Smear the uniform counter value out to be varying
    llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
    llvm::Value *smearCounter = ctx->BroadcastValue(
@@ -1307,6 +1484,93 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
    ctx->StoreInst(varyingCounter, varyingCounterPtr);
    return varyingCounter;
  }
  else /* NVPTX == true */
  {
    // Smear the uniform counter value out to be varying
    llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
    llvm::Value *smearCounter = ctx->BroadcastValue(
        counter, LLVMTypes::Int32VectorType, "smear_counter");
    // Figure out the offsets; this is a little bit tricky.  As an example,
    // consider a 2D tiled foreach loop, where we're running 8-wide and
    // where the inner dimension has a stride of 4 and the outer dimension
    // has a stride of 2.  For the inner dimension, we want the offsets
    // (0,1,2,3,0,1,2,3), and for the outer dimension we want
    // (0,0,0,0,1,1,1,1).
    int32_t delta[ISPC_MAX_NVEC];
    const int vecWidth = 32; 
    std::vector<llvm::Constant*> constDeltaList;
    for (int i = 0; i < vecWidth; ++i) 
    {
      int d = i;
      // First, account for the effect of any dimensions at deeper
      // nesting levels than the current one.
      int prevDimSpanCount = 1;
      for (int j = dim; j < nDims-1; ++j)
        prevDimSpanCount *= spans[j+1];
      d /= prevDimSpanCount;
      // And now with what's left, figure out our own offset
      delta[i] = d % spans[dim];
      constDeltaList.push_back(LLVMInt8(delta[i]));
    }
    llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32);
 //    llvm::PointerType::get(ArrayDelta, 4); /* constant memory */
    llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable(
        /*Module=*/*m->module,
        /*Type=*/ArrayDelta,
        /*isConstant=*/true,
        /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
        /*Initializer=*/0, // has initializer, specified below
        /*Name=*/"constDeltaForeach");
 #if 0
        /*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal,
        /*unsigned AddressSpace=*/4 /*constant*/);
 #endif
    llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList);
    globalDelta->setInitializer(constDelta);
    llvm::Function *func_program_index = m->module->getFunction("__program_index");
    llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__programIndex");
    std::vector<llvm::Value*> ptr_arrayidx_indices;
    ptr_arrayidx_indices.push_back(LLVMInt32(0));
    ptr_arrayidx_indices.push_back(laneIdx);
 #if 1
    llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock());
    llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock());
    llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type);
    llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1);
    llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2);
    llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create(
 //        llvm::UndefValue(LLVMInt32Vector),
        const_packed_41,
        int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock());
 #endif
    // Add the deltas to compute the varying counter values; store the
    // result to memory and then return it directly as well.
 #if 0
    llvm::Value *varyingCounter =
        ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
                            LLVMInt32Vector(delta), "iter_val");
 #else
    llvm::Value *varyingCounter =
        ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
                            packed_43, "iter_val");
 #endif
    ctx->StoreInst(varyingCounter, varyingCounterPtr);
    return varyingCounter;
  }
 }
 /** Returns the integer log2 of the given integer. */
@@ -1394,7 +1658,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
    std::vector<int> span(nDims, 0);
-    lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]);
+    const int vectorWidth = 
      g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth();
    lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]);
    for (int i = 0; i < nDims; ++i) {
      // Basic blocks that we'll fill in later with the looping logic for
@@ -1993,7 +2259,8 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
        // math...)
        // Get the "program index" vector value
-        llvm::Value *programIndex = ctx->ProgramIndexVector();
+        llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ?
          ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector();
        // And smear the current lane out to a vector
        llvm::Value *firstSet32 =
@@ -2189,10 +2456,19 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const {
        // And load the corresponding element value from the temporary
        // memory storing the value of the varying expr.
        llvm::Value *uniqueValue;
        if (g->target->getISA() != Target::NVPTX)
        {
          llvm::Value *uniqueValuePtr =
            ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType,
                "unique_index_ptr");
-        llvm::Value *uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
+          uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
        }
        else  /* in case of PTX target, use __shfl PTX intrinsics via __insert/__extract function */
        {
          llvm::Value *firstSet32 = ctx->TruncInst(firstSet, LLVMTypes::Int32Type);
          uniqueValue = ctx->Extract(exprValue, firstSet32);
        }
        // If it's a varying pointer type, need to convert from the int
        // type we store in the vector to the actual pointer type
@@ -3100,7 +3376,8 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
    }
    // Now we can emit code to call __do_print()
-    llvm::Function *printFunc = m->module->getFunction("__do_print");
+    llvm::Function *printFunc = g->target->getISA() != Target::NVPTX ?
      m->module->getFunction("__do_print") : m->module->getFunction("__do_print_nvptx");
    AssertPos(pos, printFunc);
    llvm::Value *mask = ctx->GetFullMask();
--- a/test_static_cuda.cpp
+++ b/test_static_cuda.cpp
@@ -0,0 +1,440 @@
 /*
  Copyright (c) 2010-2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #if defined(_WIN32) || defined(_WIN64)
 #define ISPC_IS_WINDOWS
 #elif defined(__linux__)
 #define ISPC_IS_LINUX
 #elif defined(__APPLE__)
 #define ISPC_IS_APPLE
 #endif
 #ifdef ISPC_IS_WINDOWS
 #include <windows.h>
 #endif // ISPC_IS_WINDOWS
 #include <cassert>
 #include <cstring>
 #include <cstdio>
 #include <cstdint>
 #ifdef ISPC_IS_LINUX
 #include <malloc.h>
 #endif
 /******************************/
 #include <cassert>
 #include <iostream>
 #include <cuda.h>
 #include "drvapi_error_string.h"
 #include "ispc_malloc.h"
 #define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)
 // These are the inline versions for all of the SDK helper functions
 void __checkCudaErrors(CUresult err, const char *file, const int line) {
  if(CUDA_SUCCESS != err) {
    std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
           << getCudaDrvErrorString(err) << "\" from file <" << file
           << ", line " << line << "\n";
    exit(-1);
  }
 }
 /******************************/
 /****  Basic CUDriver API  ****/
 /******************************/
 CUcontext context;
 static void createContext(const int deviceId = 0, const bool verbose = true)
 {
  CUdevice device;
  int devCount;
  checkCudaErrors(cuInit(0));
  checkCudaErrors(cuDeviceGetCount(&devCount));
  assert(devCount > 0);
  checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
  char name[128];
  checkCudaErrors(cuDeviceGetName(name, 128, device));
  if (verbose)
    std::cout << "Using CUDA Device [0]: " << name << "\n";
  int devMajor, devMinor;
  checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
  if (verbose)
    std::cout << "Device Compute Capability: " 
      << devMajor << "." << devMinor << "\n";
  if (devMajor < 2) {
    if (verbose)
      std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
    exit(1); 
  }
  // Create driver context
  checkCudaErrors(cuCtxCreate(&context, 0, device));
 }
 static void destroyContext()
 {
  checkCudaErrors(cuCtxDestroy(context));
 }
 static CUmodule loadModule(
    const char * module,
    const int maxrregcount = 64,
    const char cudadevrt_lib[] = "libcudadevrt.a",
    const size_t log_size = 32768,
    const bool print_log = true
    )
 {
  CUmodule cudaModule;
  // in this branch we use compilation with parameters
  CUlinkState  CUState;
  CUlinkState *lState = &CUState;
  const int nOptions = 8;
  CUjit_option options[nOptions];
  void* optionVals[nOptions];
  float walltime;
  size_t logSize = log_size;
  char error_log[logSize],
       info_log[logSize];
  void *cuOut;
  size_t outSize;
  int myErr = 0;
  // Setup linker options
  // Return walltime from JIT compilation
  options[0] = CU_JIT_WALL_TIME;
  optionVals[0] = (void*) &walltime;
  // Pass a buffer for info messages
  options[1] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[1] = (void*) info_log;
  // Pass the size of the info buffer
  options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[2] = (void*) logSize;
  // Pass a buffer for error message
  options[3] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[3] = (void*) error_log;
  // Pass the size of the error buffer
  options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[4] = (void*) logSize;
  // Make the linker verbose
  options[5] = CU_JIT_LOG_VERBOSE;
  optionVals[5] = (void*) 1;
  // Max # of registers/pthread
  options[6] = CU_JIT_MAX_REGISTERS;
  int jitRegCount = maxrregcount;
  optionVals[6] = (void *)(size_t)jitRegCount;
  // Caching
  options[7] = CU_JIT_CACHE_MODE;
  optionVals[7] = (void *)CU_JIT_CACHE_OPTION_CA;
  // Create a pending linker invocation
  // Create a pending linker invocation
  checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
 #if 0
  if (sizeof(void *)==4)
  {
    // Load the PTX from the string myPtx32
    printf("Loading myPtx32[] program\n");
    // PTX May also be loaded from file, as per below.
    myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
  }
  else
 #endif
  {
    // Load the PTX from the string myPtx (64-bit)
    if (print_log)
      fprintf(stderr, "Loading ptx..\n");
    myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
    myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0); 
    // PTX May also be loaded from file, as per below.
    // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
  }
  // Complete the linker step
  myErr = cuLinkComplete(*lState, &cuOut, &outSize);
  if ( myErr != CUDA_SUCCESS )
  {
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. 
    fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
    assert(0);
  }    
  // Linker walltime and info_log were requested in options above.
 if (print_log)
   fprintf(stderr, "CUDA Link Completed in %fms. Linker Output:\n%s\n",walltime,info_log);
 // Load resulting cuBin into module
 checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
 // Destroy the linker invocation
 checkCudaErrors(cuLinkDestroy(*lState));
 return cudaModule;
 }
 static void unloadModule(CUmodule &cudaModule)
 {
  checkCudaErrors(cuModuleUnload(cudaModule));
 }
 static CUfunction getFunction(CUmodule &cudaModule, const char * function)
 {
  CUfunction cudaFunction;
  checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
  return cudaFunction;
 }
 static CUdeviceptr deviceMalloc(const size_t size)
 {
  CUdeviceptr d_buf;
  checkCudaErrors(cuMemAlloc(&d_buf, size));
  return d_buf;
 }
 static void deviceFree(CUdeviceptr d_buf)
 {
  checkCudaErrors(cuMemFree(d_buf));
 }
 static void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
 {
  checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
 }
 static void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
 {
  checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
 }
 #define deviceLaunch(func,params) \
  checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
 checkCudaErrors( \
    cuLaunchKernel( \
      (func), \
      1,1,1, \
      32, 1, 1, \
      0, NULL, (params), NULL \
      ));
 typedef CUdeviceptr devicePtr;
 /**************/
 #include <vector>
 static std::vector<char> readBinary(const char * filename, const bool print_size = false)
 {
  std::vector<char> buffer;
  FILE *fp = fopen(filename, "rb");
  if (!fp )
  {
    fprintf(stderr, "file %s not found\n", filename);
    assert(0);
  }
  fseek(fp, 0, SEEK_END); 
  const unsigned long long size = ftell(fp);         /*calc the size needed*/
  fseek(fp, 0, SEEK_SET); 
  buffer.resize(size);
  if (fp == NULL){ /*ERROR detection if file == empty*/
    fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);           
    exit(1);
  }
  else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
    fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
    exit(1);
  }
  if (print_size)
    fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
  return buffer;
 }
 static double CUDALaunch(
    void **handlePtr, 
    const char * func_name,
    void **func_args,
    const bool print_log = true,
    const int maxrregcount = 64,
    const char kernel_file[] = "__kernels.ptx",
    const char cudadevrt_lib[] = "libcudadevrt.a",
    const int log_size = 32768)
 {
  fprintf(stderr, " launching kernel: %s \n", func_name);
  const std::vector<char> module_str = readBinary(kernel_file, print_log);
  const char *  module = &module_str[0];
  CUmodule   cudaModule   = loadModule(module, maxrregcount, cudadevrt_lib, log_size, print_log);
  CUfunction cudaFunction = getFunction(cudaModule, func_name);
  deviceLaunch(cudaFunction, func_args);
  checkCudaErrors(cuStreamSynchronize(0));
  unloadModule(cudaModule);
  return 0.0;
 }
 /******************************/
 extern "C" {
 //    extern int width();
    int width() { return 32; }
    extern void f_v(float *result);
    extern void f_f(float *result, float *a);
    extern void f_fu(float *result, float *a, float b);
    extern void f_fi(float *result, float *a, int *b);
    extern void f_du(float *result, double *a, double b);
    extern void f_duf(float *result, double *a, float b);
    extern void f_di(float *result, double *a, int *b);
    extern void result(float *val);
 }
 #if defined(_WIN32) || defined(_WIN64)
 #define ALIGN
 #else
 #define ALIGN __attribute__((aligned(64)))
 #endif
 int main(int argc, char *argv[]) {
    int w = width();
    assert(w <= 64);
    float returned_result[64] ALIGN;
    float vfloat[64] ALIGN;
    double vdouble[64] ALIGN;
    int vint[64] ALIGN;
    int vint2[64] ALIGN;
    const int device = 0;
 #if 0
    const bool verbose = true;
 #else
    const bool verbose = false;
 #endif
    /*******************/
    createContext(device, verbose);
    /*******************/
    devicePtr d_returned_result = deviceMalloc(64*sizeof(float));
    devicePtr d_vfloat          = deviceMalloc(64*sizeof(float));
    devicePtr d_vdouble         = deviceMalloc(64*sizeof(double));
    devicePtr d_vint            = deviceMalloc(64*sizeof(int));
    devicePtr d_vint2           = deviceMalloc(64*sizeof(int));
    for (int i = 0; i < 64; ++i) {
        returned_result[i] = -1e20;
        vfloat[i] = i+1;
        vdouble[i] = i+1;
        vint[i] = 2*(i+1);
        vint2[i] = i+5;
    }
    memcpyH2D(d_returned_result, returned_result, 64*sizeof(float));
    memcpyH2D(d_vfloat         , vfloat,          64*sizeof(float));
    memcpyH2D(d_vdouble        , vdouble,         64*sizeof(double));
    memcpyH2D(d_vint        , vint,         64*sizeof(int));
    memcpyH2D(d_vint2       , vint2,         64*sizeof(int));
    float b = 5.;
    const bool print_log = false;
    const int  nreg = 64;
 #if (TEST_SIG == 0)
    void *args[] = {&d_returned_result};
    CUDALaunch(NULL, "f_v", args, print_log, nreg);
 #elif (TEST_SIG == 1)
    void *args[] = {&d_returned_result, &d_vfloat};
    CUDALaunch(NULL, "f_f", args, print_log, nreg);
 #elif (TEST_SIG == 2)
    void *args[] = {&d_returned_result, &d_vfloat, &b};
    CUDALaunch(NULL, "f_fu", args, print_log, nreg);
 #elif (TEST_SIG == 3)
    void *args[] = {&d_returned_result, &d_vfloat, &vint};
    CUDALaunch(NULL, "f_fi", args, print_log, nreg);
 #elif (TEST_SIG == 4)
    int num = 5;
    void *args[] = {&d_returned_result, &d_vdouble, &num};
    CUDALaunch(NULL, "f_du", args, print_log, nreg);
 #elif (TEST_SIG == 5)
    float num = 5.0f;
    void *args[] = {&d_returned_result, &d_vdouble, &num};
    CUDALaunch(NULL, "f_duf", args, print_log, nreg);
 #elif (TEST_SIG == 6)
    void *args[] = {&d_returned_result, &d_vdouble, &v_int2};
    CUDALaunch(NULL, "f_di", args, print_log, nreg);
 #else
 #error "Unknown or unset TEST_SIG value"
 #endif
    float expected_result[64];
    memset(expected_result, 0, 64*sizeof(float));
    devicePtr d_expected_result = deviceMalloc(64*sizeof(float));
    memcpyH2D(d_expected_result, expected_result, 64*sizeof(float));
    void *res_args[] = {&d_expected_result};
    CUDALaunch(NULL, "result", res_args, print_log, nreg);
    memcpyD2H(expected_result, d_expected_result, 64*sizeof(float));
    memcpyD2H(returned_result, d_returned_result, 64*sizeof(float));
    deviceFree(d_returned_result);
    deviceFree(d_vfloat);
    deviceFree(d_vdouble);
    deviceFree(d_vint);
    deviceFree(d_vint2);
    deviceFree(d_expected_result);
    /*******************/
    destroyContext();
    /*******************/
    int errors = 0;
    for (int i = 0; i < w; ++i) {
        if (returned_result[i] != expected_result[i]) {
 #ifdef EXPECT_FAILURE
            // bingo, failed
            return 1;
 #else
            printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
                   argv[0], i, returned_result[i], returned_result[i],
                   expected_result[i], expected_result[i]);
            ++errors;
 #endif // EXPECT_FAILURE
        }
    }
 #ifdef EXPECT_FAILURE
    // Don't expect to get here
    return 0;
 #else
    return errors > 0;
 #endif
 }
--- a/test_static_nvptx.cpp
+++ b/test_static_nvptx.cpp
@@ -0,0 +1,133 @@
 /*
  Copyright (c) 2010-2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #if defined(_WIN32) || defined(_WIN64)
 #define ISPC_IS_WINDOWS
 #elif defined(__linux__)
 #define ISPC_IS_LINUX
 #elif defined(__APPLE__)
 #define ISPC_IS_APPLE
 #endif
 #ifdef ISPC_IS_WINDOWS
 #include <windows.h>
 #endif // ISPC_IS_WINDOWS
 #include <cassert>
 #include <cstring>
 #include <cstdio>
 #include <stdint.h>
 #ifdef ISPC_IS_LINUX
 #include <malloc.h>
 #endif
 #include "ispc_malloc.h"
 #define N 32
 extern "C" {
    int width() { return N; }
    extern void f_v(float *result);
    extern void f_f(float *result, float *a);
    extern void f_fu(float *result, float *a, float b);
    extern void f_fi(float *result, float *a, int *b);
    extern void f_du(float *result, double *a, double b);
    extern void f_duf(float *result, double *a, float b);
    extern void f_di(float *result, double *a, int *b);
    extern void result(float *val);
 }
 int main(int argc, char *argv[]) {
    int w = width();
    assert(w <= N);
    float *returned_result = new float[N*4];
    float *vfloat = new float[N*4];
    double *vdouble = new double[N*4];
    int *vint = new int[N*4];
    int *vint2 = new int[N*4];
    for (int i = 0; i < N*4; ++i) {
        returned_result[i] = -1e20;
        vfloat[i] = i+1;
        vdouble[i] = i+1;
        vint[i] = 2*(i+1);
        vint2[i] = i+5;
    }
    float b = 5.;
 #if (TEST_SIG == 0)
    f_v(returned_result);
 #elif (TEST_SIG == 1)
    f_f(returned_result, vfloat);
 #elif (TEST_SIG == 2)
    f_fu(returned_result, vfloat, b);
 #elif (TEST_SIG == 3)
    f_fi(returned_result, vfloat, vint);
 #elif (TEST_SIG == 4)
    f_du(returned_result, vdouble, 5.);
 #elif (TEST_SIG == 5)
    f_duf(returned_result, vdouble, 5.f);
 #elif (TEST_SIG == 6)
    f_di(returned_result, vdouble, vint2);
 #else
 #error "Unknown or unset TEST_SIG value"
 #endif
    float *expected_result = new float[N];
    memset(expected_result, 0, N*sizeof(float));
    result(expected_result);
    int errors = 0;
    for (int i = 0; i < w; ++i) {
        if (returned_result[i] != expected_result[i]) 
        {
 #ifdef EXPECT_FAILURE
            // bingo, failed
            return 1;
 #else
            printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
                   argv[0], i, returned_result[i], returned_result[i],
                   expected_result[i], expected_result[i]);
            ++errors;
 #endif // EXPECT_FAILURE
        }
    }
 #ifdef EXPECT_FAILURE
    // Don't expect to get here
    return 0;
 #else
    return errors > 0;
 #endif
 }
--- a/tests/array-mixed-unif-vary-indexing-3.ispc
+++ b/tests/array-mixed-unif-vary-indexing-3.ispc
@@ -5,7 +5,13 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    assert(programCount <= 64);
 #ifdef __NVPTX__
    uniform float  * uniform xarr   =  uniform new uniform float[70*70];
    uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr;
 #define _SHMALLOC
 #else
    uniform float x[70][70];
 #endif
    for (uniform int i = 0; i < 70; ++i)
        for (uniform int j = 0; j < 70; ++j)
            x[i][j] = 2+b-5;
@@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    else
        x[b-1][a-1] = 1;
    RET[programIndex] = x[4][a];
 #ifdef _SHMALLOC
    delete xarr;
 #endif
 }
 export void result(uniform float RET[]) { 
--- a/tests/broadcast.ispc
+++ b/tests/broadcast.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = (programCount == 1) ? 3 : broadcast(a, 2);
+    float b = (programCount == 1) ? 4 : broadcast(a, 2);
    RET[programIndex] = b;
 }
--- a/tests/c-test-64.ispc
+++ b/tests/c-test-64.ispc
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 2;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 5;
+    RET[i+0] = 2;
-    RET[3] = RET[7] = RET[11] = RET[15] = 6;
+    RET[i+1] = 3;
    RET[i+2] = 5;
    RET[i+3] = 6;
  }
 }
--- a/tests/c-test-65.ispc
+++ b/tests/c-test-65.ispc
@@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) {
 export void result(uniform float RET[]) {
    RET[programIndex] = 3;
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
+    for (int i = 0; i < programCount; i += 4)
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    {
      RET[i+0] = 1;
      RET[i+3] = 29;
    }
 }
--- a/tests/c-test-66.ispc
+++ b/tests/c-test-66.ispc
@@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
    RET[programIndex] = 32;
-    RET[2] = RET[6] = RET[10] = RET[14] = 38;
+    for (int i = 0; i < programCount; i += 4)
-    RET[3] = RET[7] = RET[11] = RET[15] = 39;
+    {
      RET[i+2] = 38;
      RET[i+3] = 39;
    }
 }
--- a/tests/cfor-array-struct-gather.ispc
+++ b/tests/cfor-array-struct-gather.ispc
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }
 struct Foo {
-    uniform float x[17];
+    uniform float x[programCount+1];
 };
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    uniform Foo foo;
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo.x[i] = i;
    if ((int)a & 1)
--- a/tests/cfor-gs-double-improve-multidim-1.ispc
+++ b/tests/cfor-gs-double-improve-multidim-1.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 
-    uniform double udx[25][25];
+    uniform double udx[programCount+1][programCount+1];
-    cfor (uniform int i = 0; i < 25; ++i)
+    cfor (uniform int i = 0; i < programCount+1; ++i)
-        cfor (uniform int j = 0; j < 25; ++j)
+        cfor (uniform int j = 0; j < programCount+1; ++j)
            udx[i][j] = 10*i+j;
    int x = 1;
--- a/tests/cfor-gs-improve-multidim-1.ispc
+++ b/tests/cfor-gs-improve-multidim-1.ispc
@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 
-    uniform float udx[20][20];
+    uniform float udx[programCount+1][programCount+1];
-    cfor (uniform int i = 0; i < 20; ++i)
+    cfor (uniform int i = 0; i < programCount+1; ++i)
-        cfor (uniform int j = 0; j < 20; ++j)
+        cfor (uniform int j = 0; j < programCount+1; ++j)
            udx[i][j] = 100*i+j;
    int x = 1;
--- a/tests/cfor-gs-improve-multidim-struct-1.ispc
+++ b/tests/cfor-gs-improve-multidim-struct-1.ispc
@@ -4,19 +4,27 @@ export uniform int width() { return programCount; }
 struct Foo {
-    uniform float udx[25][25];
+    uniform float udx[32][32];
 };
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 
 #ifndef __NVPTX__ 
    uniform Foo f[5];
 #else     /* too much shared memory allocated, nvcc fails to link */
    uniform Foo * uniform f = uniform new uniform Foo[5];
 #define _UNMALLOC
 #endif
    cfor (uniform int i = 0; i < 5; ++i)
-        cfor (uniform int j = 0; j < 25; ++j)
+        cfor (uniform int j = 0; j < 32; ++j)
-            cfor (uniform int k = 0; k < 25; ++k)
+            cfor (uniform int k = 0; k < 32; ++k)
                f[i].udx[j][k] = 1000*i+100*j+k;
    int x = 1;
    RET[programIndex] = f[x+1].udx[b-4][programIndex];
 #ifdef _UNMALLOC
    delete f;
 #endif
 }
 export void result(uniform float RET[]) { RET[programIndex] = 2100 +programIndex; }
--- a/tests/cfor-struct-gather-2.ispc
+++ b/tests/cfor-struct-gather-2.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo[i].f = i*a;
    RET[programIndex] = func(foo, (int)a);
 }
--- a/tests/cfor-struct-gather-3.ispc
+++ b/tests/cfor-struct-gather-3.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo[i].f = i*a;
    RET[programIndex] = func(foo, (int)a);
 }
--- a/tests/cfor-struct-gather.ispc
+++ b/tests/cfor-struct-gather.ispc
@@ -9,9 +9,9 @@ struct Foo {
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i)
+    cfor (i = 0; i < programCount+1; ++i)
        foo[i].f = i*a;
    RET[programIndex] = foo[(int)a].f;
 }
--- a/tests/cfor-struct-test-114.ispc
+++ b/tests/cfor-struct-test-114.ispc
@@ -10,9 +10,9 @@ struct Foo {
 export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) {
    float a = aFOO[programIndex];
    int b = bFOO[programIndex];
-    varying Foo myFoo[17];
+    varying Foo myFoo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i) {
+    cfor (i = 0; i < programCount+1; ++i) {
        myFoo[i].x = i;
        myFoo[i].f = 2*i;
    }
--- a/tests/cfor-test-134.ispc
+++ b/tests/cfor-test-134.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
+    RET[i+0] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    RET[i+1] = 3;
    RET[i+2] = 3;
    RET[i+3] = 29;
  }
 }
--- a/tests/cfor-test-135.ispc
+++ b/tests/cfor-test-135.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
+    RET[i+0] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    RET[i+1] = 3;
    RET[i+2] = 3;
    RET[i+3] = 29;
  }
 }
--- a/tests/cfor-test-136.ispc
+++ b/tests/cfor-test-136.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
+    RET[i+0] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    RET[i+1] = 3;
    RET[i+2] = 3;
    RET[i+3] = 29;
  }
 }
--- a/tests/cfor-test-64.ispc
+++ b/tests/cfor-test-64.ispc
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 2;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 5;
+    RET[i+0] = 2;
-    RET[3] = RET[7] = RET[11] = RET[15] = 6;
+    RET[i+1] = 3;
    RET[i+2] = 5;
    RET[i+3] = 6;
  }
 }
--- a/tests/cfor-test-65.ispc
+++ b/tests/cfor-test-65.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
+    RET[i+0] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    RET[i+1] = 3;
    RET[i+2] = 3;
    RET[i+3] = 29;
  }
 }
--- a/tests/cfor-test-66.ispc
+++ b/tests/cfor-test-66.ispc
@@ -18,8 +18,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 32;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 32;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 38;
+    RET[i+0] = 32;
-    RET[3] = RET[7] = RET[11] = RET[15] = 39;
+    RET[i+1] = 32;
    RET[i+2] = 38;
    RET[i+3] = 39;
  }
 }
--- a/tests/cfor-unif-struct-test-114.ispc
+++ b/tests/cfor-unif-struct-test-114.ispc
@@ -8,9 +8,9 @@ struct Foo {
 };
 export void f_fi(uniform float RET[], uniform float a[], uniform int bFOO[]) {
    int b = bFOO[programIndex];
-    uniform struct Foo myFoo[17];
+    uniform struct Foo myFoo[programCount+1];
    uniform int i;
-    cfor (i = 0; i < 17; ++i) {
+    cfor (i = 0; i < programCount+1; ++i) {
        myFoo[i].x = i;
        myFoo[i].f = 2*i;
    }
--- a/tests/const-fold-1.ispc
+++ b/tests/const-fold-1.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform int x = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
-    static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
+    const static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
    RET[programIndex] = (x == y) ? 1. : 0.;
 }
--- a/tests/const-fold-2.ispc
+++ b/tests/const-fold-2.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform int x = (170 >> 4) % 5;
-    static uniform int y = (170 >> 4) % 5;
+    const static uniform int y = (170 >> 4) % 5;
    RET[programIndex] = (x == y) ? 1. : 0.;
 }
--- a/tests/const-fold-3.ispc
+++ b/tests/const-fold-3.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform int x = (17 < 2) || (6 >= 5) && (20 >= 20);
-    static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
+    const static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
    RET[programIndex] = ((x!=0) == (y!=0)) ? 1. : 0.;
 }
--- a/tests/launch-8.ispc
+++ b/tests/launch-8.ispc
@@ -2,22 +2,23 @@
 export uniform int width() { return programCount; }
-#define N0 10
+#define N0 12
 #define N1 20
 #define N2 50
 static uniform float array[N2][N1][N0];
-task void x(const float f) {
+task void x(const uniform float farray[]) {
    const float f = farray[programIndex];
    uniform int j;
-    assert(taskCount  == (int32)N0*N1*N2);
+    assert(taskCount  == (uniform int32)N0*N1*N2);
-    assert(taskCount0 == (int32)N0);
+    assert(taskCount0 == (uniform int32)N0);
-    assert(taskCount1 == (int32)N1);
+    assert(taskCount1 == (uniform int32)N1);
-    assert(taskCount2 == (int32)N2);
+    assert(taskCount2 == (uniform int32)N2);
-    assert(taskIndex  == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2));
+    assert(taskIndex  == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2));
-    assert(taskIndex0 < (int32)N0);
+    assert(taskIndex0 < (uniform int32)N0);
-    assert(taskIndex1 < (int32)N1);
+    assert(taskIndex1 < (uniform int32)N1);
-    assert(taskIndex2 < (int32)N2);
+    assert(taskIndex2 < (uniform int32)N2);
    const uniform int i0 = taskIndex0;
    const uniform int i1 = taskIndex1;
@@ -30,7 +31,7 @@ task void x(const float f) {
        array[i2][i1][i0] = i;
 }
 export void f_f(uniform float RET[], uniform float fFOO[]) { 
-    float f = fFOO[programIndex];
+    uniform float * uniform f = fFOO;
    launch[N2][N1][N0] x(f);
    sync;
    RET[programIndex] = array[N2-1][N1-1][N0-1];
@@ -38,5 +39,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) {
 export void result(uniform float RET[]) {
-    RET[programIndex] = 9999.000000;
+    RET[programIndex] = 11999.000000;
 }
--- a/tests/launch-9.ispc
+++ b/tests/launch-9.ispc
@@ -2,12 +2,13 @@
 export uniform int width() { return programCount; }
-#define N0 10
+#define N0 12
 #define N1 20
 #define N2 50
 static uniform float array[N2][N1][N0];
-task void x(const float f) {
+task void x(const uniform float farray[]) {
    const float f = farray[programIndex];
    uniform int j;
    assert(taskCount  == (int32)N0*N1*N2);
@@ -30,13 +31,13 @@ task void x(const float f) {
        array[i2][i1][i0] = i;
 }
 export void f_f(uniform float RET[], uniform float fFOO[]) { 
-    float f = fFOO[programIndex];
+    uniform float * uniform f = fFOO;
-    launch[N0,N1,N2] x(f);
+    launch[N2][N1][N0] x(f);
    sync;
    RET[programIndex] = array[N2-1][N1-1][N0-1];
 }
 export void result(uniform float RET[]) {
-    RET[programIndex] = 9999.000000;
+    RET[programIndex] = 11999.000000;
 }
--- a/tests/operators2.ispc
+++ b/tests/operators2.ispc
@@ -1,4 +1,9 @@
 #ifdef __NVPTX__
 uniform int _off[programCount];
 #define off _off[programIndex]
 #else  /* global varying data types are not yet supported with "nvptx" target */
 int off;
 #endif
 export uniform int width() { return programCount; }
@@ -22,11 +27,11 @@ struct S operator/(struct S rr, struct S rv) {
    return c;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    struct S a;
    struct S b;
    struct S d;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    int T = programIndex;
    a.a = aFOO[programIndex];
    b.a = -aFOO[programIndex];
--- a/tests/soa-16.ispc
+++ b/tests/soa-16.ispc
@@ -15,6 +15,16 @@ static void p(uniform float *uniform ptr) {
 }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 #ifdef __NVPTX__  /* soa is converted to shared memory story for now, use smaller amount to check the test */
    soa<4> Point pts[10];
    for (uniform int i = 0; i < 40; ++i) {
        pts[i].x = b*i;
        pts[i].y[0] = 2*b*i;
        pts[i].y[1] = 2*b*i+1;
        pts[i].y[2] = 2*b*i+2;
        pts[i].z = 3*b*i;
    }
 #else
    soa<4> Point pts[30];
    for (uniform int i = 0; i < 120; ++i) {
        pts[i].x = b*i;
@@ -23,6 +33,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
        pts[i].y[2] = 2*b*i+2;
        pts[i].z = 3*b*i;
    }
 #endif
    float a = aFOO[programIndex]; 
    a *= -1;
--- a/tests/soa-17.ispc
+++ b/tests/soa-17.ispc
@@ -16,6 +16,16 @@ static void p(uniform float *uniform ptr) {
 }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 #ifdef __NVPTX__  /* soa is converted to shared memory story for now, use smaller amount to check the test */
    soa<4> Point pts[15];
    for (uniform int i = 0; i < 60; ++i) {
        pts[i].x = b*i;
        pts[i].y[0] = 2*b*i;
        pts[i].y[1] = 2*b*i+1;
        pts[i].y[2] = 2*b*i+2;
        pts[i].z = 3*b*i;
    }
 #else
    soa<4> Point pts[40];
    for (uniform int i = 0; i < 160; ++i) {
        pts[i].x = b*i;
@@ -24,6 +34,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
        pts[i].y[2] = 2*b*i+2;
        pts[i].z = 3*b*i;
    }
 #endif
    float a = aFOO[programIndex]; 
    a *= -1;
--- a/tests/soa-3.ispc
+++ b/tests/soa-3.ispc
@@ -6,6 +6,17 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex]; 
 #ifdef __NVPTX__  /* soa is converted to shared memory story for now, use smaller amount to check the test */
    soa<8> Point pts[4];
 //CO    uniform Point pts[80];
    foreach (i = 0 ... 40) {
        pts[i].x = b*i;
        pts[i].y[0] = 2*b*i;
        pts[i].y[1] = 2*b*i+1;
        pts[i].y[2] = 2*b*i+2;
        pts[i].z = 3*b*i;
    }
 #else
    soa<8> Point pts[10];
 //CO    uniform Point pts[80];
    foreach (i = 0 ... 80) {
@@ -15,6 +26,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
        pts[i].y[2] = 2*b*i+2;
        pts[i].z = 3*b*i;
    }
 #endif
    assert(programCount < 80);
    RET[programIndex] = pts[programIndex].y[2];
--- a/tests/test-134.ispc
+++ b/tests/test-134.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
+    RET[i+0] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    RET[i+1] = 3;
    RET[i+2] = 3;
    RET[i+3] = 29;
  }
 }
--- a/tests/test-135.ispc
+++ b/tests/test-135.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
+    RET[i+0] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    RET[i+1] = 3;
    RET[i+2] = 3;
    RET[i+3] = 29;
  }
 }
--- a/tests/test-136.ispc
+++ b/tests/test-136.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
+    RET[i+0] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    RET[i+1] = 3;
    RET[i+2] = 3;
    RET[i+3] = 29;
  }
 }
--- a/tests/test-140.ispc
+++ b/tests/test-140.ispc
@@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0;
+  for (int i = 0; i < programCount; i += 4)
-    RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1;
+  {
-    RET[2] = RET[6] = RET[10] = RET[14] =  0x1.193ea8p+0;
+    RET[i+0] = 0x0.0p+0;
-    RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0;
+    RET[i+1] = 0x1.62e43p-1;
    RET[i+2] = 0x1.193ea8p+0;
    RET[i+3] = 0x1.62e43p+0;
  }
 }
--- a/tests/test-141.ispc
+++ b/tests/test-141.ispc
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    // calculation error 1e-6 is the same as in icc
-    RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0;
+    RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 1 : 0;
 }
 export void result(uniform float RET[4]) {
--- a/tests/test-142.ispc
+++ b/tests/test-142.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    RET[programIndex] = round(a+.499999); 
+    RET[programIndex] = round(a+.49999); 
 }
 export void result(uniform float RET[]) {
--- a/tests/test-144.ispc
+++ b/tests/test-144.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    RET[programIndex] = floor(a+.999999); 
+    RET[programIndex] = floor(a+.99999); 
 }
 export void result(uniform float RET[]) {
--- a/tests/uniform-1.ispc
+++ b/tests/uniform-1.ispc
@@ -0,0 +1,34 @@
 export uniform int width() { return programCount; }
 task void f_f_task(uniform float RET[], uniform float aFOO[]) {
    uniform float val[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        val[i] = 0;
    foreach (i = 0 ... programCount)
        val[i] += aFOO[programCount*taskIndex + i] - 1;
    uniform float sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        sum += val[i];
    if (programIndex < 32/4)
      RET[programCount/4*taskIndex + programIndex] = sum;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) 
 {
  launch[4] f_f_task(RET,  aFOO);
 }
 task void result_task(uniform float RET[])
 {
  const uniform float ret = reduce_add(programIndex + programCount*taskIndex);
  if (programIndex < 32/4)
    RET[programCount/4*taskIndex + programIndex] = ret;
 }
 export void result(uniform float RET[]) {
  launch[4] result_task(RET);
 }
--- a/type.cpp
+++ b/type.cpp
@@ -749,7 +749,7 @@ EnumType::Mangle() const {
    std::string ret;
    if (isConst) ret += "C";
    ret += variability.MangleString();
-    ret += std::string("enum[") + name + std::string("]");
+    ret += std::string("enum_5B_") + name + std::string("_5C_");
    return ret;
 }
@@ -1420,7 +1420,7 @@ ArrayType::Mangle() const {
        sprintf(buf, "%d", numElements);
    else
        buf[0] = '\0';
-    return s + "[" + buf + "]";
+    return s + "_5B_" + buf + "_5C_";
 }
@@ -2058,12 +2058,12 @@ lMangleStruct(Variability variability, bool isConst, const std::string &name) {
    Assert(variability != Variability::Unbound);
    std::string ret;
-    ret += "s[";
+    ret += "s_5B_";
    if (isConst)
        ret += "_c_";
    ret += variability.MangleString();
-    ret += name + std::string("]");
+    ret += name + std::string("_5C_");
    return ret;
 }
@@ -3009,7 +3009,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
        llvmArgTypes.push_back(LLVMTypes::MaskType);
    std::vector<llvm::Type *> callTypes;
-    if (isTask) {
+    if (isTask && g->target->getISA() != Target::NVPTX) {
        // Tasks take three arguments: a pointer to a struct that holds the
        // actual task arguments, the thread index, and the total number of
        // threads the tasks system has running.  (Task arguments are
`@@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {`

	`define_avgs()`	`define_avgs()`

		`declare_nvptx()`