diff --git a/builtins.cpp b/builtins.cpp index fee322e7..fbc0d5a0 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -338,11 +338,13 @@ lSetInternalFunctions(llvm::Module *module) { "__all", "__any", "__aos_to_soa3_float", + "__aos_to_soa3_float1", "__aos_to_soa3_float16", "__aos_to_soa3_float4", "__aos_to_soa3_float8", "__aos_to_soa3_int32", "__aos_to_soa4_float", + "__aos_to_soa4_float1", "__aos_to_soa4_float16", "__aos_to_soa4_float4", "__aos_to_soa4_float8", @@ -351,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_add_int64_global", "__atomic_add_uniform_int32_global", "__atomic_add_uniform_int64_global", + "__atomic_add_varying_int32_global", + "__atomic_add_varying_int64_global", "__atomic_and_int32_global", "__atomic_and_int64_global", "__atomic_and_uniform_int32_global", "__atomic_and_uniform_int64_global", + "__atomic_and_varying_int32_global", + "__atomic_and_varying_int64_global", "__atomic_compare_exchange_double_global", "__atomic_compare_exchange_float_global", "__atomic_compare_exchange_int32_global", @@ -363,18 +369,30 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_compare_exchange_uniform_float_global", "__atomic_compare_exchange_uniform_int32_global", "__atomic_compare_exchange_uniform_int64_global", + "__atomic_compare_exchange_varying_double_global", + "__atomic_compare_exchange_varying_float_global", + "__atomic_compare_exchange_varying_int32_global", + "__atomic_compare_exchange_varying_int64_global", "__atomic_max_uniform_int32_global", "__atomic_max_uniform_int64_global", "__atomic_min_uniform_int32_global", "__atomic_min_uniform_int64_global", + "__atomic_max_varying_int32_global", + "__atomic_max_varying_int64_global", + "__atomic_min_varying_int32_global", + "__atomic_min_varying_int64_global", "__atomic_or_int32_global", "__atomic_or_int64_global", "__atomic_or_uniform_int32_global", "__atomic_or_uniform_int64_global", + "__atomic_or_varying_int32_global", + "__atomic_or_varying_int64_global", "__atomic_sub_int32_global", "__atomic_sub_int64_global", "__atomic_sub_uniform_int32_global", "__atomic_sub_uniform_int64_global", + "__atomic_sub_varying_int32_global", + "__atomic_sub_varying_int64_global", "__atomic_swap_double_global", "__atomic_swap_float_global", "__atomic_swap_int32_global", @@ -383,14 +401,28 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_swap_uniform_float_global", "__atomic_swap_uniform_int32_global", "__atomic_swap_uniform_int64_global", + "__atomic_swap_varying_double_global", + "__atomic_swap_varying_float_global", + "__atomic_swap_varying_int32_global", + "__atomic_swap_varying_int64_global", "__atomic_umax_uniform_uint32_global", "__atomic_umax_uniform_uint64_global", "__atomic_umin_uniform_uint32_global", "__atomic_umin_uniform_uint64_global", + "__atomic_umax_varying_uint32_global", + "__atomic_umax_varying_uint64_global", + "__atomic_umin_varying_uint32_global", + "__atomic_umin_varying_uint64_global", "__atomic_xor_int32_global", "__atomic_xor_int64_global", "__atomic_xor_uniform_int32_global", "__atomic_xor_uniform_int64_global", + "__atomic_xor_uniform_int32_global", + "__atomic_xor_uniform_int64_global", + "__atomic_xor_varying_int32_global", + "__atomic_xor_varying_int64_global", + "__atomic_xor_varying_int32_global", + "__atomic_xor_varying_int64_global", "__broadcast_double", "__broadcast_float", "__broadcast_i16", @@ -413,6 +445,7 @@ lSetInternalFunctions(llvm::Module *module) { "__do_assert_uniform", "__do_assert_varying", "__do_print", + "__do_print_nvptx", "__doublebits_uniform_int64", 
"__doublebits_varying_int64", "__exclusive_scan_add_double", @@ -427,6 +460,8 @@ lSetInternalFunctions(llvm::Module *module) { "__extract_int32", "__extract_int64", "__extract_int8", + "__extract_float", + "__extract_double", "__fastmath", "__float_to_half_uniform", "__float_to_half_varying", @@ -443,6 +478,8 @@ lSetInternalFunctions(llvm::Module *module) { "__insert_int32", "__insert_int64", "__insert_int8", + "__insert_float", + "__insert_double", "__intbits_uniform_double", "__intbits_uniform_float", "__intbits_varying_double", @@ -479,6 +516,7 @@ lSetInternalFunctions(llvm::Module *module) { "__min_varying_uint32", "__min_varying_uint64", "__movmsk", + "__movmsk_ptx", "__new_uniform_32rt", "__new_uniform_64rt", "__new_varying32_32rt", @@ -560,11 +598,13 @@ lSetInternalFunctions(llvm::Module *module) { "__shuffle_i64", "__shuffle_i8", "__soa_to_aos3_float", + "__soa_to_aos3_float1", "__soa_to_aos3_float16", "__soa_to_aos3_float4", "__soa_to_aos3_float8", "__soa_to_aos3_int32", "__soa_to_aos4_float", + "__soa_to_aos4_float1", "__soa_to_aos4_float16", "__soa_to_aos4_float4", "__soa_to_aos4_float8", @@ -622,6 +662,24 @@ lSetInternalFunctions(llvm::Module *module) { "__vec4_add_int32", "__vselect_float", "__vselect_i32", + "__program_index", + "__program_count", + "__warp_index", + "__task_index0", + "__task_index1", + "__task_index2", + "__task_index", + "__task_count0", + "__task_count1", + "__task_count2", + "__task_count", + "__cvt_loc2gen", + "__cvt_loc2gen_var", + "__cvt_const2gen", + "__puts_nvptx", + "ISPCAlloc", + "ISPCLaunch", + "ISPCSync", }; int count = sizeof(names) / sizeof(names[0]); @@ -694,6 +752,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, g->target->getISA() != Target::NEON16 && g->target->getISA() != Target::NEON8) #endif // !__arm__ + if (g->target->getISA() != Target::NVPTX) { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || mTriple.getArch() == bcTriple.getArch()); @@ -855,7 +914,17 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { - + case Target::NVPTX: + { + if (runtime32) { + fprintf(stderr, "Unforetunatly 32bit targets are supported at the moment .. 
\n"); + assert(0); + } + else { + EXPORT_MODULE(builtins_bitcode_nvptx_64bit); + } + break; + }; #ifdef ISPC_ARM_ENABLED case Target::NEON8: { if (runtime32) { @@ -1125,7 +1194,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } // define the 'programCount' builtin variable - lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); + if (g->target->getISA() != Target::NVPTX) + { + lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); + } + else + { + lDefineConstantInt("programCount", 32, module, symbolTable); + } // define the 'programIndex' builtin lDefineProgramIndex(module, symbolTable); @@ -1155,6 +1231,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(), module, symbolTable); + lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX), + module, symbolTable); + if (g->forceAlignment != -1) { llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true); alignment->setInitializer(LLVMInt32(g->forceAlignment)); diff --git a/builtins/__do_print_nvptx.cu b/builtins/__do_print_nvptx.cu new file mode 100644 index 00000000..dc1bbcce --- /dev/null +++ b/builtins/__do_print_nvptx.cu @@ -0,0 +1,130 @@ +#include + +#define PRINT_BUF_SIZE 4096 +#define uint64_t unsigned long long + +static __device__ size_t d_strlen(const char *str) +{ + const char *s; + + for (s = str; *s; ++s) + ; + return (s - str); +} + +static __device__ char* d_strncat(char *dest, const char *src, size_t n) +{ + size_t dest_len = d_strlen(dest); + size_t i; + + for (i = 0 ; i < n && src[i] != '\0' ; i++) + dest[dest_len + i] = src[i]; + dest[dest_len + i] = '\0'; + + return dest; +} + +#define APPEND(str) \ + do { \ + int offset = bufp - &printString[0]; \ + *bufp = '\0'; \ + d_strncat(bufp, str, PRINT_BUF_SIZE-offset); \ + bufp += d_strlen(str); \ + if (bufp >= &printString[PRINT_BUF_SIZE]) \ + goto done; \ + } while (0) /* eat semicolon */ + + +#define PRINT_SCALAR(fmt, type) \ + sprintf(tmpBuf, fmt, *((type *)ptr)); \ + APPEND(tmpBuf); \ + break + +#define PRINT_VECTOR(fmt, type) \ + *bufp++ = '['; \ + if (bufp == &printString[PRINT_BUF_SIZE]) break; \ + for (int i = 0; i < width; ++i) { \ + /* only print the value if the current lane is executing */ \ + type val0 = *((type*)ptr); \ + type val = val0; \ + if (mask & (1ull< + %in0 = extractelement <2 x i32> %in, i32 0 + %in1 = extractelement <2 x i32> %in, i32 1 + %out0 = tail call i32 @$1_i32_nvptx(i32 %in0, i32 %1) + %out1 = tail call i32 @$1_i32_nvptx(i32 %in1, i32 %1) + %out2 = insertelement <2 x i32> undef, i32 %out0, i32 0 + %out = insertelement <2 x i32> %out2, i32 %out1, i32 1 + %ret = bitcast <2 x i32> %out to $2 + ret $2 %ret +} +') +shfl64(__shfl, i64) +shfl64(__shfl_xor, i64) +shfl64(__shfl, double) +shfl64(__shfl_xor, double) + +;;;;;;;;;;;;; +define internal i32 @__ballot_nvptx(i1) nounwind readnone alwaysinline +{ + %conv = zext i1 %0 to i32 + %res = tail call i32 asm sideeffect + "{ .reg .pred %p1; + setp.ne.u32 %p1, $1, 0; + vote.ballot.b32 $0, %p1; + }", "=r,r"(i32 %conv) nounwind readnone alwaysinline + ret i32 %res +} +define internal i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline +{ + %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() nounwind readnone alwaysinline + ret i32 %mask +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; tasking + +;; this call allocate parameter buffer for 
kernel launch +declare i64 @cudaGetParameterBuffer(i64, i64) nounwind +define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline +{ +entry: + %and = call i32 @__program_index() + %cmp = icmp eq i32 %and, 0 + %align = zext i32 %align32 to i64 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %ptri64tmp = call i64 @cudaGetParameterBuffer(i64 %align, i64 %size); + br label %if.end + +if.end: + %ptri64 = phi i64 [ %ptri64tmp, %if.then ], [ 0, %entry ] + %ptr = inttoptr i64 %ptri64 to i8* + ret i8* %ptr +} + +;; this actually launches kernel a kernel +module asm " +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); +" +define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline +{ +entry: +;; only 1 lane must launch the kernel !!! + %func_i64 = ptrtoint i8* %func_ptr to i64 + %args_i64 = ptrtoint i8* %func_args to i64 + +;; nbx = (%ntx-1)/(blocksize/warpsize) + 1 for blocksize=128 & warpsize=32 + %ntxm1 = add nsw i32 %ntx, -1 +;; %ntxm1d4 = sdiv i32 %ntxm1, 4 + %ntxm1d4 = ashr i32 %ntxm1, 2 + %nbx = add nsw i32 %ntxm1d4, 1 + %and = call i32 @__program_index() +;; if (laneIdx == 0) + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + + %res_tmp = call i32 asm sideeffect "{ + .param .b64 param0; + st.param.b64 [param0+0], $1; + .param .b64 param1; + st.param.b64 [param1+0], $2; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], $3; + st.param.b32 [param2+4], $4; + st.param.b32 [param2+8], $5; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], $6; + st.param.b32 [param3+4], $7; + st.param.b32 [param3+8], $8; + .param .b32 param4; + st.param.b32 [param4+0], $9; + .param .b64 param5; + st.param.b64 [param5+0], $10; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 $0, [retval0+0]; + } + ", +"=r, l,l, r,r,r, r,r,r, r,l"( + i64 %func_i64,i64 %args_i64, + i32 %nbx,i32 %nty,i32 %ntz, + i32 128,i32 1,i32 1, i32 0,i64 0); + br label %if.end + +if.end: ; preds = %if.then, %entry +;; %res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ] + + ret void +} + +;; this synchronizes kernel +declare i32 @cudaDeviceSynchronize() nounwind +define void @ISPCSync(i8*) nounwind alwaysinline +{ + call i32 @cudaDeviceSynchronize() + ret void; +} + + +;;;;;;;;;;;;;; + + + +include(`util-nvptx.m4') + +stdlib_core() +packed_load_and_store() +int64minmax() +rdrand_decls() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; broadcast/rotate/shuffle + +define_shuffles() + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; aos/soa + +aossoa() + +;; dummy 1 wide vector ops +declare void +@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, + <1 x float> %v3, <1 x float> * noalias %out0, + <1 x float> * noalias %out1, <1 x float> * noalias %out2, + <1 x float> * noalias %out3) nounwind alwaysinline ; + +declare void +@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, + <1 x float> %v3, <1 x float> * noalias %out0, + <1 x float> * noalias %out1, <1 x float> * noalias %out2, + <1 x float> * noalias %out3) nounwind alwaysinline 
; + +declare void +@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1, + <1 x float> * %out2); + +declare void +@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1, + <1 x float> * %out2); + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @llvm.convert.from.fp16(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16(float) nounwind readnone +define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline +{ + ;; %res = call float @llvm.convert.from.fp16(i16 %v) + %res = tail call float asm sideeffect + "{ .reg .f16 tmp; + mov.b16 tmp, $1; + cvt.f32.f16 $0, tmp; + }", "=f,h"(i16 %v) nounwind readnone alwaysinline + ret float %res +} +define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline +{ + ;; this will break the compiler, use inline asm similarly to above case + ;; %half = call i16 @llvm.convert.to.fp16(float %v) + %half = tail call i16 asm sideeffect + "{ .reg .f16 tmp; + cvt.rn.f16.f32 tmp, $1; + mov.b16 $0, tmp; + }", "=h,f"(float %v) nounwind readnone alwaysinline + ret i16 %half +} +define @__half_to_float_varying( %v) nounwind readnone alwaysinline +{ + %el = extractelement <1 x i16> %v, i32 0 + %sf = call float @__half_to_float_uniform(i16 %el) + %vf = insertelement <1 x float> undef, float %sf, i32 0 + ret <1 x float> %vf; +} +define @__float_to_half_varying( %v) nounwind readnone alwaysinline +{ + %el = extractelement <1 x float> %v, i32 0 + %sh = call i16 @__float_to_half_uniform(float %el) + %vh = insertelement <1 x i16> undef, i16 %sh, i32 0 + ret <1 x i16> %vh; +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +declare void @__fastmath() nounwind + +;; round/floor/ceil + +define internal float @__round_uniform_float_ptx(float) nounwind readnone alwaysinline +{ + %2 = tail call float asm sideeffect + "{ .reg .pred p<3>; .reg .s32 r<4>; .reg .f32 f<10>; + mov.f32 f4, $1; + abs.f32 f5, f4; + mov.b32 r1, f4; + and.b32 r2, r1, -2147483648; + or.b32 r3, r2, 1056964608; + mov.b32 f6, r3; + add.f32 f7, f6, f4; + cvt.rzi.f32.f32 f8, f7; + setp.gt.f32 p1, f5, 0f4B000000; + selp.f32 f9, f4, f8, p1; + setp.geu.f32 p2, f5, 0f3F000000; + @p2 bra BB2_2; + cvt.rzi.f32.f32 f9, f4; +BB2_2: + mov.f32 $0, f9; + }", "=f,f"(float %0) nounwind readnone alwaysinline + ret float %2 +} +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to <1 x i32> + %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, + %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float> + %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, + %binop21.i = fadd <1 x float> %binop.i, + %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32> + %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} +define float @__floor_uniform_float(float) nounwind readnone alwaysinline +{ + %2 = tail call float asm sideeffect "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone + ret float %2 +} +define float @__ceil_uniform_float(float) nounwind readnone alwaysinline +{ + %2 = tail call float asm sideeffect "cvt.rpi.f32.f32 $0, 
$1;", "=f,f"(float %0) nounwind alwaysinline readnone + ret float %2 +} + +define double @__round_uniform_double(double) nounwind readnone alwaysinline +{ + %2 = tail call double asm sideeffect + "{ + .reg .pred p<3>; + .reg .s32 r<6>; + .reg .f64 fd<9>; + + mov.f64 fd8, $1 + abs.f64 fd1, fd8; + setp.ge.f64 p1, fd1, 0d4330000000000000; + @p1 bra BB5_2; + + add.f64 fd5, fd1, 0d3FE0000000000000; + cvt.rzi.f64.f64 fd6, fd5; + setp.lt.f64 p2, fd1, 0d3FE0000000000000; + selp.f64 fd7, 0d0000000000000000, fd6, p2; + { + .reg .b32 temp; + mov.b64 {r1, temp}, fd7; + } + { + .reg .b32 temp; + mov.b64 {temp, r2}, fd7; + } + { + .reg .b32 temp; + mov.b64 {temp, r3}, fd8; + } + and.b32 r4, r3, -2147483648; + or.b32 r5, r2, r4; + mov.b64 fd8, {r1, r5}; + +BB5_2: + mov.f64 $0, fd8; + }", "=d,d"(double %0) nounwind readnone alwaysinline + ret double %2 +} +define double @__floor_uniform_double(double) nounwind readnone alwaysinline +{ + %2 = tail call double asm sideeffect "cvt.rmi.f64.f64 $0, $1;", "=f,f"(double %0) nounwind alwaysinline readnone + ret double %2 +} +define double @__ceil_uniform_double(double) nounwind readnone alwaysinline +{ + %2 = tail call double asm sideeffect "cvt.rpi.f64.f64 $0, $1;", "=f,f"(double %0) nounwind alwaysinline readnone + ret double %2 +} + +define internal <1 x float> @__floor_varying_floatX(<1 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind + %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32> + %bitop.i = and <1 x i32> %val_to_boolvec32.i, + %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float> + %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <1 x float> %binop.i +} + +define(`rfc_varying',` +define <1 x $2> @__$1_varying_$2(<1 x $2>) nounwind readonly alwaysinline +{ + %val = extractelement <1 x $2> %0, i32 0 + %res = call $2 @__$1_uniform_$2($2 %val) + %ret = insertelement <1 x $2> undef, $2 %res, i32 0 + ret <1 x $2> %ret +} +') +rfc_varying(round, float) +rfc_varying(floor, float) +rfc_varying(ceil, float) +rfc_varying(round, double) +rfc_varying(floor, double) +rfc_varying(ceil, double) + +;; min/max uniform + +;; declare float @__max_uniform_float(float, float) nounwind readnone +;; declare float @__min_uniform_float(float, float) nounwind readnone +define float @__max_uniform_float(float, float) nounwind readonly alwaysinline { + %d = fcmp ogt float %0, %1 + %r = select i1 %d, float %0, float %1 + ret float %r + +} +define float @__min_uniform_float(float, float) nounwind readonly alwaysinline { + %d = fcmp olt float %0, %1 + %r = select i1 %d, float %0, float %1 + ret float %r + +} + +;; declare i32 @__min_uniform_int32(i32, i32) nounwind readnone +;; declare i32 @__max_uniform_int32(i32, i32) nounwind readnone +define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp slt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} +define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp sgt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +;; declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone +;; declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone +define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + %c = icmp ult i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} +define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + 
%c = icmp ugt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +;; declare i64 @__min_uniform_int64(i64, i64) nounwind readnone +;; declare i64 @__max_uniform_int64(i64, i64) nounwind readnone +define internal i64 @__min_uniform_int64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp slt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} +define internal i64 @__max_uniform_int64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp sgt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +;; declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone +;; declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone +define internal i64 @__min_uniform_uint64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp ult i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} +define internal i64 @__max_uniform_uint64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp ugt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +define double @__max_uniform_double(double, double) nounwind readonly alwaysinline { + %d = fcmp ogt double %0, %1 + %r = select i1 %d, double %0, double %1 + ret double %r +} +define double @__min_uniform_double(double, double) nounwind readonly alwaysinline { + %d = fcmp olt double %0, %1 + %r = select i1 %d, double %0, double %1 + ret double %r +} + +;; min/max uniform + + +define(`minmax_vy',` +define <1 x $2> @__$1_varying_$3(<1 x $2>, <1 x $2>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x $2> %0, i32 0 + %v1 = extractelement <1 x $2> %1, i32 0 + %r = call $2 @__$1_uniform_$3($2 %v0, $2 %v1) + %ret = insertelement <1 x $2> undef, $2 %r, i32 0 + ret <1 x $2> %ret; +} +') +minmax_vy(min, i32, int32) +minmax_vy(max, i32, int32) +minmax_vy(min, i32, uint32) +minmax_vy(max, i32, uint32) +minmax_vy(min, float, float) +minmax_vy(max, float, float) +minmax_vy(min, double, double) +minmax_vy(max, double, double) + +;; sqrt/rsqrt/rcp + +declare float @llvm.nvvm.rsqrt.approx.f(float %f) nounwind readonly alwaysinline +declare float @llvm.nvvm.sqrt.f(float %f) nounwind readonly alwaysinline +declare double @llvm.nvvm.rsqrt.approx.d(double %f) nounwind readonly alwaysinline +declare double @llvm.sqrt.f64(double %f) nounwind readonly alwaysinline + +;; declare float @__rcp_uniform_float(float) nounwind readnone +define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { +; uniform float iv = extract(__rcp_u(v), 0); +; return iv * (2. 
- v * iv); + %ret = fdiv float 1.,%0 +; %ret = tail call float asm sideeffect "rcp.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline + ret float %ret +} +;; declare float @__sqrt_uniform_float(float) nounwind readnone +define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { + %ret = call float @llvm.nvvm.sqrt.f(float %0) +; %ret = tail call float asm sideeffect "sqrt.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline + ret float %ret +} +;; declare float @__rsqrt_uniform_float(float) nounwind readnone +define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline +{ + %ret = call float @llvm.nvvm.rsqrt.approx.f(float %0) +; %ret = tail call float asm sideeffect "rsqrt.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline + ret float %ret +} + +define @__rcp_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__rcp_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} +define @__rsqrt_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__rsqrt_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} +define @__sqrt_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__sqrt_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} + +;; declare double @__sqrt_uniform_double(double) nounwind readnone +define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline { + %ret = call double @llvm.sqrt.f64(double %0) + ret double %ret +} +define @__sqrt_varying_double() nounwind readnone alwaysinline +{ + %v = extractelement <1 x double> %0, i32 0 + %r = call double @__sqrt_uniform_double(double %v) + %rv = insertelement <1 x double> undef, double %r, i32 0 + ret %rv +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; population count + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { + %call = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %call +;; %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0) nounwind readnone alwaysinline + ;; ret i32 %res +} + +declare i64 @llvm.ctpop.i64(i64) nounwind readnone +define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { + %call = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; binary prefix sum + +define internal i64 @__warpBinExclusiveScan(i1 %p) nounwind readonly alwaysinline +{ +entry: + %call = call i32 @__ballot_nvptx(i1 zeroext %p) + %call1 = call i32 @__popcnt_int32(i32 %call) + %call2 = call i32 @__lanemask_lt_nvptx() + %and = and i32 %call2, %call + %call3 = call i32 @__popcnt_int32(i32 %and) + %retval.sroa.1.4.insert.ext.i = zext i32 %call3 to i64 + %retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %call1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i + ret i64 %retval.sroa.0.0.insert.insert.i +} + +ctlztz() + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... 
+ +;; svml is not support in PTX, will generate linking error + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %v64 = zext i1 %v to i64 + ret i64 %v64 +} +define i64 @__movmsk_ptx(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %v0 = call i32 @__ballot_nvptx(i1 %v) + %v64 = zext i32 %v0 to i64 + ret i64 %v64 +} + +define i1 @__any(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %res = call i32 @__ballot_nvptx(i1 %v) + %cmp = icmp ne i32 %res, 0 + ret i1 %cmp +} + +define i1 @__all(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %res0 = call i32 @__ballot_nvptx(i1 %v) + %cmp = icmp eq i32 %res0, -1 + ret i1 %cmp +} + +define i1 @__none(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %res = call i32 @__ballot_nvptx(i1 %v) + %cmp = icmp eq i32 %res, 0 + ret i1 %cmp +} + +;;;;;;;;; reductions i8 +define i16 @__reduce_add_int8(<1 x i8> %v) nounwind readnone alwaysinline { + %value8 = extractelement <1 x i8> %v, i32 0 + %value = zext i8 %value8 to i16 + %call = tail call i16 @__shfl_xor_i16_nvptx(i16 %value, i32 16) + %call1 = add i16 %call, %value + %call.1 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1, i32 8) + %call1.1 = add i16 %call1, %call.1 + %call.2 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.1, i32 4) + %call1.2 = add i16 %call1.1, %call.2 + %call.3 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.2, i32 2) + %call1.3 = add i16 %call1.2, %call.3 + %call.4 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.3, i32 1) + %call1.4 = add i16 %call1.3, %call.4 + ret i16 %call1.4 +} +;;;;;;;;; reductions i16 +define i32 @__reduce_add_int16(<1 x i16> %v) nounwind readnone alwaysinline { + %value16 = extractelement <1 x i16> %v, i32 0 + %value = zext i16 %value16 to i32 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = add i32 %call, %value + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = add i32 %call1, %call.1 + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = add i32 %call1.1, %call.2 + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = add i32 %call1.2, %call.3 + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = add i32 %call1.3, %call.4 + ret i32 %call1.4 +} + +;;;;;;;;; reductions float +define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { + %value = extractelement <1 x float> %v, i32 0 + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = fadd float %call, %value + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = fadd float %call1, %call.1 + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = fadd float %call1.1, %call.2 + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = fadd float %call1.2, %call.3 + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = fadd float %call1.3, %call.4 + ret float %call1.4 +} +define float @__reduce_min_float(<1 x float>) nounwind readnone alwaysinline { + %value = extractelement <1 x float> %0, i32 0 + %call = tail call float 
@__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = tail call float @__fminf_nvptx(float %value, float %call) + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = tail call float @__fminf_nvptx(float %call1, float %call.1) + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = tail call float @__fminf_nvptx(float %call1.1, float %call.2) + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = tail call float @__fminf_nvptx(float %call1.2, float %call.3) + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = tail call float @__fminf_nvptx(float %call1.3, float %call.4) + ret float %call1.4 +} +define float @__reduce_max_float(<1 x float>) nounwind readnone alwaysinline { + %value = extractelement <1 x float> %0, i32 0 + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = tail call float @__fmaxf_nvptx(float %value, float %call) + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = tail call float @__fmaxf_nvptx(float %call1, float %call.1) + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = tail call float @__fmaxf_nvptx(float %call1.1, float %call.2) + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = tail call float @__fmaxf_nvptx(float %call1.2, float %call.3) + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = tail call float @__fmaxf_nvptx(float %call1.3, float %call.4) + ret float %call1.4 +} + +;;;;;;;;; reductions int32 +define i32 @__reduce_add_int32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = add i32 %call, %value + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 =add i32 %call1, %call.1 + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = add i32 %call1.1, %call.2 + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = add i32 %call1.2, %call.3 + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = add i32 %call1.3, %call.4 + ret i32 %call1.4 +} +define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__min_i32_signed(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__min_i32_signed(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__min_i32_signed(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__min_i32_signed(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__min_i32_signed(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 +} +define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__max_i32_signed(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__max_i32_signed(i32 %call1, i32 %call.1) + 
%call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__max_i32_signed(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__max_i32_signed(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__max_i32_signed(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 +} + +;;;;;;;;; reductions uint32 +define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__min_i32_unsigned(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__min_i32_unsigned(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__min_i32_unsigned(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__min_i32_unsigned(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__min_i32_unsigned(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 +} +define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__max_i32_unsigned(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__max_i32_unsigned(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__max_i32_unsigned(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__max_i32_unsigned(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__max_i32_unsigned(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 + } + +;;;;;;;;; reductions double +define double @__reduce_add_double(<1 x double>) nounwind readnone alwaysinline { + %value = extractelement <1 x double> %0, i32 0 + %call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16) + %call1 = fadd double %call, %value + %call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8) + %call1.1 = fadd double %call1, %call.1 + %call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4) + %call1.2 = fadd double %call1.1, %call.2 + %call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2) + %call1.3 = fadd double %call1.2, %call.3 + %call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1) + %call1.4 = fadd double %call1.3, %call.4 + ret double %call1.4 +} +define double @__reduce_min_double(<1 x double>) nounwind readnone alwaysinline { + %value = extractelement <1 x double> %0, i32 0 + %call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16) + %call1 = tail call double @__min_double(double %value, double %call) + %call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8) + %call1.1 = tail call double @__min_double(double %call1, double %call.1) + %call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4) + %call1.2 = tail call double @__min_double(double %call1.1, double %call.2) + %call.3 = tail call 
double @__shfl_xor_double_nvptx(double %call1.2, i32 2) + %call1.3 = tail call double @__min_double(double %call1.2, double %call.3) + %call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1) + %call1.4 = tail call double @__min_double(double %call1.3, double %call.4) + ret double %call1.4 +} +define double @__reduce_max_double(<1 x double>) nounwind readnone alwaysinline { + %value = extractelement <1 x double> %0, i32 0 + %call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16) + %call1 = tail call double @__max_double(double %value, double %call) + %call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8) + %call1.1 = tail call double @__max_double(double %call1, double %call.1) + %call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4) + %call1.2 = tail call double @__max_double(double %call1.1, double %call.2) + %call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2) + %call1.3 = tail call double @__max_double(double %call1.2, double %call.3) + %call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1) + %call1.4 = tail call double @__max_double(double %call1.3, double %call.4) + ret double %call1.4 +} + + +;;;;;;;;; reductions int64 +define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = add i64 %call, %value + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 =add i64 %call1, %call.1 + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = add i64 %call1.1, %call.2 + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = add i64 %call1.2, %call.3 + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = add i64 %call1.3, %call.4 + ret i64 %call1.4 +} +define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__min_i64_signed(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__min_i64_signed(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__min_i64_signed(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__min_i64_signed(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__min_i64_signed(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} +define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__max_i64_signed(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__max_i64_signed(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__max_i64_signed(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__max_i64_signed(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 
@__max_i64_signed(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} +define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__min_i64_unsigned(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__min_i64_unsigned(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__min_i64_unsigned(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__min_i64_unsigned(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__min_i64_unsigned(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} +define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__max_i64_unsigned(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__max_i64_unsigned(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__max_i64_unsigned(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__max_i64_unsigned(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__max_i64_unsigned(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} + +;;;; reduce equal, must be tested and may fail if data has -1 +define internal i32 @__shfl_reduce_and_step_i32_nvptx(i32, i32) nounwind readnone alwaysinline +{ + %shfl = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.bfly.b32 r0|p, $1, $2, 0; + @p and.b32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %0, i32 %1, i32 %0) nounwind readnone alwaysinline + ret i32 %shfl +} +shfl64(__shfl_reduce_and_step, i64) + +define internal i32 @__reduce_and_i32(i32 %v0, i1 %mask) nounwind readnone alwaysinline +{ + %v = select i1 %mask, i32 %v0, i32 -1 + %s1 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %v, i32 16); + %s2 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s1, i32 8); + %s3 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s3, i32 2); + %s5 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s4, i32 1); + ret i32 %s5 +} +define internal i64 @__reduce_and_i64(i64, i1) nounwind readnone alwaysinline +{ + %v = bitcast i64 %0 to <2 x i32> + %v0 = extractelement <2 x i32> %v, i32 0 + %v1 = extractelement <2 x i32> %v, i32 1 + %s0 = call i32 @__reduce_and_i32(i32 %v0, i1 %1) + %s1 = call i32 @__reduce_and_i32(i32 %v1, i1 %1) + %tmp = insertelement <2 x i32> undef, i32 %s0, i32 0 + %res = insertelement <2 x i32> %tmp, i32 %s1, i32 1 + %ret = bitcast <2 x i32> %res to i64 + ret i64 %ret; +} + +define(`reduce_equal',` +define i1 @__reduce_equal_$2(<1 x $1> %v0, $1 * %samevalue, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %vv = bitcast <1 x $1> %v0 to <1 x $3> + %sv = extractelement <1 x $3> %vv, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + + %s = call $3 @__reduce_and_$3($3 %sv, i1 %mask); + + ;; find last active lane + %nact = 
call i32 @__ballot_nvptx(i1 %mask) + %lane1 = call i32 @__count_leading_zeros_i32(i32 %nact) + %lane = sub i32 31, %lane1 + + ;; broadcast result from this lane + %r = tail call $3 @__shfl_$3_nvptx($3 %s, i32 %lane) + + ;; compare result to the original value + %c0 = icmp eq $3 %r, %sv + %c1 = and i1 %c0, %mask + %neq = call i32 @__ballot_nvptx(i1 %c1) + %cmp = icmp eq i32 %neq, %nact + + br i1 %cmp, label %all_equal, label %all_not_equal + +all_equal: + %vstore = bitcast $3 %r to $1 + store $1 %vstore, $1* %samevalue; + ret i1 true + +all_not_equal: + ret i1 false + +} +') +reduce_equal(i32, int32, i32); +reduce_equal(i64, int64, i64); +reduce_equal(float, float, i32); +reduce_equal(double, double, i64); + +;;;;;;;;;;; shuffle +define(`shuffle1', ` +define <1 x $1> @__shuffle_$1(<1 x $1>, <1 x i32>) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %lane = extractelement <1 x i32> %1, i32 0 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shuffle1(i8) +shuffle1(i16) +shuffle1(i32) +shuffle1(i64) +shuffle1(float) +shuffle1(double) + +define(`shuffle2',` +define <1 x $1> @__shuffle2_$1(<1 x $1>, <1 x $1>, <1 x i32>) nounwind readnone alwaysinline +{ + %val1 = extractelement <1 x $1> %0, i32 0 + %val2 = extractelement <1 x $1> %1, i32 0 + + ;; fetch both values + %lane = extractelement <1 x i32> %2, i32 0 + %lane_mask = and i32 %lane, 31 + %ret1 = tail call $1 @__shfl_$1_nvptx($1 %val1, i32 %lane_mask); + %ret2 = tail call $1 @__shfl_$1_nvptx($1 %val2, i32 %lane_mask); + + ;; select the correct one + %c = icmp slt i32 %lane, 32 + %rets = select i1 %c, $1 %ret1, $1 %ret2 + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shuffle2(i8) +shuffle2(i16) +shuffle2(i32) +shuffle2(i64) +shuffle2(float) +shuffle2(double) + +define(`shift',` +define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %lane = call i32 @__program_index() + %src = add i32 %lane, %1 + %ret = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %src) + %c1 = icmp sge i32 %src, 0 + %c2 = icmp slt i32 %src, 32 + %c = and i1 %c1, %c2 + %rets = select i1 %c, $1 %ret, $1 zeroinitializer + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shift(i8) +shift(i16) +shift(i32) +shift(i64) +shift(float) +shift(double) + +define(`rotate', ` +define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %tid = call i32 @__program_index() + %src = add i32 %tid, %1 + %lane = and i32 %src, 31 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +rotate(i8) +rotate(i16) +rotate(i32) +rotate(i64) +rotate(float) +rotate(double) + +define(`broadcast', ` +define <1 x $1> @__broadcast_$1(<1 x $1>, i32) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %1) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +broadcast(i8) +broadcast(i16) +broadcast(i32) +broadcast(i64) +broadcast(float) +broadcast(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefix sum stuff + +define internal i32 @__shfl_scan_add_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i32 asm 
sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, $2, 0; + @p add.u32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline + ret i32 %result; +} +define <1 x i32> @__exclusive_scan_add_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i32> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, i32 %v0, i32 0 + + %s1 = tail call i32 @__shfl_scan_add_step_i32(i32 %v, i32 1); + %s2 = tail call i32 @__shfl_scan_add_step_i32(i32 %s1, i32 2); + %s3 = tail call i32 @__shfl_scan_add_step_i32(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_scan_add_step_i32(i32 %s3, i32 8); + %s5 = tail call i32 @__shfl_scan_add_step_i32(i32 %s4, i32 16); + %rets = sub i32 %s5, %v + %retv = insertelement <1 x i32> undef, i32 %rets, i32 0 + ret <1 x i32> %retv +} +;; +define internal i32 @__shfl_scan_or_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, $2, 0; + @p or.b32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline + ret i32 %result; +} +define <1 x i32> @__exclusive_scan_or_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i32> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v1 = select i1 %mask, i32 %v0, i32 0 + + ;; shfl-up by one for exclusive scan + %v = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, 1, 0; + @!p mov.u32 r0, 0; + mov.u32 $0, r0; + }","=r,r"(i32 %v1); + + %s1 = tail call i32 @__shfl_scan_or_step_i32(i32 %v, i32 1); + %s2 = tail call i32 @__shfl_scan_or_step_i32(i32 %s1, i32 2); + %s3 = tail call i32 @__shfl_scan_or_step_i32(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_scan_or_step_i32(i32 %s3, i32 8); + %s5 = tail call i32 @__shfl_scan_or_step_i32(i32 %s4, i32 16); + %retv = insertelement <1 x i32> undef, i32 %s5, i32 0 + ret <1 x i32> %retv +} +;; +define internal i32 @__shfl_scan_and_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, $2, 0; + @p and.b32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline + ret i32 %result; +} +define <1 x i32> @__exclusive_scan_and_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i32> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v1 = select i1 %mask, i32 %v0, i32 -1 + + ;; shfl-up by one for exclusive scan + %v = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, 1, 0; + @!p mov.u32 r0, -1; + mov.u32 $0, r0; + }","=r,r"(i32 %v1); + + %s1 = tail call i32 @__shfl_scan_and_step_i32(i32 %v, i32 1); + %s2 = tail call i32 @__shfl_scan_and_step_i32(i32 %s1, i32 2); + %s3 = tail call i32 @__shfl_scan_and_step_i32(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_scan_and_step_i32(i32 %s3, i32 8); + %s5 = tail call i32 @__shfl_scan_and_step_i32(i32 %s4, i32 16); + %retv = insertelement <1 x i32> undef, i32 %s5, i32 0 + ret <1 x i32> %retv +} + +define internal float @__shfl_scan_add_step_float(float %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call float asm sideeffect + "{.reg .f32 f0; + .reg .pred p; + shfl.up.b32 f0|p, $1, $2, 0; + @p 
add.f32 f0, f0, $3; + mov.f32 $0, f0; + }", "=f,f,r,f"(float %partial, i32 %up_offset, float %partial) nounwind readnone alwaysinline + ret float %result; +} +define <1 x float> @__exclusive_scan_add_float(<1 x float>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x float> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, float %v0, float zeroinitializer + + %s1 = tail call float @__shfl_scan_add_step_float(float %v, i32 1); + %s2 = tail call float @__shfl_scan_add_step_float(float %s1, i32 2); + %s3 = tail call float @__shfl_scan_add_step_float(float %s2, i32 4); + %s4 = tail call float @__shfl_scan_add_step_float(float %s3, i32 8); + %s5 = tail call float @__shfl_scan_add_step_float(float %s4, i32 16); + %rets = fsub float %s5, %v + %retv = insertelement <1 x float> undef, float %rets, i32 0 + ret <1 x float> %retv +} +define internal double @__shfl_scan_add_step_double(double %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call double asm sideeffect + "{.reg .s32 r<10>; + .reg .f64 fd0; + .reg .pred p; + .reg .b32 temp; + mov.b64 {r1,temp}, $1; + mov.b64 {temp,r2}, $1; + shfl.up.b32 r3, r1, $2, 0; + shfl.up.b32 r4|p, r2, $2, 0; + mov.b64 fd0, {r3,r4}; + @p add.f64 fd0, fd0, $3; + mov.f64 $0, fd0; + }", "=d,d,r,d"(double %partial, i32 %up_offset, double %partial) nounwind readnone alwaysinline + ret double %result; +} +define <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x double> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, double %v0, double zeroinitializer + + %s1 = tail call double @__shfl_scan_add_step_double(double %v, i32 1); + %s2 = tail call double @__shfl_scan_add_step_double(double %s1, i32 2); + %s3 = tail call double @__shfl_scan_add_step_double(double %s2, i32 4); + %s4 = tail call double @__shfl_scan_add_step_double(double %s3, i32 8); + %s5 = tail call double @__shfl_scan_add_step_double(double %s4, i32 16); + %rets = fsub double %s5, %v + %retv = bitcast double %rets to <1 x double> + ret <1 x double> %retv +} + +define internal i64 @__shfl_scan_add_step_i64(i64 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i64 asm sideeffect + "{.reg .s32 r<10>; + .reg .s64 rl0; + .reg .pred p; + .reg .b32 temp; + mov.b64 {r1,temp}, $1; + mov.b64 {temp,r2}, $1; + shfl.up.b32 r3, r1, $2, 0; + shfl.up.b32 r4|p, r2, $2, 0; + mov.b64 rl0, {r3,r4}; + @p add.s64 rl0, rl0, $3; + mov.s64 $0, rl0; + }", "=l,l,r,l"(i64 %partial, i32 %up_offset, i64 %partial) nounwind readnone alwaysinline + ret i64 %result; +} +define <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i64> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, i64 %v0, i64 zeroinitializer + + %s1 = tail call i64 @__shfl_scan_add_step_i64(i64 %v, i32 1); + %s2 = tail call i64 @__shfl_scan_add_step_i64(i64 %s1, i32 2); + %s3 = tail call i64 @__shfl_scan_add_step_i64(i64 %s2, i32 4); + %s4 = tail call i64 @__shfl_scan_add_step_i64(i64 %s3, i32 8); + %s5 = tail call i64 @__shfl_scan_add_step_i64(i64 %s4, i32 16); + %rets = sub i64 %s5, %v + %retv = bitcast i64 %rets to <1 x i64> + ret <1 x i64> %retv +} + +define(`exclusive_scan_i64',` +define <1 x i64> @__exclusive_scan_$1_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline +{ + %v = bitcast <1 x i64> %0 to <2 x i32> + %v0 = extractelement <2 x i32> %v, i32 0 + 
%v1 = extractelement <2 x i32> %v, i32 1 + %inp0 = bitcast i32 %v0 to <1 x i32> + %inp1 = bitcast i32 %v1 to <1 x i32> + %res0 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp0, <1 x i1> %1); + %res1 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp1, <1 x i1> %1); + %r0 = bitcast <1 x i32> %res0 to i32 + %r1 = bitcast <1 x i32> %res1 to i32 + %ret0 = insertelement <2 x i32> undef, i32 %r0, i32 0 + %ret1 = insertelement <2 x i32> %ret0, i32 %r1, i32 1 + %ret = bitcast <2 x i32> %ret1 to <1 x i64> + ret <1 x i64> %ret +} +') +exclusive_scan_i64(or) +exclusive_scan_i64(and) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(float) +gen_masked_store(i64) +gen_masked_store(double) + +define void @__masked_store_blend_i8(* nocapture, , + ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_i16(* nocapture, , + ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_i32(* nocapture, , + ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_float(* nocapture, , + ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_i64(* nocapture, + , ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_double(* nocapture, + , ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch +define_prefetches() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector ops + +define(`extract_insert',` +define $1 @__extract_$2(<1 x $1>, i32) nounwind readnone alwaysinline { + %val = extractelement <1 x $1> %0, i32 0 + %extract = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %1) + ret $1 %extract +} + +define <1 x $1> @__insert_$2(<1 x $1>, i32, + $1) nounwind readnone alwaysinline { + %orig = extractelement <1 x $1> %0, i32 0 + %lane = call i32 @__program_index() + %c = icmp eq i32 %lane, %1 + %val = select i1 %c, $1 %2, $1 %orig + %insert = insertelement <1 x $1> %0, $1 %val, i32 0 + ret <1 x $1> %insert +} +') + +extract_insert(i8, int8) +extract_insert(i16, int16) +extract_insert(i32, int32) +extract_insert(i64, int64) +extract_insert(float, float) +extract_insert(double, double) + 
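The __extract_*/__insert_* pair defined just above leans on the gang being a single 32-lane warp: extract is a warp-wide broadcast from the requested lane (via __shfl_*_nvptx), and insert is a per-lane select keyed on __program_index(). For reference, a minimal CUDA C sketch of the same idiom, assuming one gang maps to one full warp; the helper names, the 0xffffffff sync mask, and the threadIdx.x & 31 lane computation are illustrative assumptions, not code from this patch:

// Illustrative sketch only (uses the newer __shfl_sync form of the warp shuffle).
__device__ __forceinline__ float warpExtract(float v, int srcLane) {
    // every lane reads lane srcLane's copy of v -- a warp-wide broadcast,
    // mirroring __extract_* above
    return __shfl_sync(0xffffffffu, v, srcLane);
}

__device__ __forceinline__ float warpInsert(float v, int dstLane, float newVal) {
    // only the destination lane takes the new value; all other lanes keep theirs,
    // mirroring __insert_* above
    int lane = threadIdx.x & 31;   // assumes a 1-D, warp-aligned thread block
    return (lane == dstLane) ? newVal : v;
}

Because each IR vector is 1-wide and the warp supplies the 32 program instances, programCount is pinned to 32 in builtins.cpp above, which is also why the shuffle distances 16, 8, 4, 2, 1 recur throughout the reductions and scans in this file.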
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; assert + +declare void @__assertfail(i64,i64,i32,i64,i64) noreturn; +declare i32 @vprintf(i64,i64) +define i32 @__puts_nvptx(i8*) alwaysinline +{ + %str = ptrtoint i8* %0 to i64 + %parm = or i64 0, 0 + %call = call i32 @vprintf(i64 %str, i64 %parm) +;; %cr = alloca <3 x i8> +;; store <3 x i8> , <3 x i8>* %cr +;; %cr1 = ptrtoint <3 x i8>* %cr to i64 +;; %call1 = call i32 @vprintf(i64 %cr1, i64 %parm) + ret i32 %call; +} +define internal void @__abort_nvptx(i8* %str) noreturn +{ + %tmp1 = alloca <3 x i8> + store <3 x i8> , <3 x i8>* %tmp1 + %tmp2 = alloca <2 x i8> + store <2 x i8> , <2 x i8>* %tmp2 + + %param1 = ptrtoint <2 x i8>* %tmp2 to i64 + %param3 = or i32 0, 0 + %string = ptrtoint i8* %str to i64 + %param4 = ptrtoint <3 x i8>* %tmp1 to i64 + %param5 = or i64 1, 1 + call void @__assertfail(i64 %param1, i64 %string, i32 %param3, i64 %param4, i64 %param5); + ret void +} + +define void @__do_assert_uniform(i8 *%str, i1 %test, %mask) { + br i1 %test, label %ok, label %fail + +fail: + %lane = call i32 @__program_index() + %cmp = icmp eq i32 %lane, 0 + br i1 %cmp, label %fail_print, label %fail_void; + + + +fail_print: + call void @__abort_nvptx(i8* %str) noreturn + unreachable + +fail_void: + unreachable + +ok: + ret void +} + + +define void @__do_assert_varying(i8 *%str, %test, + %mask) { + %nottest = xor %test, + < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 > + %nottest_and_mask = and %nottest, %mask + %mm = call i64 @__movmsk( %nottest_and_mask) + %all_ok = icmp eq i64 %mm, 0 + br i1 %all_ok, label %ok, label %fail + +fail: + call void @__abort_nvptx(i8* %str) noreturn + unreachable + +ok: + ret void +} + +define i64 @__clock() nounwind alwaysinline { + %r = call i64 asm sideeffect "mov.b64 $0, %clock64;", "=l"(); + ret i64 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; atomics and memory barriers + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; global_atomic_associative +;; More efficient implementation for atomics that are associative (e.g., +;; add, and, ...). If a basic implementation would do something like: +;; result0 = atomic_op(ptr, val0) +;; result1 = atomic_op(ptr, val1) +;; .. +;; Then instead we can do: +;; tmp = (val0 op val1 op ...) +;; result0 = atomic_op(ptr, tmp) +;; result1 = (result0 op val0) +;; .. +;; And more efficiently compute the same result +;; +;; Takes five parameters: +;; $1: vector width of the target +;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names) +;; (add, sub...) +;; $3: return type of the LLVM atomic (e.g. i32) +;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32) +;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
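A minimal CUDA sketch of the add case of the transformation described above, for illustration only (self-contained, assuming a full 32-lane warp and a blockDim.x that is a multiple of 32). Note that the <1 x i32> definitions that follow simply issue the atomic per active thread, since each program instance is a single NVPTX thread; the sketch shows the aggregation strategy the comment describes rather than the code the patch generates:

// One atomic per warp instead of one per active lane: a single lane adds the
// warp total, then every lane reconstructs base + (sum of earlier lanes' values),
// which is what a per-lane atomic sequence would have returned.
__device__ int warp_aggregated_atomic_add(int *ptr, int val, bool active) {
    const unsigned FULL = 0xffffffffu;
    int lane = threadIdx.x & 31;
    int v = active ? val : 0;                  // identity for masked-off lanes
    int incl = v;                              // inclusive scan via the shfl.up ladder
    for (int offset = 1; offset < 32; offset <<= 1) {
        int up = __shfl_up_sync(FULL, incl, offset);
        if (lane >= offset) incl += up;
    }
    int prefix = incl - v;                     // val0 op ... op val(i-1)
    int base = 0;
    if (lane == 31)                            // lane 31 holds the warp-wide total
        base = atomicAdd(ptr, incl);           // result0 = atomic_op(ptr, tmp)
    base = __shfl_sync(FULL, base, 31);        // broadcast the old value
    return base + prefix;                      // result_i = (result0 op earlier vals)
}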
+;; add +define <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} +;; sub +define <1 x i32> @__atomic_sub_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %nvalv = sub <1 x i32> , %valv + %ret = call <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %nvalv, <1 x i1> %maskv); + ret <1 x i32> %ret; +} +;; and +define <1 x i32> @__atomic_and_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} +;; or +define <1 x i32> @__atomic_or_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} +;; xor +define <1 x i32> @__atomic_xor_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} + +;;;;;;;;; int64 +define <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} +define <1 x i64> @__atomic_sub_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %nvalv = sub <1 x i64> , %valv + %ret = call <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %nvalv, <1 x i1> %maskv); + ret <1 x i64> %ret; +} + +;; and +define <1 x i64> @__atomic_and_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %andr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %andr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} + +;; or +define <1 x i64> @__atomic_or_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline 
+{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %orr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %orr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} + +;; xor +define <1 x i64> @__atomic_xor_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %xorr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %xorr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; global_atomic_uniform +;; Defines the implementation of a function that handles the mapping from +;; an ispc atomic function to the underlying LLVM intrinsics. This variant +;; just calls the atomic once, for the given uniform value +;; +;; Takes four parameters: +;; $1: vector width of the target +;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names) +;; (add, sub...) +;; $3: return type of the LLVM atomic (e.g. i32) +;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) + +define internal i32 @__get_first_active_lane() +{ + %nact = call i32 @__ballot_nvptx(i1 true); + %lane1 = call i32 @__count_leading_zeros_i32(i32 %nact) + %lane = sub i32 31, %lane1 + ret i32 %lane +} + +define internal i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_sub_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %nval = sub i32 0, %val; + %old = tail call i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %nval); + ret i32 %old; +} +define internal i32 @__atomic_and_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_or_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_xor_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_min_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.min.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_max_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.max.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_umin_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind 
alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.min.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_umax_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.max.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} + + +define internal i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_sub_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %nval = sub i64 0, %val; + %old = tail call i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %nval); + ret i64 %old; +} +define internal i64 @__atomic_and_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_or_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_xor_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_min_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.min.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_max_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.max.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_umin_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.min.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_umax_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.max.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} + +define(`global_atomic',` +define <1 x $3> @__atomic_$2_$4_global($3* %ptr, <1 x $3> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x $3> %valv to $3 + br i1 %mask, label %exec, label %pass +exec: + %old = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val); + %oldv = bitcast $3 %old to <1 x $3> + ret <1 x $3> %oldv +pass: + ret <1 x $3> %valv +} +') +define(`global_atomic_uniform',` +define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline +{ +entry: + %addr = ptrtoint $3 * %ptr to i64 + %active = call i32 @__get_first_active_lane(); + %lane = call i32 @__program_index(); + %c = icmp eq i32 %lane, %active + br i1 %c, label %p1, label %p2 + +p1: + %t0 = call $3 
@__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val); + br label %p2; + +p2: + %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry] + %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active) + ret $3 %old; +} +') +define(`global_atomic_varying',` +define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %addr = bitcast <1 x i64> %ptr to i64 + %c = bitcast <1 x i1> %maskv to i1 + br i1 %c, label %p1, label %p2 + +p1: + %sv = bitcast <1 x $3> %val to $3 + %sptr = inttoptr i64 %addr to $3* + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sv); + %t0v = bitcast $3 %t0 to <1 x $3> + ret < 1x $3> %t0v + +p2: + ret <1 x $3> %val +} +') + + +global_atomic_uniform(1, add, i32, int32) +global_atomic_uniform(1, sub, i32, int32) +global_atomic_uniform(1, and, i32, int32) +global_atomic_uniform(1, or, i32, int32) +global_atomic_uniform(1, xor, i32, int32) +global_atomic_uniform(1, min, i32, int32) +global_atomic_uniform(1, max, i32, int32) +global_atomic_uniform(1, umin, i32, uint32) +global_atomic_uniform(1, umax, i32, uint32) + +global_atomic_uniform(1, add, i64, int64) +global_atomic_uniform(1, sub, i64, int64) +global_atomic_uniform(1, and, i64, int64) +global_atomic_uniform(1, or, i64, int64) +global_atomic_uniform(1, xor, i64, int64) +global_atomic_uniform(1, min, i64, int64) +global_atomic_uniform(1, max, i64, int64) +global_atomic_uniform(1, umin, i64, uint64) +global_atomic_uniform(1, umax, i64, uint64) + +global_atomic_varying(1, add, i32, int32) +global_atomic_varying(1, sub, i32, int32) +global_atomic_varying(1, and, i32, int32) +global_atomic_varying(1, or, i32, int32) +global_atomic_varying(1, xor, i32, int32) +global_atomic_varying(1, min, i32, int32) +global_atomic_varying(1, max, i32, int32) +global_atomic_varying(1, umin, i32, uint32) +global_atomic_varying(1, umax, i32, uint32) + +global_atomic_varying(1, add, i64, int64) +global_atomic_varying(1, sub, i64, int64) +global_atomic_varying(1, and, i64, int64) +global_atomic_varying(1, or, i64, int64) +global_atomic_varying(1, xor, i64, int64) +global_atomic_varying(1, min, i64, int64) +global_atomic_varying(1, max, i64, int64) +global_atomic_varying(1, umin, i64, uint64) +global_atomic_varying(1, umax, i64, uint64) + +;; Macro to declare the function that implements the swap atomic. +;; Takes three parameters: +;; $1: vector width of the target +;; $2: llvm type of the vector elements (e.g. i32) +;; $3: ispc type of the elements (e.g. 
int32) + +define internal i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.exch.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i64 @__atomic_swap_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.exch.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal float @__atomic_swap_uniform_float_global_nvptx(float* %ptr, float %val) nounwind alwaysinline +{ + %ptrI = bitcast float* %ptr to i32* + %valI = bitcast float %val to i32 + %retI = call i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %ptrI, i32 %valI) + %ret = bitcast i32 %retI to float + ret float %ret +} +define internal double @__atomic_swap_uniform_double_global_nvptx(double* %ptr, double %val) nounwind alwaysinline +{ + %ptrI = bitcast double* %ptr to i64* + %valI = bitcast double %val to i64 + %retI = call i64 @__atomic_swap_uniform_int64_global_nvptx(i64* %ptrI, i64 %valI) + %ret = bitcast i64 %retI to double + ret double %ret +} +global_atomic_uniform(1, swap, i32, int32) +global_atomic_uniform(1, swap, i64, int64) +global_atomic_uniform(1, swap, float, float) +global_atomic_uniform(1, swap, double, double) +global_atomic_varying(1, swap, i32, int32) +global_atomic_varying(1, swap, i64, int64) +global_atomic_varying(1, swap, float, float) +global_atomic_varying(1, swap, double, double) + + +;; Similarly, macro to declare the function that implements the compare/exchange +;; atomic. Takes three parameters: +;; $1: vector width of the target +;; $2: llvm type of the vector elements (e.g. i32) +;; $3: ispc type of the elements (e.g. 
int32) + +define internal i32 @__atomic_compare_exchange_uniform_int32_global_nvptx(i32* %ptr, i32 %cmp, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.cas.b32 $0, [$1], $2, $3;", "=r,l,r,r"(i64 %addr, i32 %cmp, i32 %val); + ret i32 %old; +} +define internal i64 @__atomic_compare_exchange_uniform_int64_global_nvptx(i64* %ptr, i64 %cmp, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.cas.b64 $0, [$1], $2, $3;", "=l,l,l,l"(i64 %addr, i64 %cmp, i64 %val); + ret i64 %old; +} +define internal float @__atomic_compare_exchange_uniform_float_global_nvptx(float* %ptr, float %cmp, float %val) nounwind alwaysinline +{ + %ptrI = bitcast float* %ptr to i32* + %cmpI = bitcast float %cmp to i32 + %valI = bitcast float %val to i32 + %retI = call i32 @__atomic_compare_exchange_uniform_int32_global_nvptx(i32* %ptrI, i32 %cmpI, i32 %valI) + %ret = bitcast i32 %retI to float + ret float %ret +} +define internal double @__atomic_compare_exchange_uniform_double_global_nvptx(double* %ptr, double %cmp, double %val) nounwind alwaysinline +{ + %ptrI = bitcast double* %ptr to i64* + %cmpI = bitcast double %cmp to i64 + %valI = bitcast double %val to i64 + %retI = call i64 @__atomic_compare_exchange_uniform_int64_global_nvptx(i64* %ptrI, i64 %cmpI, i64 %valI) + %ret = bitcast i64 %retI to double + ret double %ret +} + +;;;;;;;;;;;; +define(`global_atomic_cas',` +define <1 x $3> @__atomic_$2_$4_global($3* %ptr, <1 x $3> %cmpv, <1 x $3> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %cmp = bitcast <1 x $3> %cmpv to $3 + %val = bitcast <1 x $3> %valv to $3 + br i1 %mask, label %exec, label %pass +exec: + %old = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %cmp, $3 %val); + %oldv = bitcast $3 %old to <1 x $3> + ret <1 x $3> %oldv +pass: + ret <1 x $3> %valv +} +') +define(`global_atomic_cas_uniform',` +define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %cmp, $3 %val) nounwind alwaysinline +{ +entry: + %addr = ptrtoint $3 * %ptr to i64 + %active = call i32 @__get_first_active_lane(); + %lane = call i32 @__program_index(); + %c = icmp eq i32 %lane, %active + br i1 %c, label %p1, label %p2 + +p1: + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %cmp, $3 %val); + br label %p2; + +p2: + %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry] + %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active) + ret $3 %old; +} +') +define(`global_atomic_cas_varying',` +define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %cmp, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %addr = bitcast <1 x i64> %ptr to i64 + %c = bitcast <1 x i1> %maskv to i1 + br i1 %c, label %p1, label %p2 + +p1: + %sv = bitcast <1 x $3> %val to $3 + %sc = bitcast <1 x $3> %cmp to $3 + %sptr = inttoptr i64 %addr to $3* + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sc, $3 %sv); + %t0v = bitcast $3 %t0 to <1 x $3> + ret < 1x $3> %t0v + +p2: + ret <1 x $3> %val +} +') + +global_atomic_cas_uniform(1, compare_exchange, i32, int32) +global_atomic_cas_uniform(1, compare_exchange, i64, int64) +global_atomic_cas_uniform(1, compare_exchange, float, float) +global_atomic_cas_uniform(1, compare_exchange, double, double) +global_atomic_cas_varying(1, compare_exchange, i32, int32) +global_atomic_cas_varying(1, compare_exchange, i64, int64) +global_atomic_cas_varying(1, compare_exchange, float, float) 
+global_atomic_cas_varying(1, compare_exchange, double, double) +global_atomic_cas(1, compare_exchange, i32, int32) +global_atomic_cas(1, compare_exchange, i64, int64) +global_atomic_cas(1, compare_exchange, float, float) +global_atomic_cas(1, compare_exchange, double, double) + + + + +declare void @llvm.nvvm.membar.gl() +declare void @llvm.nvvm.membar.sys() +declare void @llvm.nvvm.membar.cta() + +define void @__memory_barrier() nounwind readnone alwaysinline { + ;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we + ;; only get an MFENCE on x86 if "device" is true, but IMHO we should + ;; in the case where the first 4 args are true but it is false. + ;; So we just always set that to true... + call void @llvm.nvvm.membar.gl() + ret void +} diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index ad1d88bc..b20fdfb4 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { define_avgs() +declare_nvptx() diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 50dd0582..e1f9b2c8 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { %call = call i64 @llvm.ctpop.i64(i64 %0) ret i64 %call } + +declare_nvptx() diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4 new file mode 100644 index 00000000..19fcf68c --- /dev/null +++ b/builtins/util-nvptx.m4 @@ -0,0 +1,3417 @@ +;; Copyright (c) 2010-2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +;; This file provides a variety of macros used to generate LLVM bitcode +;; parametrized in various ways. Implementations of the standard library +;; builtins for various targets can use macros from this file to simplify +;; generating code for their implementations of those builtins. 
+ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; It is a bit of a pain to compute this in m4 for 32 and 64-wide targets... +define(`ALL_ON_MASK', +`ifelse(WIDTH, `64', `-1', + WIDTH, `32', `4294967295', + `eval((1< $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector +define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Helper macro for calling various SSE instructions for scalar values +;; but where the instruction takes a vector parameter. +;; $1 : name of variable to put the final value in +;; $2 : vector width of the target +;; $3 : scalar type of the operand +;; $4 : SSE intrinsic name +;; $5 : variable name that has the scalar value +;; For example, the following call causes the variable %ret to have +;; the result of a call to sqrtss with the scalar value in %0 +;; sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0) + +define(`sse_unary_scalar', ` + %$1_vec = insertelement <$2 x $3> undef, $3 $5, i32 0 + %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_vec) + %$1 = extractelement <$2 x $3> %$1_val, i32 0 +') + +;; Similar to `sse_unary_scalar', this helper macro is for calling binary +;; SSE instructions with scalar values, +;; $1: name of variable to put the result in +;; $2: vector width of the target +;; $3: scalar type of the operand +;; $4 : SSE intrinsic name +;; $5 : variable name that has the first scalar operand +;; $6 : variable name that has the second scalar operand + +define(`sse_binary_scalar', ` + %$1_veca = insertelement <$2 x $3> undef, $3 $5, i32 0 + %$1_vecb = insertelement <$2 x $3> undef, $3 $6, i32 0 + %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_veca, <$2 x $3> %$1_vecb) + %$1 = extractelement <$2 x $3> %$1_val, i32 0 +') + +;; Do a reduction over a 4-wide vector +;; $1: type of final scalar result +;; $2: 4-wide function that takes 2 4-wide operands and returns the +;; element-wise reduction +;; $3: scalar function that takes two scalar operands and returns +;; the final reduction + +define(`reduce4', ` + %v1 = shufflevector <4 x $1> %0, <4 x $1> undef, + <4 x i32> + %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %0) + %m1a = extractelement <4 x $1> %m1, i32 0 + %m1b = extractelement <4 x $1> %m1, i32 1 + %m = call $1 $3($1 %m1a, $1 %m1b) + ret $1 %m +' +) + +;; Similar to `reduce4', do a 
reduction over an 8-wide vector +;; $1: type of final scalar result +;; $2: 8-wide function that takes 2 8-wide operands and returns the +;; element-wise reduction +;; $3: scalar function that takes two scalar operands and returns +;; the final reduction + +define(`reduce8', ` + %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, + <8 x i32> + %m1 = call <8 x $1> $2(<8 x $1> %v1, <8 x $1> %0) + %v2 = shufflevector <8 x $1> %m1, <8 x $1> undef, + <8 x i32> + %m2 = call <8 x $1> $2(<8 x $1> %v2, <8 x $1> %m1) + %m2a = extractelement <8 x $1> %m2, i32 0 + %m2b = extractelement <8 x $1> %m2, i32 1 + %m = call $1 $3($1 %m2a, $1 %m2b) + ret $1 %m +' +) + +define(`reduce16', ` + %v1 = shufflevector <16 x $1> %0, <16 x $1> undef, + <16 x i32> + %m1 = call <16 x $1> $2(<16 x $1> %v1, <16 x $1> %0) + %v2 = shufflevector <16 x $1> %m1, <16 x $1> undef, + <16 x i32> + %m2 = call <16 x $1> $2(<16 x $1> %v2, <16 x $1> %m1) + %v3 = shufflevector <16 x $1> %m2, <16 x $1> undef, + <16 x i32> + %m3 = call <16 x $1> $2(<16 x $1> %v3, <16 x $1> %m2) + + %m3a = extractelement <16 x $1> %m3, i32 0 + %m3b = extractelement <16 x $1> %m3, i32 1 + %m = call $1 $3($1 %m3a, $1 %m3b) + ret $1 %m +' +) + +;; Do an reduction over an 8-wide vector, using a vector reduction function +;; that only takes 4-wide vectors +;; $1: type of final scalar result +;; $2: 4-wide function that takes 2 4-wide operands and returns the +;; element-wise reduction +;; $3: scalar function that takes two scalar operands and returns +;; the final reduction + +define(`reduce8by4', ` + v8tov4($1, %0, %v1, %v2) + %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) + %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, + <4 x i32> + %m2 = call <4 x $1> $2(<4 x $1> %v3, <4 x $1> %m1) + %m2a = extractelement <4 x $1> %m2, i32 0 + %m2b = extractelement <4 x $1> %m2, i32 1 + %m = call $1 $3($1 %m2a, $1 %m2b) + ret $1 %m +' +) + + +;; Apply a unary function to the 4-vector in %0, return the vector result. 
+;; $1: scalar type of result +;; $2: name of scalar function to call + +define(`unary1to4', ` + %v_0 = extractelement <4 x $1> %0, i32 0 + %r_0 = call $1 $2($1 %v_0) + %ret_0 = insertelement <4 x $1> undef, $1 %r_0, i32 0 + %v_1 = extractelement <4 x $1> %0, i32 1 + %r_1 = call $1 $2($1 %v_1) + %ret_1 = insertelement <4 x $1> %ret_0, $1 %r_1, i32 1 + %v_2 = extractelement <4 x $1> %0, i32 2 + %r_2 = call $1 $2($1 %v_2) + %ret_2 = insertelement <4 x $1> %ret_1, $1 %r_2, i32 2 + %v_3 = extractelement <4 x $1> %0, i32 3 + %r_3 = call $1 $2($1 %v_3) + %ret_3 = insertelement <4 x $1> %ret_2, $1 %r_3, i32 3 + ret <4 x $1> %ret_3 +') + +define(`unary1to8', ` + %v_0 = extractelement <8 x $1> %0, i32 0 + %r_0 = call $1 $2($1 %v_0) + %ret_0 = insertelement <8 x $1> undef, $1 %r_0, i32 0 + %v_1 = extractelement <8 x $1> %0, i32 1 + %r_1 = call $1 $2($1 %v_1) + %ret_1 = insertelement <8 x $1> %ret_0, $1 %r_1, i32 1 + %v_2 = extractelement <8 x $1> %0, i32 2 + %r_2 = call $1 $2($1 %v_2) + %ret_2 = insertelement <8 x $1> %ret_1, $1 %r_2, i32 2 + %v_3 = extractelement <8 x $1> %0, i32 3 + %r_3 = call $1 $2($1 %v_3) + %ret_3 = insertelement <8 x $1> %ret_2, $1 %r_3, i32 3 + %v_4 = extractelement <8 x $1> %0, i32 4 + %r_4 = call $1 $2($1 %v_4) + %ret_4 = insertelement <8 x $1> %ret_3, $1 %r_4, i32 4 + %v_5 = extractelement <8 x $1> %0, i32 5 + %r_5 = call $1 $2($1 %v_5) + %ret_5 = insertelement <8 x $1> %ret_4, $1 %r_5, i32 5 + %v_6 = extractelement <8 x $1> %0, i32 6 + %r_6 = call $1 $2($1 %v_6) + %ret_6 = insertelement <8 x $1> %ret_5, $1 %r_6, i32 6 + %v_7 = extractelement <8 x $1> %0, i32 7 + %r_7 = call $1 $2($1 %v_7) + %ret_7 = insertelement <8 x $1> %ret_6, $1 %r_7, i32 7 + ret <8 x $1> %ret_7 +') + +;; Given a unary function that takes a 2-wide vector and a 4-wide vector +;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide +;; vector, apply it, and return the corresponding 4-wide vector result +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide unary vector function to apply +;; $4: 4-wide operand value + +define(`unary2to4', ` + %$1_0 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> +' +) + +;; Similar to `unary2to4', this applies a 2-wide binary function to two 4-wide +;; vector operands +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide binary vector function to apply +;; $4: First 4-wide operand value +;; $5: Second 4-wide operand value + +define(`binary2to4', ` +%$1_0a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> +%$1_0b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> +%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) +%$1_1a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> +%$1_1b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> +%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) +%$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> +' +) + +;; Similar to `unary2to4', this maps a 4-wide unary function to an 8-wide +;; vector operand +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 4-wide unary vector function to apply +;; $4: 8-wide operand value + +define(`unary4to8', ` + %__$1_0 = 
shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> +' +) + +define(`unary4to16', ` + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) + + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b, + <16 x i32> +' +) + +;; And so forth... 
+;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 8-wide unary vector function to apply +;; $4: 16-wide operand value + +define(`unary8to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> + %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> + %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1) + %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, + <16 x i32> +' +) + +;; And along the lines of `binary2to4', this maps a 4-wide binary function to +;; two 8-wide vector operands +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 4-wide unary vector function to apply +;; $4: First 8-wide operand value +;; $5: Second 8-wide operand value + +define(`binary4to8', ` +%$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> +%$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> +%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) +%$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> +%$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> +%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) +%$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + <8 x i32> +' +) + +define(`binary8to16', ` +%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> +%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, + <8 x i32> +%v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b) +%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> +%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, + <8 x i32> +%v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b) +%$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, + <16 x i32> +' +) + +define(`binary4to16', ` +%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) + +%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) + +%$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_2 = call <4 x $2> $3(<4 x $2> %$1_2a, <4 x $2> %$1_2b) + +%$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_3 = call <4 x $2> $3(<4 x $2> %$1_3a, <4 x $2> %$1_3b) + +%r$1_01 = shufflevector <4 x $2> %r$1_0, <4 x $2> %r$1_1, + <8 x i32> +%r$1_23 = shufflevector <4 x $2> %r$1_2, <4 x $2> %r$1_3, + <8 x i32> + +%$1 = shufflevector <8 x $2> %r$1_01, <8 x $2> %r$1_23, + <16 x i32> +') + +;; Maps a 2-wide unary function to an 8-wide vector operand, returning an +;; 8-wide vector result +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide unary vector function to apply +;; $4: 8-wide operand value + +define(`unary2to8', ` + %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> 
$3(<2 x $2> %$1_3) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + +;; Maps an 2-wide binary function to two 8-wide vector operands +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide unary vector function to apply +;; $4: First 8-wide operand value +;; $5: Second 8-wide operand value + +define(`binary2to8', ` + %$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_2b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> 
undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + +;; The unary SSE round intrinsic takes a second argument that encodes the +;; rounding mode. This macro makes it easier to apply the 4-wide roundps +;; to 8-wide vector operands +;; $1: value to be rounded +;; $2: integer encoding of rounding mode +;; FIXME: this just has a ret statement at the end to return the result, +;; which is inconsistent with the macros above + +define(`round4to8', ` +%v0 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> +%v1 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%ret = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +ret <8 x float> %ret +' +) + +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + +define(`round8to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, + <8 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, + <8 x i32> +%r0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v0, i32 $2) +%r1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v1, i32 $2) +%ret = shufflevector <8 x float> %r0, <8 x float> %r1, + <16 x i32> +ret <16 x float> %ret +' +) + +define(`round4to8double', ` +%v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> +%v1 = 
shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> +%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) +%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) +%ret = shufflevector <4 x double> %r0, <4 x double> %r1, + <8 x i32> +ret <8 x double> %ret +' +) + +; and similarly for doubles... + +define(`round2to4double', ` +%v0 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> +%v1 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> +%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) +%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) +%ret = shufflevector <2 x double> %r0, <2 x double> %r1, + <4 x i32> +ret <4 x double> %ret +' +) + +define(`round2to8double', ` +%v0 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%v1 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%v2 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%v3 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) +%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) +%r2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v2, i32 $2) +%r3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v3, i32 $2) +%ret0 = shufflevector <2 x double> %r0, <2 x double> %r1, + <4 x i32> +%ret1 = shufflevector <2 x double> %r2, <2 x double> %r3, + <4 x i32> +%ret = shufflevector <4 x double> %ret0, <4 x double> %ret1, + <8 x i32> +ret <8 x double> %ret +' +) + +define(`round4to16double', ` +%v0 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%v1 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%v2 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%v3 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) +%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) +%r2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v2, i32 $2) +%r3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v3, i32 $2) +%ret0 = shufflevector <4 x double> %r0, <4 x double> %r1, + <8 x i32> +%ret1 = shufflevector <4 x double> %r2, <4 x double> %r3, + <8 x i32> +%ret = shufflevector <8 x double> %ret0, <8 x double> %ret1, + <16 x i32> +ret <16 x double> %ret +' +) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; forloop macro + +divert(`-1') +# forloop(var, from, to, stmt) - improved version: +# works even if VAR is not a strict macro name +# performs sanity check that FROM is larger than TO +# allows complex numerical expressions in TO and FROM +define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1', + `pushdef(`$1', eval(`$2'))_$0(`$1', + eval(`$3'), `$4')popdef(`$1')')') +define(`_forloop', + `$3`'ifelse(indir(`$1'), `$2', `', + `define(`$1', incr(indir(`$1')))$0($@)')') +divert`'dnl + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; stdlib_core +;; +;; This macro defines a bunch of helper routines that depend on the +;; target's vector width +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define(`shuffles', ` +') + +define(`define_shuffles',` +shuffles(i8, 1) +shuffles(i16, 2) +shuffles(float, 4) +shuffles(i32, 4) +shuffles(double, 8) +shuffles(i64, 8) +') + + +define(`mask_converts', ` +define 
internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) { + ret <$1 x i64> %0 +} +') + +mask_converts(WIDTH) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; count trailing zeros + +define(`ctlztz', ` +declare_count_zeros() + +define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.cttz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.cttz.i64(i64 %0) + ret i64 %c +} + +define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.ctlz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.ctlz.i64(i64 %0) + ret i64 %c +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetching + +define(`define_prefetches', ` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, + i32 %cachetype) ; cachetype == 1 is dcache + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { 
+ call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) + ret void +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; AOS/SOA conversion primitives + +;; take 4 4-wide vectors laid out like ... +;; and reorder them to ... + +define(`aossoa', ` +declare void +@__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> %v3, <4 x float> * noalias %out0, + <4 x float> * noalias %out1, <4 x float> * noalias %out2, + <4 x float> * noalias %out3) nounwind alwaysinline ; + +;; Do the reverse of __aos_to_soa4_float4--reorder .. +;; to ... +;; This is the exact same set of operations that __soa_to_soa4_float4 does +;; (a 4x4 transpose), so just call that... + +declare void +@__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> %v3, <4 x float> * noalias %out0, + <4 x float> * noalias %out1, <4 x float> * noalias %out2, + <4 x float> * noalias %out3) nounwind alwaysinline; + +;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors +;; , transpose to +;; . + +declare void +@__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> * noalias %out0, <4 x float> * noalias %out1, + <4 x float> * noalias %out2) nounwind alwaysinline +;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors +;; to +;; . + +declare void +@__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> * noalias %out0, <4 x float> * noalias %out1, + <4 x float> * noalias %out2) nounwind alwaysinline +;; 8-wide +;; These functions implement the 8-wide variants of the AOS/SOA conversion +;; routines above. These implementations are all built on top of the 4-wide +;; vector versions. 
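For reference, a scalar sketch of the semantics the 3-wide AOS-to-SOA conversions provide (plain CUDA/C++ reference code; the function name and template parameter N are illustrative, and the real routines are implemented with vector shuffles rather than a loop):

// x0 y0 z0 x1 y1 z1 ... in memory  ->  all x together, all y together, all z together
template <int N>
__host__ __device__ void aos_to_soa3_ref(const float *p,
                                         float *out0, float *out1, float *out2) {
    for (int i = 0; i < N; ++i) {
        out0[i] = p[3 * i + 0];   // x components
        out1[i] = p[3 * i + 1];   // y components
        out2[i] = p[3 * i + 2];   // z components
    }
}
// The __soa_to_aos3_* routines are the inverse: p[3*i + k] = out_k[i];
// the 4-wide variants do the same with a stride of 4 (x, y, z, w).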
+ +declare void +@__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> %v3, <8 x float> * noalias %out0, + <8 x float> * noalias %out1, <8 x float> * noalias %out2, + <8 x float> * noalias %out3) nounwind alwaysinline + +declare void +@__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> %v3, <8 x float> * noalias %out0, + <8 x float> * noalias %out1, <8 x float> * noalias %out2, + <8 x float> * noalias %out3) nounwind alwaysinline + +declare void +@__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> * noalias %out0, <8 x float> * noalias %out1, + <8 x float> * noalias %out2) nounwind alwaysinline ; + + +declare void +@__soa_to_aos3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> * noalias %out0, <8 x float> * noalias %out1, + <8 x float> * noalias %out2) nounwind alwaysinline ; + +;; 16-wide + +declare void +@__aos_to_soa4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> %v3, <16 x float> * noalias %out0, + <16 x float> * noalias %out1, <16 x float> * noalias %out2, + <16 x float> * noalias %out3) nounwind alwaysinline ; + + +declare void +@__soa_to_aos4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> %v3, <16 x float> * noalias %out0, + <16 x float> * noalias %out1, <16 x float> * noalias %out2, + <16 x float> * noalias %out3) nounwind alwaysinline ; + +declare void +@__aos_to_soa3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> * noalias %out0, <16 x float> * noalias %out1, + <16 x float> * noalias %out2) nounwind alwaysinline ; + +declare void +@__soa_to_aos3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> * noalias %out0, <16 x float> * noalias %out1, + <16 x float> * noalias %out2) nounwind alwaysinline ; + +;; versions to be called from stdlib + +declare void +@__aos_to_soa4_float(float * noalias %p, + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) + nounwind alwaysinline ; + + +declare void +@__soa_to_aos4_float( %v0, %v1, %v2, + %v3, float * noalias %p) nounwind alwaysinline ; + + +declare void +@__aos_to_soa3_float(float * noalias %p, + * %out0, * %out1, + * %out2) nounwind alwaysinline ; + + +declare void +@__soa_to_aos3_float( %v0, %v1, %v2, + float * noalias %p) nounwind alwaysinline ; +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define(`masked_load_float_double', ` +define @__masked_load_float(i8 * %ptr, + %mask) readonly alwaysinline { + %v32 = call @__masked_load_i32(i8 * %ptr, %mask) + %vf = bitcast %v32 to + ret %vf +} + +define @__masked_load_double(i8 * %ptr, + %mask) readonly alwaysinline { + %v64 = call @__masked_load_i64(i8 * %ptr, %mask) + %vd = bitcast %v64 to + ret %vd +} + +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define(`masked_store_float_double', ` +define void @__masked_store_float( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_i32( * %ptr, %val, %2) + ret void +} + + +define void @__masked_store_double( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_i64( * %ptr, %val, %2) + ret void +} + +define void @__masked_store_blend_float( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_blend_i32( * %ptr, %val, 
%2) + ret void +} + + +define void @__masked_store_blend_double( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_blend_i64( * %ptr, %val, %2) + ret void +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +define(`stdlib_core', ` + +declare i32 @__fast_masked_vload() + +declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind + +declare i1 @__is_compile_time_constant_mask( %mask) +declare i1 @__is_compile_time_constant_uniform_int32(i32) +declare i1 @__is_compile_time_constant_varying_int32() + +; This function declares placeholder masked store functions for the +; front-end to use. +; +; void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask) +; void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask) +; void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask) +; void __pseudo_masked_store_float(uniform float *ptr, varying float values, mask) +; void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask) +; void __pseudo_masked_store_double(uniform double *ptr, varying double values, mask) +; +; These in turn are converted to native masked stores or to regular +; stores (if the mask is all on) by the MaskedStoreOptPass optimization +; pass. + +declare void @__pseudo_masked_store_i8( * nocapture, , ) +declare void @__pseudo_masked_store_i16( * nocapture, , ) +declare void @__pseudo_masked_store_i32( * nocapture, , ) +declare void @__pseudo_masked_store_float( * nocapture, , ) +declare void @__pseudo_masked_store_i64( * nocapture, , ) +declare void @__pseudo_masked_store_double( * nocapture, , ) + +; Declare the pseudo-gather functions. When the ispc front-end needs +; to perform a gather, it generates a call to one of these functions, +; which ideally have these signatures: +; +; varying int8 __pseudo_gather_i8(varying int8 *, mask) +; varying int16 __pseudo_gather_i16(varying int16 *, mask) +; varying int32 __pseudo_gather_i32(varying int32 *, mask) +; varying float __pseudo_gather_float(varying float *, mask) +; varying int64 __pseudo_gather_i64(varying int64 *, mask) +; varying double __pseudo_gather_double(varying double *, mask) +; +; However, vectors of pointers weren not legal in LLVM until recently, so +; instead, it emits calls to functions that either take vectors of int32s +; or int64s, depending on the compilation target. + +declare @__pseudo_gather32_i8(, ) nounwind readonly +declare @__pseudo_gather32_i16(, ) nounwind readonly +declare @__pseudo_gather32_i32(, ) nounwind readonly +declare @__pseudo_gather32_float(, ) nounwind readonly +declare @__pseudo_gather32_i64(, ) nounwind readonly +declare @__pseudo_gather32_double(, ) nounwind readonly + +declare @__pseudo_gather64_i8(, ) nounwind readonly +declare @__pseudo_gather64_i16(, ) nounwind readonly +declare @__pseudo_gather64_i32(, ) nounwind readonly +declare @__pseudo_gather64_float(, ) nounwind readonly +declare @__pseudo_gather64_i64(, ) nounwind readonly +declare @__pseudo_gather64_double(, ) nounwind readonly + +; The ImproveMemoryOps optimization pass finds these calls and then +; tries to convert them to be calls to gather functions that take a uniform +; base pointer and then a varying integer offset, when possible. 
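+;
+; As a rough sketch of that rewrite (illustrative only, not emitted into the
+; generated module), an ispc gather such as "val = buf[off]" is first issued as
+; something like
+;
+;     %val = call <WIDTH x i32> @__pseudo_gather64_i32(<WIDTH x i64> %addrs,
+;                                                      <WIDTH x MASK> %mask)
+;
+; and, when the per-lane addresses decompose into a common base plus per-lane
+; offsets, the pass rewrites the call into one of the base+offset forms
+; declared below; which form is chosen depends on the compilation target, as
+; described next.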
+; +; For targets without a native gather instruction, it is best to factor the +; integer offsets like "{1/2/4/8} * varying_offset + constant_offset", +; where varying_offset includes non-compile time constant values, and +; constant_offset includes compile-time constant values. (The scalar loads +; generated in turn can then take advantage of the free offsetting and scale by +; 1/2/4/8 that is offered by the x86 addresisng modes.) +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; int{32,64} offsets, uniform int32 offset_scale, +; int{32,64} offset_delta, mask) +; +; For targets with a gather instruction, it is better to just factor them into +; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the +; offsets are int32/64 vectors. +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; uniform int32 offset_scale, int{32,64} offsets, mask) + + +declare +@__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets32_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_double(i8 *, i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets64_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_double(i8 *, i32, , + ) nounwind readonly + +; Similarly to the pseudo-gathers defined above, we also declare undefined +; pseudo-scatter instructions with signatures: +; +; void __pseudo_scatter_i8 (varying int8 *, varying int8 values, mask) +; void __pseudo_scatter_i16(varying int16 *, varying int16 values, mask) +; void __pseudo_scatter_i32(varying int32 *, varying int32 values, mask) +; void __pseudo_scatter_float(varying float *, 
varying float values, mask) +; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask) +; void __pseudo_scatter_double(varying double *, varying double values, mask) +; + +declare void @__pseudo_scatter32_i8(, , ) nounwind +declare void @__pseudo_scatter32_i16(, , ) nounwind +declare void @__pseudo_scatter32_i32(, , ) nounwind +declare void @__pseudo_scatter32_float(, , ) nounwind +declare void @__pseudo_scatter32_i64(, , ) nounwind +declare void @__pseudo_scatter32_double(, , ) nounwind + +declare void @__pseudo_scatter64_i8(, , ) nounwind +declare void @__pseudo_scatter64_i16(, , ) nounwind +declare void @__pseudo_scatter64_i32(, , ) nounwind +declare void @__pseudo_scatter64_float(, , ) nounwind +declare void @__pseudo_scatter64_i64(, , ) nounwind +declare void @__pseudo_scatter64_double(, , ) nounwind + +; And the ImproveMemoryOps optimization pass also finds these and +; either transforms them to scatters like: +; +; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, +; varying int32 offsets, uniform int32 offset_scale, +; varying int{32,64} offset_delta, varying int8 values, mask) +; (and similarly for 16/32/64 bit values) +; +; Or, if the target has a native scatter instruction: +; +; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base, +; uniform int32 offset_scale, varying int{32,64} offsets, +; varying int8 values, mask) +; (and similarly for 16/32/64 bit values) + +declare void +@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_float(i8 * 
nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , + , ) nounwind + +declare float @__log_uniform_float(float) nounwind readnone +declare @__log_varying_float() nounwind readnone +declare float @__exp_uniform_float(float) nounwind readnone +declare @__exp_varying_float() nounwind readnone +declare float @__pow_uniform_float(float, float) nounwind readnone +declare @__pow_varying_float(, ) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +declare void @__use8() +declare void @__use16() +declare void @__use32() +declare void @__usefloat() +declare void @__use64() +declare void @__usedouble() + +;; This is a temporary function that will be removed at the end of +;; compilation--the idea is that it calls out to all of the various +;; functions / pseudo-function declarations that we need to keep around +;; so that they are available to the various optimization passes. This +;; then prevents those functions from being removed as dead code when +;; we do early DCE... + +define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, + %v32, %v64, + %mask) { + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; loads + %ml8 = call @__masked_load_i8(i8 * %ptr, %mask) + call void @__use8( %ml8) + %ml16 = call @__masked_load_i16(i8 * %ptr, %mask) + call void @__use16( %ml16) + %ml32 = call @__masked_load_i32(i8 * %ptr, %mask) + call void @__use32( %ml32) + %mlf = call @__masked_load_float(i8 * %ptr, %mask) + call void @__usefloat( %mlf) + %ml64 = call @__masked_load_i64(i8 * %ptr, %mask) + call void @__use64( %ml64) + %mld = call @__masked_load_double(i8 * %ptr, %mask) + call void @__usedouble( %mld) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; stores + %pv8 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i8( * %pv8, %v8, + %mask) + %pv16 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i16( * %pv16, %v16, + %mask) + %pv32 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i32( * %pv32, %v32, + %mask) + %vf = bitcast %v32 to + %pvf = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_float( * %pvf, %vf, + %mask) + %pv64 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i64( * %pv64, %v64, + %mask) + %vd = bitcast %v64 to + %pvd = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_double( * %pvd, %vd, + %mask) + + call void @__masked_store_i8( * %pv8, %v8, %mask) + call void @__masked_store_i16( * %pv16, %v16, %mask) + call void @__masked_store_i32( * %pv32, %v32, %mask) + call void @__masked_store_float( * %pvf, %vf, %mask) + call void @__masked_store_i64( * %pv64, %v64, %mask) + call void @__masked_store_double( * %pvd, %vd, %mask) + + call void @__masked_store_blend_i8( * %pv8, %v8, + %mask) + call void @__masked_store_blend_i16( * %pv16, %v16, + %mask) + call void @__masked_store_blend_i32( * %pv32, %v32, + %mask) + call void @__masked_store_blend_float( * %pvf, %vf, + %mask) + call void @__masked_store_blend_i64( * %pv64, %v64, + %mask) + call void @__masked_store_blend_double( * %pvd, %vd, + %mask) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; gathers + + %pg32_8 = call @__pseudo_gather32_i8( %v32, + %mask) + call void @__use8( %pg32_8) + %pg32_16 = call @__pseudo_gather32_i16( %v32, + %mask) + call void @__use16( %pg32_16) + %pg32_32 = call 
@__pseudo_gather32_i32( %v32, + %mask) + call void @__use32( %pg32_32) + %pg32_f = call @__pseudo_gather32_float( %v32, + %mask) + call void @__usefloat( %pg32_f) + %pg32_64 = call @__pseudo_gather32_i64( %v32, + %mask) + call void @__use64( %pg32_64) + %pg32_d = call @__pseudo_gather32_double( %v32, + %mask) + call void @__usedouble( %pg32_d) + + %pg64_8 = call @__pseudo_gather64_i8( %v64, + %mask) + call void @__use8( %pg64_8) + %pg64_16 = call @__pseudo_gather64_i16( %v64, + %mask) + call void @__use16( %pg64_16) + %pg64_32 = call @__pseudo_gather64_i32( %v64, + %mask) + call void @__use32( %pg64_32) + %pg64_f = call @__pseudo_gather64_float( %v64, + %mask) + call void @__usefloat( %pg64_f) + %pg64_64 = call @__pseudo_gather64_i64( %v64, + %mask) + call void @__use64( %pg64_64) + %pg64_d = call @__pseudo_gather64_double( %v64, + %mask) + call void @__usedouble( %pg64_d) + + %g32_8 = call @__gather32_i8( %v32, + %mask) + call void @__use8( %g32_8) + %g32_16 = call @__gather32_i16( %v32, + %mask) + call void @__use16( %g32_16) + %g32_32 = call @__gather32_i32( %v32, + %mask) + call void @__use32( %g32_32) + %g32_f = call @__gather32_float( %v32, + %mask) + call void @__usefloat( %g32_f) + %g32_64 = call @__gather32_i64( %v32, + %mask) + call void @__use64( %g32_64) + %g32_d = call @__gather32_double( %v32, + %mask) + call void @__usedouble( %g32_d) + + %g64_8 = call @__gather64_i8( %v64, + %mask) + call void @__use8( %g64_8) + %g64_16 = call @__gather64_i16( %v64, + %mask) + call void @__use16( %g64_16) + %g64_32 = call @__gather64_i32( %v64, + %mask) + call void @__use32( %g64_32) + %g64_f = call @__gather64_float( %v64, + %mask) + call void @__usefloat( %g64_f) + %g64_64 = call @__gather64_i64( %v64, + %mask) + call void @__use64( %g64_64) + %g64_d = call @__gather64_double( %v64, + %mask) + call void @__usedouble( %g64_d) + +ifelse(HAVE_GATHER, `1', +` + %nfpgbo32_8 = call + @__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfpgbo32_8) + %nfpgbo32_16 = call + @__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfpgbo32_16) + %nfpgbo32_32 = call + @__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfpgbo32_32) + %nfpgbo32_f = call + @__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfpgbo32_f) + %nfpgbo32_64 = call + @__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfpgbo32_64) + %nfpgbo32_d = call + @__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfpgbo32_d) + + %nfpgbo64_8 = call + @__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfpgbo64_8) + %nfpgbo64_16 = call + @__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfpgbo64_16) + %nfpgbo64_32 = call + @__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfpgbo64_32) + %nfpgbo64_f = call + @__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfpgbo64_f) + %nfpgbo64_64 = call + @__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfpgbo64_64) + %nfpgbo64_d = call + @__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfpgbo64_d) + + %nfgbo32_8 = call + @__gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( 
%nfgbo32_8) + %nfgbo32_16 = call + @__gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfgbo32_16) + %nfgbo32_32 = call + @__gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfgbo32_32) + %nfgbo32_f = call + @__gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfgbo32_f) + %nfgbo32_64 = call + @__gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfgbo32_64) + %nfgbo32_d = call + @__gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfgbo32_d) + + %nfgbo64_8 = call + @__gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfgbo64_8) + %nfgbo64_16 = call + @__gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfgbo64_16) + %nfgbo64_32 = call + @__gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfgbo64_32) + %nfgbo64_f = call + @__gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfgbo64_f) + %nfgbo64_64 = call + @__gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfgbo64_64) + %nfgbo64_d = call + @__gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfgbo64_d) +', +` + %pgbo32_8 = call + @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %pgbo32_8) + %pgbo32_16 = call + @__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %pgbo32_16) + %pgbo32_32 = call + @__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %pgbo32_32) + %pgbo32_f = call + @__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usefloat( %pgbo32_f) + %pgbo32_64 = call + @__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %pgbo32_64) + %pgbo32_d = call + @__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %pgbo32_d) + + %pgbo64_8 = call + @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use8( %pgbo64_8) + %pgbo64_16 = call + @__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use16( %pgbo64_16) + %pgbo64_32 = call + @__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use32( %pgbo64_32) + %pgbo64_f = call + @__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usefloat( %pgbo64_f) + %pgbo64_64 = call + @__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use64( %pgbo64_64) + %pgbo64_d = call + @__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usedouble( %pgbo64_d) + + %gbo32_8 = call + @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %gbo32_8) + %gbo32_16 = call + @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %gbo32_16) + %gbo32_32 = call + @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %gbo32_32) + %gbo32_f = call + @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void 
@__usefloat( %gbo32_f) + %gbo32_64 = call + @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %gbo32_64) + %gbo32_d = call + @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %gbo32_d) + + %gbo64_8 = call + @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use8( %gbo64_8) + %gbo64_16 = call + @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use16( %gbo64_16) + %gbo64_32 = call + @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use32( %gbo64_32) + %gbo64_f = call + @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usefloat( %gbo64_f) + %gbo64_64 = call + @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use64( %gbo64_64) + %gbo64_d = call + @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usedouble( %pgbo64_d) +') + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; scatters + + call void @__pseudo_scatter32_i8( %v32, %v8, %mask) + call void @__pseudo_scatter32_i16( %v32, %v16, %mask) + call void @__pseudo_scatter32_i32( %v32, %v32, %mask) + call void @__pseudo_scatter32_float( %v32, %vf, %mask) + call void @__pseudo_scatter32_i64( %v32, %v64, %mask) + call void @__pseudo_scatter32_double( %v32, %vd, %mask) + + call void @__pseudo_scatter64_i8( %v64, %v8, %mask) + call void @__pseudo_scatter64_i16( %v64, %v16, %mask) + call void @__pseudo_scatter64_i32( %v64, %v32, %mask) + call void @__pseudo_scatter64_float( %v64, %vf, %mask) + call void @__pseudo_scatter64_i64( %v64, %v64, %mask) + call void @__pseudo_scatter64_double( %v64, %vd, %mask) + + call void @__scatter32_i8( %v32, %v8, %mask) + call void @__scatter32_i16( %v32, %v16, %mask) + call void @__scatter32_i32( %v32, %v32, %mask) + call void @__scatter32_float( %v32, %vf, %mask) + call void @__scatter32_i64( %v32, %v64, %mask) + call void @__scatter32_double( %v32, %vd, %mask) + + call void @__scatter64_i8( %v64, %v8, %mask) + call void @__scatter64_i16( %v64, %v16, %mask) + call void @__scatter64_i32( %v64, %v32, %mask) + call void @__scatter64_float( %v64, %vf, %mask) + call void @__scatter64_i64( %v64, %v64, %mask) + call void @__scatter64_double( %v64, %vd, %mask) + +ifelse(HAVE_SCATTER, `1', +` + call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) + + call 
void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) +', +` + call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + %v32, %mask) + call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + %vd, %mask) + + call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + %v8, %mask) + call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + %v16, %mask) + call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + %v32, %mask) + call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + %vf, %mask) + call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + %v64, %mask) + call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + %vd, %mask) + + call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + %v8, %mask) + call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + %v16, %mask) + call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + %v32, %mask) + call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + %vf, %mask) + call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + %v64, %mask) + call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + %vd, %mask) +') + + ret void +} + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; various bitcasts from one type to another + +define 
@__intbits_varying_float() nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast %0 to + ret %float_to_int_bitcast +} + +define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast float %0 to i32 + ret i32 %float_to_int_bitcast +} + +define @__intbits_varying_double() nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast %0 to + ret %double_to_int_bitcast +} + +define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast double %0 to i64 + ret i64 %double_to_int_bitcast +} + +define @__floatbits_varying_int32() nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast %0 to + ret %int_to_float_bitcast +} + +define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast i32 %0 to float + ret float %int_to_float_bitcast +} + +define @__doublebits_varying_int64() nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast %0 to + ret %int_to_double_bitcast +} + +define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast i64 %0 to double + ret double %int_to_double_bitcast +} + +define @__undef_varying() nounwind readnone alwaysinline { + ret undef +} + +define float @__undef_uniform() nounwind readnone alwaysinline { + ret float undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sign extension + +define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { + %r = sext i1 %0 to i32 + ret i32 %r +} + +define @__sext_varying_bool() nounwind readnone alwaysinline { +;; ifelse(MASK,i32, `ret %0', +;; `%se = sext %0 to +;; ret %se') + ifelse(MASK,i32, `%se = bitcast %0 to ', + MASK,i64, `%se = trunc %0 to ', + `%se = sext %0 to ') + ret %se +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; memcpy/memmove/memset + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, + i32 %len, i32 %align, i1 %isvolatile) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, + i64 %len, i32 %align, i1 %isvolatile) + +declare void @__memcpy32(i8 * %dst, i8 * %src, i32 %len) alwaysinline; +declare void @__memcpy64(i8 * %dst, i8 * %src, i64 %len) alwaysinline; + +declare void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, + i32 %len, i32 %align, i1 %isvolatile) +declare void @llvm.memmove.p0i8.p0i8.i64(i8* %dest, i8* %src, + i64 %len, i32 %align, i1 %isvolatile) + +declare void @__memmove32(i8 * %dst, i8 * %src, i32 %len) alwaysinline; +declare void @__memmove64(i8 * %dst, i8 * %src, i64 %len) alwaysinline + +declare void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 %len, i32 %align, + i1 %isvolatile) +declare void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %len, i32 %align, + i1 %isvolatile) + +declare void @__memset32(i8 * %dst, i8 %val, i32 %len) alwaysinline ; +declare void @__memset64(i8 * %dst, i8 %val, i64 %len) alwaysinline; + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; new/delete + +;; Set of functions for 32 bit runtime. 
+;; They are different for Windows and Unix (Linux/MacOS), +;; on Windows we have to use _aligned_malloc/_aligned_free, +;; while on Unix we use posix_memalign/free +;; +;; Note that this should be really two different libraries for 32 and 64 +;; environment and it should happen sooner or later + +ifelse(WIDTH, 1, `define(`ALIGNMENT', `16')', `define(`ALIGNMENT', `eval(WIDTH*4)')') + +@memory_alignment = internal constant i32 ALIGNMENT + +ifelse(BUILD_OS, `UNIX', +` + +ifelse(RUNTIME, `32', +` + +;; Unix 32 bit environment. +;; Use: posix_memalign and free +;; Define: +;; - __new_uniform_32rt +;; - __new_varying32_32rt +;; - __delete_uniform_32rt +;; - __delete_varying_32rt + +declare i8* @malloc(i32) +declare i32 @posix_memalign(i8**, i32, i32) +declare void @free(i8 *) + +declare noalias i8 * @__new_uniform_32rt(i64 %size); +declare @__new_varying32_32rt( %size, %mask); +declare void @__delete_uniform_32rt(i8 * %ptr); +declare void @__delete_varying_32rt( %ptr, %mask); + +', +RUNTIME, `64', +` + +;; Unix 64 bit environment. +;; Use: posix_memalign and free +;; Define: +;; - __new_uniform_64rt +;; - __new_varying32_64rt +;; - __new_varying64_64rt +;; - __delete_uniform_64rt +;; - __delete_varying_64rt + +declare i8* @malloc(i64) +declare void @free(i8 *) + +define noalias i8 * @__new_uniform_64rt(i64 %size) +{ +entry: +;; compute laneIdx = __tid_x() & (__warpsize() - 1) + %and = call i32 @__program_index() +;; if (laneIdx == 0) + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call2 = tail call noalias i8* @malloc(i64 %size) + %phitmp = ptrtoint i8* %call2 to i64 + br label %if.end + +if.end: ; preds = %if.then, %entry + %ptr.0 = phi i64 [ %phitmp, %if.then ], [ undef, %entry ] + %val.sroa.0.0.extract.trunc = trunc i64 %ptr.0 to i32 + %call3 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.0.extract.trunc, i32 0) + %val.sroa.0.0.insert.ext = zext i32 %call3 to i64 + %val.sroa.0.4.extract.shift = lshr i64 %ptr.0, 32 + %val.sroa.0.4.extract.trunc = trunc i64 %val.sroa.0.4.extract.shift to i32 + %call8 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.4.extract.trunc, i32 0) + %val.sroa.0.4.insert.ext = zext i32 %call8 to i64 + %val.sroa.0.4.insert.shift = shl nuw i64 %val.sroa.0.4.insert.ext, 32 + %val.sroa.0.4.insert.insert = or i64 %val.sroa.0.4.insert.shift, %val.sroa.0.0.insert.ext + %0 = inttoptr i64 %val.sroa.0.4.insert.insert to i8* + ret i8* %0 +} +define void @__delete_uniform_64rt(i8 * %ptr) +{ +entry: + %and = call i32 @__program_index() + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @free(i8* %ptr) + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +define <1 x i64> @__new_varying32_64rt(<1 x i32> %sizev, <1 x i1> %maskv) +{ +entry: + %size32 = extractelement <1 x i32> %sizev, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + %size64 = zext i32 %size32 to i64 + br i1 %mask, label %alloc, label %skip + +alloc: + %ptr = tail call noalias i8* @malloc(i64 %size64) + %addr1 = ptrtoint i8* %ptr to i64 + br label %skip + +skip: + %addr64 = phi i64 [ %addr1, %alloc], [ 0, %entry ] + %addr = insertelement <1 x i64> undef, i64 %addr64, i32 0 + ret <1 x i64> %addr +} + +define <1 x i64> @__new_varying64_64rt(<1 x i64> %sizev, <1 x i1> %maskv) +{ +entry: + %size64 = extractelement <1 x i64> %sizev, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + br i1 %mask, label %alloc, label %skip + +alloc: + %ptr = tail call noalias i8* 
@malloc(i64 %size64) + %addr1 = ptrtoint i8* %ptr to i64 + br label %skip + +skip: + %addr64 = phi i64 [ %addr1, %alloc], [ 0, %entry ] + %addr = insertelement <1 x i64> undef, i64 %addr64, i32 0 + ret <1 x i64> %addr +} + +define void @__delete_varying_64rt(<1 x i64> %ptrv, <1 x i1> %maskv) +{ +entry: + %addr64 = extractelement <1 x i64> %ptrv, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + br i1 %mask, label %free, label %skip + +free: + %ptr = inttoptr i64 %addr64 to i8* + tail call void @free(i8* %ptr) + br label %skip + +skip: + ret void +} +', ` +errprint(`RUNTIME should be defined to either 32 or 64 +') +m4exit(`1') +') + +', +BUILD_OS, `WINDOWS', +` + +ifelse(RUNTIME, `32', +` + +;; Windows 32 bit environment. +;; Use: _aligned_malloc and _aligned_free +;; Define: +;; - __new_uniform_32rt +;; - __new_varying32_32rt +;; - __delete_uniform_32rt +;; - __delete_varying_32rt + +declare i8* @_aligned_malloc(i32, i32) +declare void @_aligned_free(i8 *) + +define noalias i8 * @__new_uniform_32rt(i64 %size) { + %conv = trunc i64 %size to i32 + %alignment = load i32* @memory_alignment + %ptr = tail call i8* @_aligned_malloc(i32 %conv, i32 %alignment) + ret i8* %ptr +} + +define @__new_varying32_32rt( %size, %mask) { + %ret = alloca + store zeroinitializer, * %ret + %ret64 = bitcast * %ret to i64 * + %alignment = load i32* @memory_alignment + + per_lane(WIDTH, %mask, ` + %sz_LANE_ID = extractelement %size, i32 LANE + %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i32 %sz_LANE_ID, i32 %alignment) + %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 + %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') + + %r = load * %ret + ret %r +} + +define void @__delete_uniform_32rt(i8 * %ptr) { + call void @_aligned_free(i8 * %ptr) + ret void +} + +define void @__delete_varying_32rt( %ptr, %mask) { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 * + call void @_aligned_free(i8 * %ptr_LANE_ID) + ') + ret void +} + +', +RUNTIME, `64', +` + +;; Windows 64 bit environment. 
+;; Use: _aligned_malloc and _aligned_free +;; Define: +;; - __new_uniform_64rt +;; - __new_varying32_64rt +;; - __new_varying64_64rt +;; - __delete_uniform_64rt +;; - __delete_varying_64rt + +declare i8* @_aligned_malloc(i64, i64) +declare void @_aligned_free(i8 *) + +define noalias i8 * @__new_uniform_64rt(i64 %size) { + %alignment = load i32* @memory_alignment + %alignment64 = sext i32 %alignment to i64 + %ptr = tail call i8* @_aligned_malloc(i64 %size, i64 %alignment64) + ret i8* %ptr +} + +define @__new_varying32_64rt( %size, %mask) { + %ret = alloca + store zeroinitializer, * %ret + %ret64 = bitcast * %ret to i64 * + %alignment = load i32* @memory_alignment + %alignment64 = sext i32 %alignment to i64 + + per_lane(WIDTH, %mask, ` + %sz_LANE_ID = extractelement %size, i32 LANE + %sz64_LANE_ID = zext i32 %sz_LANE_ID to i64 + %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) + %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 + %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') + + %r = load * %ret + ret %r +} + +define @__new_varying64_64rt( %size, %mask) { + %ret = alloca + store zeroinitializer, * %ret + %ret64 = bitcast * %ret to i64 * + %alignment = load i32* @memory_alignment + %alignment64 = sext i32 %alignment to i64 + + per_lane(WIDTH, %mask, ` + %sz64_LANE_ID = extractelement %size, i32 LANE + %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) + %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 + %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') + + %r = load * %ret + ret %r +} + +define void @__delete_uniform_64rt(i8 * %ptr) { + call void @_aligned_free(i8 * %ptr) + ret void +} + +define void @__delete_varying_64rt( %ptr, %mask) { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 * + call void @_aligned_free(i8 * %ptr_LANE_ID) + ') + ret void +} + +', ` +errprint(`RUNTIME should be defined to either 32 or 64 +') +m4exit(`1') +') + +', +` +errprint(`BUILD_OS should be defined to either UNIX or WINDOWS +') +m4exit(`1') +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; read hw clock + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; stdlib transcendentals +;; +;; These functions provide entrypoints that call out to the libm +;; implementations of the transcendental functions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +declare float @sinf(float) nounwind readnone +declare float @cosf(float) nounwind readnone +declare void @sincosf(float, float *, float *) nounwind readnone +declare float @asinf(float) nounwind readnone +declare float @acosf(float) nounwind readnone +declare float @tanf(float) nounwind readnone +declare float @atanf(float) nounwind readnone +declare float @atan2f(float, float) nounwind readnone +declare float @expf(float) nounwind readnone +declare float @logf(float) nounwind readnone +declare float @powf(float, float) nounwind readnone + +define float @__stdlib_sinf(float) nounwind readnone alwaysinline { + %r = call float @sinf(float %0) + ret float %r +} + +define float @__stdlib_cosf(float) nounwind readnone alwaysinline { + %r = call float @cosf(float %0) + ret float %r +} + +define void @__stdlib_sincosf(float, float *, float *) nounwind readnone alwaysinline { + call void @sincosf(float %0, 
float *%1, float *%2) + ret void +} + +define float @__stdlib_asinf(float) nounwind readnone alwaysinline { + %r = call float @asinf(float %0) + ret float %r +} + +define float @__stdlib_acosf(float) nounwind readnone alwaysinline { + %r = call float @acosf(float %0) + ret float %r +} + +define float @__stdlib_tanf(float) nounwind readnone alwaysinline { + %r = call float @tanf(float %0) + ret float %r +} + +define float @__stdlib_atanf(float) nounwind readnone alwaysinline { + %r = call float @atanf(float %0) + ret float %r +} + +define float @__stdlib_atan2f(float, float) nounwind readnone alwaysinline { + %r = call float @atan2f(float %0, float %1) + ret float %r +} + +define float @__stdlib_logf(float) nounwind readnone alwaysinline { + %r = call float @logf(float %0) + ret float %r +} + +define float @__stdlib_expf(float) nounwind readnone alwaysinline { + %r = call float @expf(float %0) + ret float %r +} + +define float @__stdlib_powf(float, float) nounwind readnone alwaysinline { + %r = call float @powf(float %0, float %1) + ret float %r +} + +declare double @sin(double) nounwind readnone +declare double @asin(double) nounwind readnone +declare double @cos(double) nounwind readnone +declare void @sincos(double, double *, double *) nounwind readnone +declare double @tan(double) nounwind readnone +declare double @atan(double) nounwind readnone +declare double @atan2(double, double) nounwind readnone +declare double @exp(double) nounwind readnone +declare double @log(double) nounwind readnone +declare double @pow(double, double) nounwind readnone + +define double @__stdlib_sin(double) nounwind readnone alwaysinline { + %r = call double @sin(double %0) + ret double %r +} + +define double @__stdlib_asin(double) nounwind readnone alwaysinline { + %r = call double @asin(double %0) + ret double %r +} + +define double @__stdlib_cos(double) nounwind readnone alwaysinline { + %r = call double @cos(double %0) + ret double %r +} + +define void @__stdlib_sincos(double, double *, double *) nounwind readnone alwaysinline { + call void @sincos(double %0, double *%1, double *%2) + ret void +} + +define double @__stdlib_tan(double) nounwind readnone alwaysinline { + %r = call double @tan(double %0) + ret double %r +} + +define double @__stdlib_atan(double) nounwind readnone alwaysinline { + %r = call double @atan(double %0) + ret double %r +} + +define double @__stdlib_atan2(double, double) nounwind readnone alwaysinline { + %r = call double @atan2(double %0, double %1) + ret double %r +} + +define double @__stdlib_log(double) nounwind readnone alwaysinline { + %r = call double @log(double %0) + ret double %r +} + +define double @__stdlib_exp(double) nounwind readnone alwaysinline { + %r = call double @exp(double %0) + ret double %r +} + +define double @__stdlib_pow(double, double) nounwind readnone alwaysinline { + %r = call double @pow(double %0, double %1) + ret double %r +} + + +') + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 64-bit integer min and max functions + +;; utility function used by int64minmax below. This shouldn't be called by +;; target .ll files directly. 
+;; $1: target vector width +;; $2: {min,max} (used in constructing function names) +;; $3: {int64,uint64} (used in constructing function names) +;; $4: {slt,sgt} comparison operator to used + +define(`i64minmax', ` +define i64 @__$2_uniform_$3(i64, i64) nounwind alwaysinline readnone { + %c = icmp $4 i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline readnone { + %rptr = alloca <$1 x i64> + %r64ptr = bitcast <$1 x i64> * %rptr to i64 * + + forloop(i, 0, eval($1-1), ` + %v0_`'i = extractelement <$1 x i64> %0, i32 i + %v1_`'i = extractelement <$1 x i64> %1, i32 i + %c_`'i = icmp $4 i64 %v0_`'i, %v1_`'i + %v_`'i = select i1 %c_`'i, i64 %v0_`'i, i64 %v1_`'i + %ptr_`'i = getelementptr i64 * %r64ptr, i32 i + store i64 %v_`'i, i64 * %ptr_`'i +') + + %ret = load <$1 x i64> * %rptr + ret <$1 x i64> %ret +} +') + +;; this is the function that target .ll files should call; it just takes the target +;; vector width as a parameter + +define(`int64minmax', ` +i64minmax(WIDTH,min,int64,slt) +i64minmax(WIDTH,max,int64,sgt) +i64minmax(WIDTH,min,uint64,ult) +i64minmax(WIDTH,max,uint64,ugt) +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Emit general-purpose code to do a masked load for targets that dont have +;; an instruction to do that. Parameters: +;; $1: element type for which to emit the function (i32, i64, ...) (and suffix for function name) +;; $2: alignment for elements of type $1 (4, 8, ...) + +define(`masked_load', ` +define @__masked_load_$1(i8 *, %mask) nounwind alwaysinline { +entry: + %mm = call i64 @__movmsk( %mask) + + ; if the first lane and the last lane are on, then it is safe to do a vector load + ; of the whole thing--what the lanes in the middle want turns out to not matter... + %mm_and_low = and i64 %mm, 1 + %mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON + %mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1) + %mm_and_low_i1 = trunc i64 %mm_and_low to i1 + %mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1 + %can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1 + + %fast32 = call i32 @__fast_masked_vload() + %fast_i1 = trunc i32 %fast32 to i1 + %can_vload_maybe_fast = or i1 %fast_i1, %can_vload + + ; if we are not able to do a singe vload, we will accumulate lanes in this memory.. + %retptr = alloca + %retptr32 = bitcast * %retptr to $1 * + br i1 %can_vload_maybe_fast, label %load, label %loop + +load: + %ptr = bitcast i8 * %0 to * + %valall = load * %ptr, align $2 + ret %valall + +loop: + ; loop over the lanes and see if each one is on... + %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ] + %lane64 = zext i32 %lane to i64 + %lanemask = shl i64 1, %lane64 + %mask_and = and i64 %mm, %lanemask + %do_lane = icmp ne i64 %mask_and, 0 + br i1 %do_lane, label %load_lane, label %lane_done + +load_lane: + ; yes! 
do the load and store the result into the appropriate place in the + ; allocaed memory above + %ptr32 = bitcast i8 * %0 to $1 * + %lane_ptr = getelementptr $1 * %ptr32, i32 %lane + %val = load $1 * %lane_ptr + %store_ptr = getelementptr $1 * %retptr32, i32 %lane + store $1 %val, $1 * %store_ptr + br label %lane_done + +lane_done: + %next_lane = add i32 %lane, 1 + %done = icmp eq i32 %lane, eval(WIDTH-1) + br i1 %done, label %return, label %loop + +return: + %r = load * %retptr + ret %r +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store +;; emit code to do masked store as a set of per-lane scalar stores +;; parameters: +;; $1: llvm type of elements (and suffix for function name) + +define(`gen_masked_store', ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` + %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE + %storeval_LANE_ID = extractelement %1, i32 LANE + store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') + ret void +} +') + +define(`masked_store_blend_8_16_by_4', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i32>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i32> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 %mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i32> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i32>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i32> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i32> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + +define(`masked_store_blend_8_16_by_4_mask64', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i64> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 %mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to 
i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i64> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + +define(`masked_store_blend_8_16_by_8', ` +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x i32>) nounwind alwaysinline { + %old = load <8 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <8 x i8> %old to i64 + %new64 = bitcast <8 x i8> %1 to i64 + + %mask8 = trunc <8 x i32> %2 to <8 x i8> + %mask64 = bitcast <8 x i8> %mask8 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <8 x i8> + ',` + %m = trunc <8 x i32> %2 to <8 x i1> + %resultvec = select <8 x i1> %m, <8 x i8> %1, <8 x i8> %old + ') + store <8 x i8> %resultvec, <8 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x i32>) nounwind alwaysinline { + %old = load <8 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old128 = bitcast <8 x i16> %old to i128 + %new128 = bitcast <8 x i16> %1 to i128 + + %mask16 = trunc <8 x i32> %2 to <8 x i16> + %mask128 = bitcast <8 x i16> %mask16 to i128 + %notmask128 = xor i128 %mask128, -1 + + %newmasked = and i128 %new128, %mask128 + %oldmasked = and i128 %old128, %notmask128 + %result = or i128 %newmasked, %oldmasked + + %resultvec = bitcast i128 %result to <8 x i16> + ',` + %m = trunc <8 x i32> %2 to <8 x i1> + %resultvec = select <8 x i1> %m, <8 x i16> %1, <8 x i16> %old + ') + store <8 x i16> %resultvec, <8 x i16> * %0, align 2 + ret void +} +') + + +define(`masked_store_blend_8_16_by_16', ` +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x i32>) nounwind alwaysinline { + %old = load <16 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old128 = bitcast <16 x i8> %old to i128 + %new128 = bitcast <16 x i8> %1 to i128 + + %mask8 = trunc <16 x i32> %2 to <16 x i8> + %mask128 = bitcast <16 x i8> %mask8 to i128 + %notmask128 = xor i128 %mask128, -1 + + %newmasked = and i128 %new128, %mask128 + %oldmasked = and i128 %old128, %notmask128 + %result = or i128 %newmasked, %oldmasked + + %resultvec = bitcast i128 %result to <16 x i8> + ',` + %m = trunc <16 x i32> %2 to <16 x i1> + %resultvec = select <16 x i1> %m, <16 x i8> %1, <16 x i8> %old + ') + store <16 x i8> %resultvec, <16 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x i32>) nounwind alwaysinline { + %old = load <16 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old256 = bitcast <16 x i16> %old to i256 + %new256 = bitcast <16 x i16> %1 to i256 + + %mask16 = trunc <16 x i32> %2 to <16 x i16> + %mask256 = bitcast <16 x i16> %mask16 to i256 + %notmask256 = xor i256 %mask256, -1 + + %newmasked = and i256 %new256, %mask256 + %oldmasked = and i256 %old256, %notmask256 + %result = or i256 %newmasked, %oldmasked + + %resultvec = bitcast i256 %result to <16 x i16> + ',` + %m = trunc <16 x i32> %2 to <16 x i1> + %resultvec = select <16 x i1> %m, <16 x 
i16> %1, <16 x i16> %old + ') + store <16 x i16> %resultvec, <16 x i16> * %0, align 2 + ret void +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; packed load and store functions +;; +;; These define functions to emulate those nice packed load and packed store +;; instructions. For packed store, given a pointer to destination array and +;; an offset into the array, for each lane where the mask is on, the +;; corresponding value for that lane is stored into packed locations in the +;; destination array. For packed load, each lane that has an active mask +;; loads a sequential value from the array. +;; +;; $1: vector width of the target +;; +;; FIXME: use the per_lane macro, defined below, to implement these! + +define(`packed_load_and_store', ` + +define i32 @__packed_load_active(i32 * %startptr, <1 x i32> * %val_ptr, + <1 x i1> %full_mask) nounwind alwaysinline { +entry: + %active = extractelement <1 x i1> %full_mask, i32 0 + %call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active) + %res.sroa.0.0.extract.trunc = trunc i64 %call to i32 + br i1 %active, label %if.then, label %if.end + +if.then: ; preds = %entry + %idxprom = ashr i64 %call, 32 + %arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom + %val = load i32* %arrayidx, align 4 + %valvec = insertelement <1 x i32> undef, i32 %val, i32 0 + store <1 x i32> %valvec, <1 x i32>* %val_ptr, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret i32 %res.sroa.0.0.extract.trunc +} + +define i32 @__packed_store_active(i32 * %startptr, %vals, + %full_mask) nounwind alwaysinline +{ +entry: + %active = extractelement <1 x i1> %full_mask, i32 0 + %call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active) + %res.sroa.0.0.extract.trunc = trunc i64 %call to i32 + br i1 %active, label %if.then, label %if.end + +if.then: ; preds = %entry + %idxprom = ashr i64 %call, 32 + %arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom + %val = extractelement <1 x i32> %vals, i32 0 + store i32 %val, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret i32 %res.sroa.0.0.extract.trunc +} + +define i32 @__packed_store_active2(i32 * %startptr, <1 x i32> %vals, + <1 x i1> %full_mask) nounwind alwaysinline +{ + %ret = call i32 @__packed_store_active(i32* %startptr, + <1 x i32> %vals, <1 x i1> %full_mask); + ret i32 %ret +} +') + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reduce_equal + +;; count leading/trailing zeros +;; Macros declares set of count-trailing and count-leading zeros. +;; Macros behaves as a static functon - it works only at first invokation +;; to avoid redifinition. 
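+;;
+;; As a small illustration (assuming a target .ll file that has pulled in this
+;; utility code), it is safe for that file to expand the macro twice:
+;;
+;;     declare_count_zeros()   ;; emits the llvm.ctlz/llvm.cttz declarations
+;;     declare_count_zeros()   ;; expands to nothing
+;;
+;; because the first expansion also sets the count_zeros_are_defined flag that
+;; the guard below checks.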
+define(`declare_count_zeros', ` +ifelse(count_zeros_are_defined, true, `', +` +declare i32 @llvm.ctlz.i32(i32) +declare i64 @llvm.ctlz.i64(i64) +declare i32 @llvm.cttz.i32(i32) +declare i64 @llvm.cttz.i64(i64) + +define(`count_zeros_are_defined', true) +') + +') + +define(`reduce_equal_aux', ` +declare_count_zeros() + +define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue, + <$1 x MASK> %mask) nounwind alwaysinline { +entry: + %mm = call i64 @__movmsk(<$1 x MASK> %mask) + %allon = icmp eq i64 %mm, ALL_ON_MASK + br i1 %allon, label %check_neighbors, label %domixed + +domixed: + ; First, figure out which lane is the first active one + %first = call i64 @llvm.cttz.i64(i64 %mm) + %first32 = trunc i64 %first to i32 + %baseval = extractelement <$1 x $2> %v, i32 %first32 + %basev1 = insertelement <$1 x $2> undef, $2 %baseval, i32 0 + ; get a vector that is that value smeared across all elements + %basesmear = shufflevector <$1 x $2> %basev1, <$1 x $2> undef, + <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 > + + ; now to a blend of that vector with the original vector, such that the + ; result will be the original value for the active lanes, and the value + ; from the first active lane for the inactive lanes. Given that, we can + ; just unconditionally check if the lanes are all equal in check_neighbors + ; below without worrying about inactive lanes... + %ptr = alloca <$1 x $2> + store <$1 x $2> %basesmear, <$1 x $2> * %ptr + %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> * + %castv = bitcast <$1 x $2> %v to <$1 x $4> + call void @__masked_store_blend_i$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask) + %blendvec = load <$1 x $2> * %ptr + br label %check_neighbors + +check_neighbors: + %vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ] + ifelse($6, `32', ` + ; For 32-bit elements, we rotate once and compare with the vector, which ends + ; up comparing each element to its neighbor on the right. Then see if + ; all of those values are true; if so, then all of the elements are equal.. + %castvec = bitcast <$1 x $2> %vec to <$1 x $4> + %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) + %vr = bitcast <$1 x $4> %castvr to <$1 x $2> + %eq = $5 $7 <$1 x $2> %vec, %vr + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') + %alleq = icmp eq i64 %eqmm, ALL_ON_MASK + br i1 %alleq, label %all_equal, label %not_all_equal + ', ` + ; But for 64-bit elements, it turns out to be more efficient to just + ; scalarize and do a individual pairwise comparisons and AND those + ; all together.. 
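+  ; (sketch of the expansion below for an assumed 4-wide int64 case:
+  ;    %v0 .. %v3   extract the four lanes of %vec
+  ;    %eq0 .. %eq2 compare each neighboring pair with icmp eq
+  ;                 (fcmp oeq is used instead for double elements)
+  ;    %and0 = and i1 %eq0, %eq1
+  ;    %and1 = and i1 %and0, %eq2
+  ;    br i1 %and1, label %all_equal, label %not_all_equal )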
+ forloop(i, 0, eval($1-1), ` + %v`'i = extractelement <$1 x $2> %vec, i32 i') + + forloop(i, 0, eval($1-2), ` + %eq`'i = $5 $7 $2 %v`'i, %v`'eval(i+1)') + + %and0 = and i1 %eq0, %eq1 + forloop(i, 1, eval($1-3), ` + %and`'i = and i1 %and`'eval(i-1), %eq`'eval(i+1)') + + br i1 %and`'eval($1-3), label %all_equal, label %not_all_equal + ') + +all_equal: + %the_value = extractelement <$1 x $2> %vec, i32 0 + store $2 %the_value, $2 * %samevalue + ret i1 true + +not_all_equal: + ret i1 false +} +') + +define(`reduce_equal', ` +reduce_equal_aux($1, i32, int32, i32, icmp, 32, eq) +reduce_equal_aux($1, float, float, i32, fcmp, 32, oeq) +reduce_equal_aux($1, i64, int64, i64, icmp, 64, eq) +reduce_equal_aux($1, double, double, i64, fcmp, 64, oeq) +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; per_lane +;; +;; The scary macro below encapsulates the 'scalarization' idiom--i.e. we have +;; some operation that we'd like to perform only for the lanes where the +;; mask is on +;; $1: vector width of the target +;; $2: variable that holds the mask +;; $3: block of code to run for each lane that is on +;; Inside this code, any instances of the text "LANE" are replaced +;; with an i32 value that represents the current lane number + +; num lanes, mask, code block to do per lane +define(`per_lane', ` + br label %pl_entry + +pl_entry: + %pl_mask = call i64 @__movmsk($2) + %pl_mask_known = call i1 @__is_compile_time_constant_mask($2) + br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask + +pl_known_mask: + ;; the mask is known at compile time; see if it is something we can + ;; handle more efficiently + %pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK + br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask + +pl_all_on: + ;; the mask is all on--just expand the code for each lane sequentially + forloop(i, 0, eval($1-1), + `patsubst(`$3', `LANE', i)') + br label %pl_done + +pl_unknown_mask: + ;; we just run the general case, though we could + ;; try to be smart and just emit the code based on what it actually is, + ;; for example by emitting the code straight-line without a loop and doing + ;; the lane tests explicitly, leaving later optimization passes to eliminate + ;; the stuff that is definitely not needed. Not clear if we will frequently + ;; encounter a mask that is known at compile-time but is not either all on or + ;; all off... + br label %pl_loop + +pl_loop: + ;; Loop over each lane and see if we want to do the work for this lane + %pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ] + %pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ] + + ; is the current lane on? if so, goto do work, otherwise to end of loop + %pl_and = and i64 %pl_mask, %pl_lanemask + %pl_doit = icmp eq i64 %pl_and, %pl_lanemask + br i1 %pl_doit, label %pl_dolane, label %pl_loopend + +pl_dolane: + ;; If so, substitute in the code from the caller and replace the LANE + ;; stuff with the current lane number + patsubst(`patsubst(`$3', `LANE_ID', `_id')', `LANE', `%pl_lane') + br label %pl_loopend + +pl_loopend: + %pl_nextlane = add i32 %pl_lane, 1 + %pl_nextlanemask = mul i64 %pl_lanemask, 2 + + ; are we done yet? 
+ %pl_test = icmp ne i32 %pl_nextlane, $1 + br i1 %pl_test, label %pl_loop, label %pl_done + +pl_done: +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather +;; +;; $1: scalar type for which to generate functions to do gathers + +define(`gen_gather_general', ` +; fully general 32-bit gather, takes array of pointers encoded as vector of i32s +define @__gather32_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} + +; fully general 64-bit gather, takes array of pointers encoded as vector of i32s +define @__gather64_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} +') + +; vec width, type +define(`gen_gather_factored', ` +;; Define the utility function to do the gather operation for a single element +;; of the type +define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %ret, + i32 %lane) nounwind readonly alwaysinline { + ; compute address for this one from the base + %offset32 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %offset64 = sext i32 %offset32 to i64 + %scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta = extractelement %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + ; load value and insert into returned value + %ptrcast = bitcast i8 * %finalptr to $1 * + %val = load $1 *%ptrcast + %updatedret = insertelement %ret, $1 %val, i32 %lane + ret %updatedret +} + +define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %ret, + i32 %lane) nounwind readonly alwaysinline { + ; compute address for this one from the base + %offset64 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %offset_scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %offset_scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta64 = extractelement %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + ; load value and insert into returned value + %ptrcast = bitcast i8 * %finalptr to $1 * + %val = load $1 *%ptrcast + %updatedret = insertelement %ret, $1 %val, i32 %lane + ret %updatedret +} + + +define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, + %vecmask) nounwind readonly alwaysinline { + ; We can be clever and avoid the per-lane stuff for gathers if we are willing + ; to require that the 0th element of the array being gathered from is always + ; legal 
to read from (and we do indeed require that, given the benefits!) + ; + ; Set the offset to zero for lanes that are off + %offsetsPtr = alloca + store zeroinitializer, * %offsetsPtr + call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, + %vecmask) + %newOffsets = load * %offsetsPtr + + %deltaPtr = alloca + store zeroinitializer, * %deltaPtr + call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, + %vecmask) + %newDelta = load * %deltaPtr + + %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, + i32 %offset_scale, %newDelta, + undef, i32 0) + forloop(lane, 1, eval(WIDTH-1), + `patsubst(patsubst(`%retLANE = call @__gather_elt32_$1(i8 * %ptr, + %newOffsets, i32 %offset_scale, %newDelta, + %retPREV, i32 LANE) + ', `LANE', lane), `PREV', eval(lane-1))') + ret %ret`'eval(WIDTH-1) +} + +define @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, + %vecmask) nounwind readonly alwaysinline { + ; We can be clever and avoid the per-lane stuff for gathers if we are willing + ; to require that the 0th element of the array being gathered from is always + ; legal to read from (and we do indeed require that, given the benefits!) + ; + ; Set the offset to zero for lanes that are off + %offsetsPtr = alloca + store zeroinitializer, * %offsetsPtr + call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, + %vecmask) + %newOffsets = load * %offsetsPtr + + %deltaPtr = alloca + store zeroinitializer, * %deltaPtr + call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, + %vecmask) + %newDelta = load * %deltaPtr + + %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, + i32 %offset_scale, %newDelta, + undef, i32 0) + forloop(lane, 1, eval(WIDTH-1), + `patsubst(patsubst(`%retLANE = call @__gather_elt64_$1(i8 * %ptr, + %newOffsets, i32 %offset_scale, %newDelta, + %retPREV, i32 LANE) + ', `LANE', lane), `PREV', eval(lane-1))') + ret %ret`'eval(WIDTH-1) +} + +gen_gather_general($1) +' +) + +; vec width, type +define(`gen_gather', ` + +gen_gather_factored($1) + +define +@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale_vec = bitcast i32 %offset_scale to <1 x i32> + %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, + zeroinitializer, %vecmask) + ret %v +} + +define +@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale64 = zext i32 %offset_scale to i64 + %scale_vec = bitcast i64 %scale64 to <1 x i64> + %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, + i32 1, zeroinitializer, %vecmask) + ret %v +} + +' +) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gen_scatter +;; Emit a function declaration for a scalarized scatter. 
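The scatter emitters defined next use the same factored addressing as the gather helpers above: each active lane accesses base + sext(offset) * offset_scale + offset_delta. The factored gathers additionally blend the offsets and deltas of inactive lanes to zero so every lane can safely (re)read element 0, while the scatters simply skip inactive lanes via per_lane. A scalar C++ sketch of that gather addressing, with hypothetical names and an explicit width parameter (illustrative only, not the generated IR):

#include <cstdint>
#include <cstring>

// Scalar model of the factored-gather addressing: inactive lanes have their
// offset and delta forced to zero, so they harmlessly re-read the first
// element instead of needing a per-lane branch.
template <typename T>
static void gather_factored_base_offsets(const uint8_t *base,
                                         const int32_t *offsets, int32_t offset_scale,
                                         const int32_t *offset_delta,
                                         const bool *mask, T *result, int width) {
    for (int lane = 0; lane < width; ++lane) {
        int64_t off = mask[lane] ? offsets[lane] : 0;
        int64_t dlt = mask[lane] ? offset_delta[lane] : 0;
        const uint8_t *addr = base + off * (int64_t)offset_scale + dlt;
        std::memcpy(&result[lane], addr, sizeof(T));  // per-lane load
    }
}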
+;; +;; $1: scalar type for which we want to generate code to scatter + +define(`gen_scatter', ` +;; Define the function that descripes the work to do to scatter a single +;; value +define void @__scatter_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %values, + i32 %lane) nounwind alwaysinline { + %offset32 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %offset64 = sext i32 %offset32 to i64 + %scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta = extractelement %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $1 * + %storeval = extractelement %values, i32 %lane + store $1 %storeval, $1 * %ptrcast + ret void +} + +define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %values, + i32 %lane) nounwind alwaysinline { + %offset64 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta64 = extractelement %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $1 * + %storeval = extractelement %values, i32 %lane + store $1 %storeval, $1 * %ptrcast + ret void +} + +define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, + %offset_delta, %values, + %mask) nounwind alwaysinline { + ;; And use the `per_lane' macro to do all of the per-lane work for scatter... + per_lane(WIDTH, %mask, ` + call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 LANE)') + ret void +} + +define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, + %offset_delta, %values, + %mask) nounwind alwaysinline { + ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
+ per_lane(WIDTH, %mask, ` + call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 LANE)') + ret void +} + +; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s +define void @__scatter32_$1( %ptrs, %values, + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = extractelement %values, i32 LANE + store $1 %val_LANE_ID, $1 * %ptr_LANE_ID + ') + ret void +} + +; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s +define void @__scatter64_$1( %ptrs, %values, + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = extractelement %values, i32 LANE + store $1 %val_LANE_ID, $1 * %ptr_LANE_ID + ') + ret void +} + +' +) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +define(`rdrand_decls', ` +declare i1 @__rdrand_i16(i16 * nocapture) +declare i1 @__rdrand_i32(i32 * nocapture) +declare i1 @__rdrand_i64(i64 * nocapture) +') + +define(`rdrand_definition', ` +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +declare {i16, i32} @llvm.x86.rdrand.16() +declare {i32, i32} @llvm.x86.rdrand.32() +declare {i64, i32} @llvm.x86.rdrand.64() + +define i1 @__rdrand_i16(i16 * %ptr) { + %v = call {i16, i32} @llvm.x86.rdrand.16() + %v0 = extractvalue {i16, i32} %v, 0 + %v1 = extractvalue {i16, i32} %v, 1 + store i16 %v0, i16 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i32(i32 * %ptr) { + %v = call {i32, i32} @llvm.x86.rdrand.32() + %v0 = extractvalue {i32, i32} %v, 0 + %v1 = extractvalue {i32, i32} %v, 1 + store i32 %v0, i32 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i64(i64 * %ptr) { + %v = call {i64, i32} @llvm.x86.rdrand.64() + %v0 = extractvalue {i64, i32} %v, 0 + %v1 = extractvalue {i64, i32} %v, 1 + store i64 %v0, i64 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define 
@__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() +define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') diff --git a/builtins/util.m4 b/builtins/util.m4 index fbd929a1..7f08adb3 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4541,3 +4541,60 @@ define(`rcpd_decl', ` declare double @__rcp_uniform_double(double) declare @__rcp_varying_double() ') + +define(`declare_nvptx', +` +declare i32 @__program_index() nounwind readnone alwaysinline +declare i32 @__program_count() nounwind readnone alwaysinline +declare i32 @__warp_index() nounwind readnone alwaysinline +declare i32 @__task_index0() nounwind readnone alwaysinline +declare i32 @__task_index1() nounwind readnone alwaysinline +declare i32 @__task_index2() nounwind readnone alwaysinline +declare i32 @__task_index() nounwind readnone alwaysinline +declare i32 @__task_count0() nounwind readnone alwaysinline +declare i32 @__task_count1() nounwind readnone alwaysinline +declare i32 @__task_count2() nounwind readnone alwaysinline +declare i32 @__task_count() nounwind readnone alwaysinline +declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline +declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline +declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline +declare i64 @__movmsk_ptx() nounwind readnone alwaysinline; +') + +define(`global_atomic_varying',` +declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline +') + +define(`global_atomic_cas_varying',` +declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline +') + +global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32) +global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64) +global_atomic_cas_varying(WIDTH, compare_exchange, float, float) +global_atomic_cas_varying(WIDTH, compare_exchange, double, double) + +global_atomic_varying(WIDTH, swap, i32, int32) +global_atomic_varying(WIDTH, swap, i64, int64) +global_atomic_varying(WIDTH, swap, float, float) +global_atomic_varying(WIDTH, swap, double, double) + +global_atomic_varying(WIDTH, add, i32, int32) +global_atomic_varying(WIDTH, sub, i32, int32) +global_atomic_varying(WIDTH, and, i32, int32) +global_atomic_varying(WIDTH, or, i32, int32) +global_atomic_varying(WIDTH, xor, i32, int32) 
+global_atomic_varying(WIDTH, min, i32, int32) +global_atomic_varying(WIDTH, max, i32, int32) +global_atomic_varying(WIDTH, umin, i32, uint32) +global_atomic_varying(WIDTH, umax, i32, uint32) + +global_atomic_varying(WIDTH, add, i64, int64) +global_atomic_varying(WIDTH, sub, i64, int64) +global_atomic_varying(WIDTH, and, i64, int64) +global_atomic_varying(WIDTH, or, i64, int64) +global_atomic_varying(WIDTH, xor, i64, int64) +global_atomic_varying(WIDTH, min, i64, int64) +global_atomic_varying(WIDTH, max, i64, int64) +global_atomic_varying(WIDTH, umin, i64, uint64) +global_atomic_varying(WIDTH, umax, i64, uint64) diff --git a/ctx.cpp b/ctx.cpp index 6ff26c6a..1097a422 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -57,6 +57,8 @@ #include #include #endif +#include +#include /** This is a small utility structure that records information related to one level of nested control flow. It's mostly used in correctly restoring @@ -1371,29 +1373,97 @@ FunctionEmitContext::None(llvm::Value *mask) { llvm::Value * -FunctionEmitContext::LaneMask(llvm::Value *v) { - // Call the target-dependent movmsk function to turn the vector mask - // into an i64 value - std::vector mm; - m->symbolTable->LookupFunction("__movmsk", &mm); - if (g->target->getMaskBitCount() == 1) - AssertPos(currentPos, mm.size() == 1); - else - // There should be one with signed int signature, one unsigned int. - AssertPos(currentPos, mm.size() == 2); - // We can actually call either one, since both are i32s as far as - // LLVM's type system is concerned... - llvm::Function *fmm = mm[0]->function; - return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk")); +FunctionEmitContext::LaneMask(llvm::Value *v) +{ +#if 1 /* this makes mandelbrot example slower, why ?!? */ + const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk"; +#else + const char *__movmsk = "__movmsk"; +#endif + // Call the target-dependent movmsk function to turn the vector mask + // into an i64 value + std::vector mm; + m->symbolTable->LookupFunction(__movmsk, &mm); + if (g->target->getMaskBitCount() == 1) + AssertPos(currentPos, mm.size() == 1); + else + // There should be one with signed int signature, one unsigned int. + AssertPos(currentPos, mm.size() == 2); + // We can actually call either one, since both are i32s as far as + // LLVM's type system is concerned... 
+ llvm::Function *fmm = mm[0]->function; + return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk")); +} + +bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName) +{ + llvm::Type *type = vector->getType(); + if (type == LLVMTypes::Int8VectorType) + funcName += "_int8"; + else if (type == LLVMTypes::Int16VectorType) + funcName += "_int16"; + else if (type == LLVMTypes::Int32VectorType) + funcName += "_int32"; + else if (type == LLVMTypes::Int64VectorType) + funcName += "_int64"; + else if (type == LLVMTypes::FloatVectorType) + funcName += "_float"; + else if (type == LLVMTypes::DoubleVectorType) + funcName += "_double"; + else + return false; + return true; +} + +llvm::Value* +FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar) +{ + std::string funcName = "__insert"; + assert(lAppendInsertExtractName(vector, funcName)); + assert(lane->getType() == LLVMTypes::Int32Type); + + llvm::Function *func = m->module->getFunction(funcName.c_str()); + assert(func != NULL); + std::vector args; + args.push_back(vector); + args.push_back(lane); + args.push_back(scalar); + llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock()); + return ret; +} + +llvm::Value* +FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane) +{ + std::string funcName = "__extract"; + assert(lAppendInsertExtractName(vector, funcName)); + assert(lane->getType() == LLVMTypes::Int32Type); + + llvm::Function *func = m->module->getFunction(funcName.c_str()); + assert(func != NULL); + std::vector args; + args.push_back(vector); + args.push_back(lane); + llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock()); + return ret; } llvm::Value * FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { + if (g->target->getISA() == Target::NVPTX) + { + // Compare the two masks to get a vector of i1s + llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + v1, v2, "v1==v2"); + return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? */ + } + else + { #if 0 // Compare the two masks to get a vector of i1s llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - v1, v2, "v1==v2"); + v1, v2, "v1==v2"); // Turn that into a bool vector type (often i32s) cmp = I1VecToBoolVec(cmp); // And see if it's all on @@ -1402,22 +1472,34 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { llvm::Value *mm1 = LaneMask(v1); llvm::Value *mm2 = LaneMask(v2); return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2, - LLVMGetName("equal", v1, v2)); + LLVMGetName("equal", v1, v2)); #endif + } } llvm::Value * FunctionEmitContext::ProgramIndexVector(bool is32bits) { llvm::SmallVector array; for (int i = 0; i < g->target->getVectorWidth() ; ++i) { - llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i); - array.push_back(C); + llvm::Constant *C = is32bits ? 
LLVMInt32(i) : LLVMInt64(i); + array.push_back(C); } llvm::Constant* index = llvm::ConstantVector::get(array); return index; } +llvm::Value * +FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) { + llvm::Function *func_program_index = m->module->getFunction("__program_index"); + llvm::Value *__program_index = CallInst(func_program_index, NULL, std::vector(), "foreach__program_indexS"); + llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV"); +#if 0 + if (!is32bits) + index = ZExtInst(index, LLVMTypes::Int64VectandType); +#endif + return index; +} llvm::Value * @@ -1830,6 +1912,7 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) { if (name == NULL) name = LLVMGetName(value, "_ptr2int"); + llvm::Type *type = LLVMTypes::PointerIntType; llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock); AddDebugPos(inst); @@ -3523,98 +3606,199 @@ llvm::Value * FunctionEmitContext::LaunchInst(llvm::Value *callee, std::vector &argVals, llvm::Value *launchCount[3]){ - if (callee == NULL) { + + if (g->target->getISA() != Target::NVPTX) + { + if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); return NULL; - } + } - launchedTasks = true; + launchedTasks = true; - AssertPos(currentPos, llvm::isa(callee)); - llvm::Type *argType = + AssertPos(currentPos, llvm::isa(callee)); + llvm::Type *argType = (llvm::dyn_cast(callee))->arg_begin()->getType(); - AssertPos(currentPos, llvm::PointerType::classof(argType)); - llvm::PointerType *pt = + AssertPos(currentPos, llvm::PointerType::classof(argType)); + llvm::PointerType *pt = llvm::dyn_cast(argType); - AssertPos(currentPos, llvm::StructType::classof(pt->getElementType())); - llvm::StructType *argStructType = + AssertPos(currentPos, llvm::StructType::classof(pt->getElementType())); + llvm::StructType *argStructType = static_cast(pt->getElementType()); - llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); - AssertPos(currentPos, falloc != NULL); - llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); - if (structSize->getType() != LLVMTypes::Int64Type) + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); + AssertPos(currentPos, falloc != NULL); + llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); + if (structSize->getType() != LLVMTypes::Int64Type) // ISPCAlloc expects the size as an uint64_t, but on 32-bit // targets, SizeOf returns a 32-bit value structSize = ZExtInst(structSize, LLVMTypes::Int64Type, - "struct_size_to_64"); - int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth()); + "struct_size_to_64"); + int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth()); - std::vector allocArgs; - allocArgs.push_back(launchGroupHandlePtr); - allocArgs.push_back(structSize); - allocArgs.push_back(LLVMInt32(align)); - llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); - llvm::Value *argmem = BitCastInst(voidmem, pt); + std::vector allocArgs; + allocArgs.push_back(launchGroupHandlePtr); + allocArgs.push_back(structSize); + allocArgs.push_back(LLVMInt32(align)); + llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); + llvm::Value *argmem = BitCastInst(voidmem, pt); - // Copy the values of the parameters into the appropriate place in - // the argument block - for (unsigned int i = 0; i < argVals.size(); ++i) { + // Copy the values of the parameters into the appropriate place in + // the argument block + for (unsigned int i = 0; i < 
argVals.size(); ++i) { llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); // don't need to do masked store here, I think StoreInst(argVals[i], ptr); - } + } - if (argStructType->getNumElements() == argVals.size() + 1) { + if (argStructType->getNumElements() == argVals.size() + 1) { // copy in the mask llvm::Value *mask = GetFullMask(); llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL, - "funarg_mask"); + "funarg_mask"); StoreInst(mask, ptr); - } + } - // And emit the call to the user-supplied task launch function, passing - // a pointer to the task function being called and a pointer to the - // argument block we just filled in - llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); - llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); - AssertPos(currentPos, flaunch != NULL); - std::vector args; - args.push_back(launchGroupHandlePtr); - args.push_back(fptr); - args.push_back(voidmem); - args.push_back(launchCount[0]); - args.push_back(launchCount[1]); - args.push_back(launchCount[2]); - return CallInst(flaunch, NULL, args, ""); + // And emit the call to the user-supplied task launch function, passing + // a pointer to the task function being called and a pointer to the + // argument block we just filled in + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + AssertPos(currentPos, flaunch != NULL); + std::vector args; + args.push_back(launchGroupHandlePtr); + args.push_back(fptr); + args.push_back(voidmem); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); + return CallInst(flaunch, NULL, args, ""); + } + else /* NVPTX */ + { + if (callee == NULL) { + AssertPos(currentPos, m->errorCount > 0); + return NULL; + } + launchedTasks = true; + + AssertPos(currentPos, llvm::isa(callee)); + std::vector argTypes; + + llvm::Function *F = llvm::dyn_cast(callee); + const unsigned int nArgs = F->arg_size(); + llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + for (; I != E; ++I) + argTypes.push_back(I->getType()); + llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes); + llvm::StructType *argStructType = static_cast(st); + llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); + if (structSize->getType() != LLVMTypes::Int64Type) + structSize = ZExtInst(structSize, LLVMTypes::Int64Type, + "struct_size_to_64"); + + const int align = 8; + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); + AssertPos(currentPos, falloc != NULL); + std::vector allocArgs; + allocArgs.push_back(launchGroupHandlePtr); + allocArgs.push_back(structSize); + allocArgs.push_back(LLVMInt32(align)); + llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); + llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64"); + llvm::BasicBlock* if_true = CreateBasicBlock("if_true"); + llvm::BasicBlock* if_false = CreateBasicBlock("if_false"); + + /* check if the pointer returned by ISPCAlloc is not NULL + * -------------- + * this is a workaround for not checking the value of programIndex + * because ISPCAlloc will return NULL pointer for all programIndex > 0 + * of course, if ISPAlloc fails to get parameter buffer, the pointer for programIndex = 0 + * will also be NULL + * This check must be added, and also rewrite the code to make it less opaque + */ + llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1"); + BranchInst(if_true, 
if_false, cmp1); + + /**********************/ + bblock = if_true; + + // label_if_then block: + llvm::Type *pt = llvm::PointerType::getUnqual(st); + llvm::Value *argmem = BitCastInst(voidmem, pt); + for (unsigned int i = 0; i < argVals.size(); ++i) + { + llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); + // don't need to do masked store here, I think + StoreInst(argVals[i], ptr); + } + if (nArgs == argVals.size() + 1) { + // copy in the mask + llvm::Value *mask = GetFullMask(); + llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL, + "funarg_mask"); + StoreInst(mask, ptr); + } + BranchInst(if_false); + + /**********************/ + bblock = if_false; + + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + AssertPos(currentPos, flaunch != NULL); + std::vector args; + args.push_back(launchGroupHandlePtr); + args.push_back(fptr); + args.push_back(voidmem); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); + llvm::Value *ret = CallInst(flaunch, NULL, args, ""); + return ret; + } } void FunctionEmitContext::SyncInst() { - llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); - llvm::Value *nullPtrValue = + if (g->target->getISA() != Target::NVPTX) + { + llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); + llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); - llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp, - llvm::CmpInst::ICMP_NE, - launchGroupHandle, nullPtrValue); - llvm::BasicBlock *bSync = CreateBasicBlock("call_sync"); - llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync"); - BranchInst(bSync, bPostSync, nonNull); + llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp, + llvm::CmpInst::ICMP_NE, + launchGroupHandle, nullPtrValue); + llvm::BasicBlock *bSync = CreateBasicBlock("call_sync"); + llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync"); + BranchInst(bSync, bPostSync, nonNull); - SetCurrentBasicBlock(bSync); - llvm::Function *fsync = m->module->getFunction("ISPCSync"); - if (fsync == NULL) + SetCurrentBasicBlock(bSync); + llvm::Function *fsync = m->module->getFunction("ISPCSync"); + if (fsync == NULL) FATAL("Couldn't find ISPCSync declaration?!"); - CallInst(fsync, NULL, launchGroupHandle, ""); + CallInst(fsync, NULL, launchGroupHandle, ""); - // zero out the handle so that if ISPCLaunch is called again in this - // function, it knows it's starting out from scratch - StoreInst(nullPtrValue, launchGroupHandlePtr); + // zero out the handle so that if ISPCLaunch is called again in this + // function, it knows it's starting out from scratch + StoreInst(nullPtrValue, launchGroupHandlePtr); - BranchInst(bPostSync); + BranchInst(bPostSync); - SetCurrentBasicBlock(bPostSync); + SetCurrentBasicBlock(bPostSync); + } + else /* NVPTX: don't do test, just call sync */ + { + llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); + llvm::Value *nullPtrValue = + llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); + llvm::Function *fsync = m->module->getFunction("ISPCSync"); + if (fsync == NULL) + FATAL("Couldn't find ISPCSync declaration?!"); + CallInst(fsync, NULL, launchGroupHandle, ""); + StoreInst(nullPtrValue, launchGroupHandlePtr); + } } diff --git a/ctx.h b/ctx.h index 4dd30053..57160c17 100644 --- a/ctx.h +++ b/ctx.h @@ -291,6 +291,13 @@ public: of the mask is on. 
*/ llvm::Value *LaneMask(llvm::Value *mask); + + /** Issues a call to __insert_int8/int16/int32/int64/float/double */ + llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar); + /** Issues a call to __extract_int8/int16/int32/int64/float/double */ + llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane); + + /** Given two masks of type LLVMTypes::MaskType, return an i1 value that indicates whether the two masks are equal. */ llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2); @@ -298,6 +305,7 @@ public: /** Generate ConstantVector, which contains ProgramIndex, i.e. < i32 0, i32 1, i32 2, i32 3> */ llvm::Value *ProgramIndexVector(bool is32bits = true); + llvm::Value *ProgramIndexVectorPTX(bool is32bits = true); /** Given a string, create an anonymous global variable to hold its value and return the pointer to the string. */ diff --git a/decl.cpp b/decl.cpp index 8a10543b..27a6d580 100644 --- a/decl.cpp +++ b/decl.cpp @@ -168,6 +168,13 @@ DeclSpecs::GetBaseType(SourcePos pos) const { retType = lApplyTypeQualifiers(typeQualifiers, retType, pos); if (soaWidth > 0) { +#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */ + if (g->target->getISA() == Target::NVPTX) + { + Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target."); + return NULL; + } +#endif const StructType *st = CastType(retType); if (st == NULL) { @@ -402,6 +409,13 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { return; } +#if 0 /* NVPTX */ + if (baseType->IsUniformType()) + { + fprintf(stderr, " detected uniform array of size= %d array= %s\n" ,arraySize, + baseType->IsArrayType() ? " true " : " false "); + } +#endif const Type *arrayType = new ArrayType(baseType, arraySize); if (child != NULL) { child->InitFromType(arrayType, ds); @@ -530,9 +544,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { returnType = returnType->ResolveUnboundVariability(Variability::Varying); + bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0); bool isExternC = ds && (ds->storageClass == SC_EXTERN_C); bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0); - bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0); bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0); if (isExported && isTask) { @@ -541,9 +555,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { return; } if (isExternC && isTask) { - Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " - "qualifiers"); - return; + Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " + "qualifiers"); + return; } if (isExternC && isExported) { Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" " diff --git a/expr.cpp b/expr.cpp index b5c876fd..4a473fe7 100644 --- a/expr.cpp +++ b/expr.cpp @@ -7867,6 +7867,12 @@ SizeOfExpr::TypeCheck() { "struct type \"%s\".", type->GetString().c_str()); return NULL; } + if (type != NULL) + if (g->target->getISA() == Target::NVPTX && type->IsVaryingType()) + { + Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target."); + return NULL; + } return this; } @@ -8661,6 +8667,11 @@ NewExpr::TypeCheck() { AssertPos(pos, m->errorCount > 0); return NULL; } + if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType()) + { + Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target."); + return NULL; + } if (CastType(allocType) != NULL) { Error(pos, 
"Can't dynamically allocate storage for declared " "but not defined type \"%s\".", allocType->GetString().c_str()); diff --git a/func.cpp b/func.cpp index 76ae43f5..578dd68a 100644 --- a/func.cpp +++ b/func.cpp @@ -47,6 +47,7 @@ #include #if defined(LLVM_3_1) || defined(LLVM_3_2) + #include #include #include #include @@ -54,6 +55,7 @@ #include #include #else + #include #include #include #include @@ -128,7 +130,7 @@ Function::Function(Symbol *s, Stmt *c) { sym->parentFunction = this; } - if (type->isTask) { + if (type->isTask && g->target->getISA() != Target::NVPTX) { threadIndexSym = m->symbolTable->LookupVariable("threadIndex"); Assert(threadIndexSym); threadCountSym = m->symbolTable->LookupVariable("threadCount"); @@ -239,7 +241,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, #endif const FunctionType *type = CastType(sym->type); Assert(type != NULL); - if (type->isTask == true) { + if (type->isTask == true && g->target->getISA() != Target::NVPTX) { // For tasks, there should always be three parameters: the // pointer to the structure that holds all of the arguments, the // thread index, and the thread count variables. @@ -337,6 +339,16 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, ctx->SetFunctionMask(argIter); Assert(++argIter == function->arg_end()); } + if (type->isTask == true && g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(LLVMInt32(1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } } // Finally, we can generate code for the function @@ -492,13 +504,28 @@ Function::GenerateIR() { // the application can call it const FunctionType *type = CastType(sym->type); Assert(type != NULL); - if (type->isExported) { + if (type->isExported) { if (!type->isTask) { llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true); llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage; std::string functionName = sym->name; + if (g->mangleFunctionsWithTarget) functionName += std::string("_") + g->target->GetISAString(); + + if (g->target->getISA() == Target::NVPTX) + { + functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */ +#if 0 + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); +#endif + } llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); #if defined(LLVM_3_1) @@ -538,6 +565,16 @@ Function::GenerateIR() { FATAL("Function verificication failed"); } } + if (g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(appFunction); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } } } } diff --git a/ispc.cpp b/ispc.cpp index 1386d65e..bd973517 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -280,6 +280,9 @@ Target::Target(const char *arch, const char *cpu, 
const char *isa, bool pic) : arch = "arm"; else #endif + if(!strncmp(isa, "nvptx", 5)) + arch = "nvptx64"; + else arch = "x86-64"; } @@ -707,6 +710,19 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskBitCount = 32; } #endif + else if (!strcasecmp(isa, "nvptx")) + { + this->m_isa = Target::NVPTX; + this->m_cpu = "sm_35"; + this->m_nativeVectorWidth = 32; + this->m_nativeVectorAlignment = 32; + this->m_vectorWidth = 1; + this->m_hasHalf = true; + this->m_maskingIsFree = true; + this->m_maskBitCount = 1; + this->m_hasTranscendentals = false; + this->m_hasGather = this->m_hasScatter = false; + } else { Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.", isa, SupportedTargets()); @@ -784,7 +800,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; - attrBuilder.addAttribute("target-cpu", this->m_cpu); + if (m_isa != Target::NVPTX) + attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( llvm::AttributeSet::get( @@ -839,7 +856,7 @@ Target::SupportedTargets() { "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 " "avx2-i32x8, avx2-i32x16, avx2-i64x4, " "generic-x1, generic-x4, generic-x8, generic-x16, " - "generic-x32, generic-x64"; + "generic-x32, generic-x64, nvptx"; } @@ -866,6 +883,8 @@ Target::GetTripleString() const { triple.setArchName("i386"); else if (m_arch == "x86-64") triple.setArchName("x86_64"); + else if (m_arch == "nvptx64") + triple = llvm::Triple("nvptx64", "nvidia", "cuda"); else triple.setArchName(m_arch); } @@ -898,6 +917,8 @@ Target::ISAToString(ISA isa) { return "avx2"; case Target::GENERIC: return "generic"; + case Target::NVPTX: + return "nvptx"; default: FATAL("Unhandled target in ISAToString()"); } @@ -936,6 +957,8 @@ Target::ISAToTargetString(ISA isa) { return "avx2-i32x8"; case Target::GENERIC: return "generic-4"; + case Target::NVPTX: + return "nvptx"; default: FATAL("Unhandled target in ISAToTargetString()"); } diff --git a/ispc.h b/ispc.h index 4b6df8c3..ffe9739c 100644 --- a/ispc.h +++ b/ispc.h @@ -179,7 +179,7 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { + enum ISA { NVPTX, #ifdef ISPC_ARM_ENABLED NEON32, NEON16, NEON8, #endif @@ -606,6 +606,7 @@ struct Globals { /** Indicates that alignment in memory allocation routines should be forced to have given value. -1 value means natural alignment for the platforms. 
*/ int forceAlignment; + std::string PtxString; }; enum { diff --git a/main.cpp b/main.cpp index 99497af5..2815cde9 100644 --- a/main.cpp +++ b/main.cpp @@ -320,6 +320,11 @@ int main(int Argc, char *Argv[]) { LLVMInitializeARMTargetMC(); #endif + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXAsmPrinter(); + LLVMInitializeNVPTXTargetMC(); + char *file = NULL; const char *headerFileName = NULL; const char *outFileName = NULL; diff --git a/module.cpp b/module.cpp index 94682dc0..a8f521d8 100644 --- a/module.cpp +++ b/module.cpp @@ -444,6 +444,38 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE return; } + if (g->target->getISA() == Target::NVPTX && +#if 0 + !type->IsConstType() && +#endif +#if 1 + at != NULL && +#endif + type->IsVaryingType()) + { + Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target."); + return; +#if 0 + int nel = 32; /* warp-size */ + if (type->IsArrayType()) + { + const ArrayType *at = CastType(type); + /* we must scale # elements by 4, because a thread-block will run 4 warps + * or 128 threads. + * ***note-to-me***:please define these value (128threads/4warps) + * in nvptx-target definition + * instead of compile-time constants + */ + nel *= at->GetElementCount(); + assert (!type->IsSOAType()); + type = new ArrayType(at->GetElementType()->GetAsUniformType(), nel); + } + else + type = new ArrayType(type->GetAsUniformType(), nel); +#endif + } + + llvm::Type *llvmType = type->LLVMType(g->ctx); if (llvmType == NULL) return; @@ -643,6 +675,21 @@ lCheckExportedParameterTypes(const Type *type, const std::string &name, } } +static void +lCheckTaskParameterTypes(const Type *type, const std::string &name, + SourcePos pos) { + if (g->target->getISA() != Target::NVPTX) + return; + if (lRecursiveCheckValidParamType(type, false) == false) { + if (CastType(type)) + Error(pos, "Vector-typed parameter \"%s\" is illegal in a task " + "function with \"nvptx\" target.", name.c_str()); + else + Error(pos, "Varying parameter \"%s\" is illegal in a task function with \"nvptx\" target.", + name.c_str()); + } +} + /** Given a function type, loop through the function parameters and see if any are StructTypes. If so, issue an error; this is currently broken @@ -801,7 +848,8 @@ Module::AddFunctionDeclaration(const std::string &name, #else // LLVM 3.1 and 3.3+ function->addFnAttr(llvm::Attribute::AlwaysInline); #endif - if (functionType->isTask) + /* evghenii: fails function verification when "if" executed in nvptx target */ + if (functionType->isTask && g->target->getISA() != Target::NVPTX) // This also applies transitively to members I think? 
#if defined(LLVM_3_1) function->setDoesNotAlias(1, true); @@ -822,6 +870,13 @@ Module::AddFunctionDeclaration(const std::string &name, Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false) Error(pos, "Task-qualified functions must have void return type."); + if (g->target->getISA() == Target::NVPTX && + Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false && + functionType->isExported) + { + Error(pos, "Export-qualified functions must have void return type with \"nvptx\" target."); + } + if (functionType->isExported || functionType->isExternC) lCheckForStructParameters(functionType, pos); @@ -841,6 +896,9 @@ Module::AddFunctionDeclaration(const std::string &name, if (functionType->isExported) { lCheckExportedParameterTypes(argType, argName, argPos); } + if (functionType->isTask) { + lCheckTaskParameterTypes(argType, argName, argPos); + } // ISPC assumes that no pointers alias. (It should be possible to // specify when this is not the case, but this should be the @@ -959,7 +1017,13 @@ Module::writeOutput(OutputType outputType, const char *outFileName, const char *fileType = NULL; switch (outputType) { case Asm: - if (strcasecmp(suffix, "s")) + if (g->target->getISA() != Target::NVPTX) + { + if (strcasecmp(suffix, "s")) + fileType = "assembly"; + } + else + if (strcasecmp(suffix, "ptx")) fileType = "assembly"; break; case Bitcode: @@ -1057,6 +1121,11 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) { } llvm::raw_fd_ostream fos(fd, (fd != 1), false); + if (g->target->getISA() == Target::NVPTX) + { + const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; + module->setDataLayout(dl_string); + } llvm::WriteBitcodeToFile(module, fos); return true; } @@ -2095,6 +2164,24 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre opts.addMacroDef(g->cppArgs[i].substr(2)); } } + if (g->target->getISA() == Target::NVPTX) + { + opts.addMacroDef("__NVPTX__"); + opts.addMacroDef("programIndex=__programIndex()"); + opts.addMacroDef("cif=if"); + opts.addMacroDef("cfor=for"); + opts.addMacroDef("cwhile=while"); + opts.addMacroDef("ccontinue=continue"); + opts.addMacroDef("cdo=do"); + opts.addMacroDef("taskIndex0=__taskIndex0()"); + opts.addMacroDef("taskIndex1=__taskIndex1()"); + opts.addMacroDef("taskIndex2=__taskIndex2()"); + opts.addMacroDef("taskIndex=__taskIndex()"); + opts.addMacroDef("taskCount0=__taskCount0()"); + opts.addMacroDef("taskCount1=__taskCount1()"); + opts.addMacroDef("taskCount2=__taskCount2()"); + opts.addMacroDef("taskCount=__taskCount()"); + } #if defined(LLVM_3_1) inst.getLangOpts().BCPLComment = 1; @@ -2540,6 +2627,29 @@ lCreateDispatchModule(std::map &functions) return module; } +static std::string lCBEMangle(const std::string &S) { + std::string Result; + + for (unsigned i = 0, e = S.size(); i != e; ++i) { + if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') || + (S[i] == '<' && S[i+1] == '<'))) { + Result += '_'; + Result += 'A'+(S[i]&15); + Result += 'A'+((S[i]>>4)&15); + Result += '_'; + i++; + } else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') { + Result += S[i]; + } else { + Result += '_'; + Result += 'A'+(S[i]&15); + Result += 'A'+((S[i]>>4)&15); + Result += '_'; + } + } + return Result; +} + int Module::CompileAndOutput(const char *srcFile, @@ -2555,7 +2665,7 @@ Module::CompileAndOutput(const char *srcFile, const char *hostStubFileName, const char *devStubFileName) { - if 
(target == NULL || strchr(target, ',') == NULL) { + if (target == NULL || strchr(target, ',') == NULL) { // We're only compiling to a single target g->target = new Target(arch, cpu, target, generatePIC); if (!g->target->isValid()) @@ -2563,6 +2673,32 @@ Module::CompileAndOutput(const char *srcFile, m = new Module(srcFile); if (m->CompileFile() == 0) { + + /* NVPTX: + * for PTX target replace '.' with '_' in all global variables + * a PTX identifier name must match [a-zA-Z$_][a-zA-Z$_0-9]* + */ + if (g->target->getISA() == Target::NVPTX) + { + /* mangle global variables names */ + { + llvm::Module::global_iterator I = m->module->global_begin(), E = m->module->global_end(); + for (; I != E; I++) + I->setName(lCBEMangle(I->getName())); + } + + /* mangle functions names */ + { + llvm::Module::iterator I = m->module->begin(), E = m->module->end(); + for (; I != E; I++) + { + std::string str = I->getName(); + if (str.find("operator") != std::string::npos) + I->setName(lCBEMangle(str)); + } + } + } + if (outputType == CXX) { if (target == NULL || strncmp(target, "generic-", 8) != 0) { Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" " @@ -2765,4 +2901,5 @@ Module::CompileAndOutput(const char *srcFile, return errorCount > 0; } + return true; } diff --git a/nvptxcc b/nvptxcc new file mode 100755 index 00000000..81d622e9 --- /dev/null +++ b/nvptxcc @@ -0,0 +1,20 @@ +#!/bin/sh + +PATH=$ISPC_HOME/examples_ptx/ptxcc:$ISPC_HOME/examples_ptx/ptxgen:$PATH +PTXCC=ptxcc +ARGS=${@:2} +if [ "$NVVM" == "1" ]; +then + LLVM32=$HOME/usr/local/llvm/bin-3.2 + LLVMDIS=$LLVM32/bin/llvm-dis + PTXGEN=$ISPC_HOME/examples_ptx/ptxgen/ptxgen + $($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll > $1.ptx) && \ + $($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \ + $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS); +else + $($PTXCC $1 -o $1.o -Xnvcc="-G") && \ + $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS); +fi + + + diff --git a/opt.cpp b/opt.cpp index 9c66ade1..a54805db 100644 --- a/opt.cpp +++ b/opt.cpp @@ -133,6 +133,7 @@ static llvm::Pass *CreateDebugPass(char * output); static llvm::Pass *CreateReplaceStdlibShiftPass(); static llvm::Pass *CreateFixBooleanSelectPass(); +static llvm::Pass *CreatePromoteLocalToPrivatePass(); #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ @@ -496,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) { // run absolutely no optimizations, since the front-end needs us to // take the various __pseudo_* functions it has emitted and turn // them into something that can actually execute. 
- optPM.add(CreateImproveMemoryOpsPass(), 100); + + if (g->opt.disableGatherScatterOptimizations == false && + g->target->getVectorWidth() > 1) + optPM.add(CreateImproveMemoryOpsPass(), 100); + if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); @@ -519,6 +524,8 @@ Optimize(llvm::Module *module, int optLevel) { llvm::initializeInstrumentation(*registry); llvm::initializeTarget(*registry); + if (g->target->getISA() == Target::NVPTX) + optPM.add(CreatePromoteLocalToPrivatePass()); optPM.add(llvm::createGlobalDCEPass(), 185); // Setup to use LLVM default AliasAnalysis @@ -577,7 +584,10 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createGlobalOptimizerPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createIPConstantPropagationPass()); - optPM.add(CreateReplaceStdlibShiftPass(),229); + + if (g->target->getISA() != Target::NVPTX) + optPM.add(CreateReplaceStdlibShiftPass(),229); + optPM.add(llvm::createDeadArgEliminationPass(),230); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); @@ -689,6 +699,111 @@ Optimize(llvm::Module *module, int optLevel) { // Should be the last optPM.add(CreateFixBooleanSelectPass(), 400); + + if (g->target->getISA() == Target::NVPTX) + { + optPM.add(llvm::createGlobalDCEPass()); + + optPM.add(llvm::createTypeBasedAliasAnalysisPass()); + optPM.add(llvm::createBasicAliasAnalysisPass()); + optPM.add(llvm::createCFGSimplificationPass()); + // Here clang has an experimental pass SROAPass instead of + // ScalarReplAggregatesPass. We should add it in the future. + optPM.add(llvm::createScalarReplAggregatesPass()); + optPM.add(llvm::createEarlyCSEPass()); + optPM.add(llvm::createLowerExpectIntrinsicPass()); + optPM.add(llvm::createTypeBasedAliasAnalysisPass()); + optPM.add(llvm::createBasicAliasAnalysisPass()); + + // Early optimizations to try to reduce the total amount of code to + // work with if we can + optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createCFGSimplificationPass()); + + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createAggressiveDCEPass()); + + + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + + // On to more serious optimizations + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createGlobalOptimizerPass()); + optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createIPConstantPropagationPass()); + + optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPruneEHPass()); + optPM.add(llvm::createFunctionAttrsPass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createCFGSimplificationPass()); + + optPM.add(llvm::createArgumentPromotionPass()); +#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) + // Starting from 3.4 this functionality was moved to + // InstructionCombiningPass. See r184459 for details. 
+ optPM.add(llvm::createSimplifyLibCallsPass()); +#endif + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createJumpThreadingPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createTailCallEliminationPass()); + + optPM.add(llvm::createInstructionCombiningPass()); + + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createConstantPropagationPass()); + + optPM.add(llvm::createInstructionCombiningPass()); + + optPM.add(llvm::createIPSCCPPass()); + optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createArgumentPromotionPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createLoopRotatePass()); + optPM.add(llvm::createLICMPass()); +// optPM.add(llvm::createLoopUnswitchPass(false)); +#if 1 + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createIndVarSimplifyPass()); + optPM.add(llvm::createLoopIdiomPass()); + optPM.add(llvm::createLoopDeletionPass()); + optPM.add(llvm::createLoopUnrollPass()); + optPM.add(llvm::createGVNPass()); + optPM.add(llvm::createMemCpyOptPass()); + optPM.add(llvm::createSCCPPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createJumpThreadingPass()); + optPM.add(llvm::createCorrelatedValuePropagationPass()); + optPM.add(llvm::createDeadStoreEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createStripDeadPrototypesPass()); + optPM.add(llvm::createGlobalDCEPass()); + optPM.add(llvm::createConstantMergePass()); +#endif + } } // Finish up by making sure we didn't mess anything up in the IR along @@ -5267,4 +5382,63 @@ CreateFixBooleanSelectPass() { return new FixBooleanSelectPass(); } +/////////////////////////////////////////////////////////////////////////////// +// Detect addrspace(3) +/////////////////////////////////////////////////////////////////////////////// + +class PromoteLocalToPrivatePass: public llvm::BasicBlockPass +{ + public: + static char ID; // Pass identification, replacement for typeid + PromoteLocalToPrivatePass() : BasicBlockPass(ID) {} + + bool runOnBasicBlock(llvm::BasicBlock &BB); +}; + +char PromoteLocalToPrivatePass::ID = 0; + +bool +PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB) +{ + std::vector Allocas; + + bool modifiedAny = false; + + llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var"); + + // Find allocas that are safe to promote, by looking at all instructions in + // the entry node + for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) + { + llvm::Instruction *inst = &*I; + if (llvm::CallInst *ci = llvm::dyn_cast(inst)) + { + llvm::Function *func = ci->getCalledFunction(); + if (cvtFunc && (cvtFunc == func)) + { +#if 0 + fprintf(stderr , "--found cvt-- name= %s \n", + I->getName().str().c_str()); +#endif + llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci); + 
assert(alloca != NULL); +#if 0 + const int align = 8; // g->target->getNativeVectorAlignment(); + alloca->setAlignment(align); +#endif + ci->replaceAllUsesWith(alloca); + modifiedAny = true; + } + } + } + return modifiedAny; +} + +static llvm::Pass * +CreatePromoteLocalToPrivatePass() { + return new PromoteLocalToPrivatePass(); +} + + + diff --git a/ptxtestcc.sh b/ptxtestcc.sh new file mode 100755 index 00000000..2ba5e252 --- /dev/null +++ b/ptxtestcc.sh @@ -0,0 +1,14 @@ +#!/bin/sh +LLC=$HOME/usr/local/llvm/bin-trunk/bin/llc +DIS=$HOME/usr/local/llvm/bin-3.2/bin/llvm-dis + +ISPC=ispc +PTXCC=ptxcc +PTXGEN=~/ptxgen +$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$LLC -march=nvptx64 -mcpu=sm_35 -o $1.ptx) && \ +#$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$DIS -o $1_32_ptx.ll && $PTXGEN $1_32_ptx.ll > $1.ptx) && \ +$($PTXCC $1.ptx -Xptxas=-v -o $1.ptx.o) && \ +nvcc -o test_nvptx test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.ptx.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt -DTEST_SIG=$2 + + + diff --git a/run_tests.py b/run_tests.py index 89e6cd87..671ad416 100755 --- a/run_tests.py +++ b/run_tests.py @@ -204,6 +204,8 @@ def run_test(testname): return (1, 0) else: global is_generic_target + global is_nvptx_target + global is_nvptx_nvvm if is_windows: if is_generic_target: obj_name = "%s.cpp" % os.path.basename(filename) @@ -218,6 +220,13 @@ def run_test(testname): else: if is_generic_target: obj_name = "%s.cpp" % testname + elif is_nvptx_target: + if os.environ.get("NVVM") == "1": + is_nvptx_nvvm = True + obj_name = "%s.bc" % testname + else: + obj_name = "%s.ptx" % testname + is_nvptx_nvvm = False else: obj_name = "%s.o" % testname exe_name = "%s.run" % testname @@ -248,13 +257,32 @@ def run_test(testname): cc_cmd += ' -Wl,-no_pie' if should_fail: cc_cmd += " -DEXPECT_FAILURE" + if is_nvptx_target: + nvptxcc_exe = "nvptxcc" + nvptxcc_exe_rel = add_prefix(nvptxcc_exe) + cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \ + (nvptxcc_exe_rel, obj_name, match, exe_name) - ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \ + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \ (filename, obj_name, options.arch, options.target) if options.no_opt: ispc_cmd += " -O0" if is_generic_target: ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file) + if is_nvptx_target: + filename4ptx = filename+".ptx.parsed_ispc" + grep_cmd = "grep -v 'export uniform int width' %s > %s " % \ + (filename, filename4ptx) + if options.verbose: + print "Grepping: %s" % grep_cmd + sp = subprocess.Popen(grep_cmd, shell=True) + sp.communicate() + if is_nvptx_nvvm: + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-llvm --target=%s" % \ + (filename4ptx, obj_name, options.target) + else: + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \ + (filename4ptx, obj_name, options.target) # compile the ispc code, make the executable, and run it... 
(compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd], @@ -269,7 +297,7 @@ def run_test(testname): basename = os.path.basename(filename) os.unlink("%s.pdb" % basename) os.unlink("%s.ilk" % basename) - os.unlink(obj_name) +# os.unlink(obj_name) except: None @@ -290,6 +318,7 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test ispc_exe = glob_var[3] global is_generic_target is_generic_target = glob_var[4] + global is_nvptx_target global run_tests_log run_tests_log = glob_var[5] @@ -505,6 +534,8 @@ def run_tests(options1, args, print_version): if options.target == 'neon': options.arch = 'arm' + if options.target == "nvptx": + options.arch = "nvptx64" # use relative path to not depend on host directory, which may possibly # have white spaces and unicode characters. @@ -530,9 +561,11 @@ def run_tests(options1, args, print_version): print_debug("Testing ispc: " + ispc_exe + "\n", s, run_tests_log) ispc_exe += " " + options.ispc_flags - global is_generic_target + global is_generic_target + global is_nvptx_target is_generic_target = (options.target.find("generic-") != -1 and options.target != "generic-1" and options.target != "generic-x1") + is_nvptx_target = (options.target.find("nvptx") != -1) if is_generic_target and options.include_file == None: if options.target == "generic-4" or options.target == "generic-x4": error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2) diff --git a/stdlib.ispc b/stdlib.ispc index 24217cd0..de0e32ed 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -57,6 +57,31 @@ #error Unknown value of ISPC_MASK_BITS #endif + + +/////////////////////////////////////////////////////////////////////////// +// CUDA Specific primitives +// +/***************/ + +__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); } +__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); } +__declspec(safe,cost0) static inline uniform int __warpIndex() { return __warp_index(); } + +/***************/ + +__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); } +__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); } +__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); } +__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); } + +/***************/ + +__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); } +__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); } +__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); } +__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); } + /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -464,7 +489,10 @@ __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes #if (ISPC_MASK_BITS == 1) - return __popcnt_int64(__movmsk(v & __mask)); + if (__is_nvptx_target) + return __popcnt_int64(__movmsk_ptx(v & __mask)); + else + return __popcnt_int64(__movmsk(v & __mask)); #else return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif @@ -1226,6 +1254,11 @@ packed_store_active(uniform int a[], int vals) { return __packed_store_active(a, vals, (IntMaskType)__mask); } +static inline uniform int +packed_store_active(bool 
active, uniform int a[], int vals) { + return __packed_store_active(a, vals, (IntMaskType)(-(int)active)); +} + static inline uniform int packed_store_active2(uniform int a[], int vals) { return __packed_store_active2(a, vals, (IntMaskType)__mask); @@ -1236,6 +1269,9 @@ packed_store_active2(uniform int a[], int vals) { // System information static inline uniform int num_cores() { + if (__is_nvptx_target) + return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx) + else return __num_cores(); } @@ -1783,7 +1819,7 @@ static inline void memory_barrier() { __memory_barrier(); } -#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \ +#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \ return ret; \ @@ -1794,6 +1830,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ return ret; \ } \ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1804,10 +1844,15 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } \ -#define DEFINE_ATOMIC_SWAP(TA,TB) \ +#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC) \ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform int i = 0; \ TA ret[programCount]; \ TA memVal; \ @@ -1838,6 +1883,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ originally got back from memory... 
*/ \ ret[lastSwap] = memVal; \ return ret[programIndex]; \ + }\ } \ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ uniform TA value) { \ @@ -1845,6 +1891,10 @@ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ return ret; \ } \ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1855,9 +1905,10 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ ret = insert(ret, i, r); \ } \ return ret; \ + }\ } \ -#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \ +#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ uniform TA oneval = reduce_##OPA(value); \ TA ret; \ @@ -1872,6 +1923,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ } \ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1882,57 +1937,58 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } -DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max) -DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType) -DEFINE_ATOMIC_SWAP(int32,int32) +DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64) +DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax) -DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType) -DEFINE_ATOMIC_SWAP(unsigned int32,int32) +DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64) -DEFINE_ATOMIC_SWAP(float,float) +DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64) -DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max) -DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType) -DEFINE_ATOMIC_SWAP(int64,int64) +DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64) +DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
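The new TC macro parameter exists so that, on the NVPTX branch, a varying pointer can be handed to the __atomic_*_varying_*_global built-ins as 64-bit integers via the (TC)ptr cast, instead of being scattered into a per-lane array and processed serially. Per lane, that cast is just a bit-preserving reinterpretation of the address, as this minimal scalar round-trip shows (a sketch assuming a 64-bit host, which is what the int64/unsigned int64 choice above encodes):

    #include <cassert>
    #include <cstdint>

    int main() {
        int32_t x = 0;
        // (TC)ptr for a single lane: carry the address as a 64-bit integer...
        uintptr_t bits = reinterpret_cast<uintptr_t>(&x);
        static_assert(sizeof(uintptr_t) == sizeof(uint64_t), "64-bit host assumed");
        // ...and recover the exact same pointer on the other side.
        int32_t *back = reinterpret_cast<int32_t *>(bits);
        assert(back == &x);
        *back = 42;
        return (x == 42) ? 0 : 1;
    }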
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax) -DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType) -DEFINE_ATOMIC_SWAP(unsigned int64,int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64) -DEFINE_ATOMIC_SWAP(double,double) +DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64) #undef DEFINE_ATOMIC_OP #undef DEFINE_ATOMIC_MINMAX_OP #undef DEFINE_ATOMIC_SWAP -#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \ +#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC) \ static inline uniform TA atomic_compare_exchange_global( \ uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \ uniform TA ret = \ @@ -1947,6 +2003,10 @@ static inline TA atomic_compare_exchange_global( \ } \ static inline TA atomic_compare_exchange_global( \ uniform TA * varying ptr, TA oldval, TA newval) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1958,14 +2018,15 @@ static inline TA atomic_compare_exchange_global( \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } -ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) -ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) -ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) -ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) +ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64) +ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64) +ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64) #undef ATOMIC_DECL_CMPXCHG @@ -2032,12 +2093,20 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) } \ static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \ TYPE ret; \ + if (__is_nvptx_target) { \ + foreach_active (i) { \ + uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \ + ret = insert(ret, i, *ptr); \ + *ptr = OPFUNC(*ptr, extract(value, i)); \ + } \ + } else { \ uniform TYPE * uniform ptrs[programCount]; \ ptrs[programIndex] = p; \ foreach_active (i) { \ ret = insert(ret, i, *ptrs[i]); \ *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \ } \ + } \ return ret; \ } diff --git a/stmt.cpp b/stmt.cpp index 52d25fe9..ee9e819c 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -142,6 +142,62 @@ lHasUnsizedArrays(const Type *type) { 
return lHasUnsizedArrays(at->GetElementType()); } +static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos ¤tPos, const bool variable = false) +{ + if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) + return value; + llvm::PointerType *pt = llvm::dyn_cast(value->getType()); + const int addressSpace = pt->getAddressSpace(); + if (addressSpace != 3 && addressSpace != 4) + return value; + + llvm::Type *elTy = pt->getElementType(); + + /* convert elTy addrspace(3)* to i64* addrspace(3)* */ + llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace); + value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1"); + + /* convert i64* addrspace(3) to i64* */ + llvm::Function *__cvt2gen = m->module->getFunction( + addressSpace == 3 ? (variable ? "__cvt_loc2gen_var" : "__cvt_loc2gen") : "__cvt_const2gen"); + + std::vector __cvt2gen_args; + __cvt2gen_args.push_back(value); + value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, variable ? "gep2gen_cvt_var" : "gep2gen_cvt", ctx->GetCurrentBasicBlock()); + + /* compute offset */ + if (addressSpace == 3) + { + assert(elTy->isArrayTy()); + const int numElTot = elTy->getArrayNumElements(); + const int numEl = numElTot/4; +#if 0 + fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl); +#endif + llvm::ArrayType *arrTy = llvm::dyn_cast(pt->getArrayElementType()); + assert(arrTy != NULL); + llvm::Type *arrElTy = arrTy->getElementType(); +#if 0 + if (arrElTy->isArrayTy()) + Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array"); +#endif + + /* convert i64* to errElTy* */ + llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0); + value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2"); + + llvm::Function *func_warp_index = m->module->getFunction("__warp_index"); + llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector(), "gep2gen_warp_index"); + llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset"); + value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock()); + } + + /* convert arrElTy* to elTy* */ + llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0); + value = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3"); + + return value; +} void DeclStmt::EmitCode(FunctionEmitContext *ctx) const { @@ -205,7 +261,22 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { return; } + if (sym->storageClass == SC_STATIC) { + + if (g->target->getISA() == Target::NVPTX && !sym->type->IsConstType()) + PerformanceWarning(sym->pos, + "Non-constant static variable ""\"%s\" is stored in __global address sace with ""\"nvptx\" target.", + sym->name.c_str()); + if (g->target->getISA() == Target::NVPTX && sym->type->IsVaryingType()) + PerformanceWarning(sym->pos, + "\"const static varying\" variable ""\"%s\" is stored in __global address space with ""\"nvptx\" target.", + sym->name.c_str()); + if (g->target->getISA() == Target::NVPTX && sym->type->IsUniformType()) + PerformanceWarning(sym->pos, + "\"const static uniform\" variable ""\"%s\" is stored in __constant address space with ""\"nvptx\" target.", + sym->name.c_str()); + // For static variables, we need a compile-time constant value // for its initializer; if there's no initializer, we use a // zero value. 
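lConvertToGenericPtr() above does two jobs for the NVPTX target: it funnels addrspace(3)/addrspace(4) pointers through the __cvt_loc2gen / __cvt_const2gen helpers to obtain a generic pointer, and, for __shared__ data, it offsets that pointer by warpIndex * (totalElements / 4) so that each of the four warps in a 128-thread block gets its own slice of the over-allocated array. The slice arithmetic in isolation (the 4-warps-per-block figure is the patch's hard-coded assumption, and warpSlice is an illustrative name, not a function from the patch):

    #include <cstddef>

    // Mirror of the warpId * numEl offset computed in lConvertToGenericPtr():
    // the shared array is allocated numWarpsPerBlock times larger than declared,
    // and each warp indexes into its own contiguous slice.
    template <typename T>
    T *warpSlice(T *sharedBase, size_t totalElements, int warpId,
                 int numWarpsPerBlock = 4) {
        size_t elementsPerWarp = totalElements / numWarpsPerBlock;
        return sharedBase + size_t(warpId) * elementsPerWarp;
    }

    // Example: a "uniform float a[8]" is emitted as float[32] in addrspace(3);
    // warp 2 then works on elements [16, 24) of the underlying allocation.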
@@ -233,19 +304,97 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { if (cinit == NULL) cinit = llvm::Constant::getNullValue(llvmType); + int addressSpace = 0; + if (g->target->getISA() == Target::NVPTX && + sym->type->IsConstType() && + sym->type->IsUniformType()) + addressSpace = 4; + // Allocate space for the static variable in global scope, so // that it persists across function calls sym->storagePtr = new llvm::GlobalVariable(*m->module, llvmType, sym->type->IsConstType(), llvm::GlobalValue::InternalLinkage, cinit, - llvm::Twine("static.") + + llvm::Twine("static_") + llvm::Twine(sym->pos.first_line) + - llvm::Twine(".") + sym->name.c_str()); + llvm::Twine("_") + sym->name.c_str(), + NULL, + llvm::GlobalVariable::NotThreadLocal, + addressSpace); + sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos); // Tell the FunctionEmitContext about the variable ctx->EmitVariableDebugInfo(sym); } - else { + else if ((sym->type->IsUniformType() || sym->type->IsSOAType()) && + /* NVPTX: + * only non-constant uniform data types are stored in shared memory + * constant uniform are automatically promoted to varying + */ + !sym->type->IsConstType() && +#if 1 + sym->type->IsArrayType() && +#endif + g->target->getISA() == Target::NVPTX) + { + PerformanceWarning(sym->pos, + "Non-constant \"uniform\" data types might be slow with \"nvptx\" target. " + "Unless data sharing between program instances is desired, try \"const [static] uniform\", \"varying\" or \"uniform new uniform \"+\"delete\" if possible."); + + /* with __shared__ memory everything must be an array */ + int nel = 4; + ArrayType *nat; + bool variable = true; + if (sym->type->IsArrayType()) + { + const ArrayType *at = CastType(sym->type); + /* we must scale # elements by 4, because a thread-block will run 4 warps + * or 128 threads. + * ***note-to-me***:please define these value (128threads/4warps) + * in nvptx-target definition + * instead of compile-time constants + */ + nel *= at->GetElementCount(); + if (sym->type->IsSOAType()) + nel *= sym->type->GetSOAWidth(); + nat = new ArrayType(at->GetElementType(), nel); + variable = false; + } + else + nat = new ArrayType(sym->type, nel); + + llvm::Type *llvmTypeUn = nat->LLVMType(g->ctx); + llvm::Constant *cinit = llvm::UndefValue::get(llvmTypeUn); + + sym->storagePtr = + new llvm::GlobalVariable(*m->module, llvmTypeUn, + sym->type->IsConstType(), + llvm::GlobalValue::InternalLinkage, + cinit, + llvm::Twine("local_") + + llvm::Twine(sym->pos.first_line) + + llvm::Twine("_") + sym->name.c_str(), + NULL, + llvm::GlobalVariable::NotThreadLocal, + /*AddressSpace=*/3); + sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos, variable); + llvm::PointerType *ptrTy = llvm::PointerType::get(sym->type->LLVMType(g->ctx),0); + sym->storagePtr = ctx->BitCastInst(sym->storagePtr, ptrTy, "uniform_decl"); + + // Tell the FunctionEmitContext about the variable; must do + // this before the initializer stuff. + ctx->EmitVariableDebugInfo(sym); + + if (initExpr == 0 && sym->type->IsConstType()) + Error(sym->pos, "Missing initializer for const variable " + "\"%s\".", sym->name.c_str()); + + // And then get it initialized... 
+ sym->parentFunction = ctx->GetFunction(); + InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos); + } + else + { // For non-static variables, allocate storage on the stack sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str()); @@ -253,10 +402,14 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { // this before the initializer stuff. ctx->EmitVariableDebugInfo(sym); + if (initExpr == 0 && sym->type->IsConstType()) + Error(sym->pos, "Missing initializer for const variable " + "\"%s\".", sym->name.c_str()); + // And then get it initialized... sym->parentFunction = ctx->GetFunction(); InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos); - } + } } } @@ -415,6 +568,19 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const { if (testValue == NULL) return; + +#if 0 + if (!isUniform && g->target->getISA() == Target::NVPTX) + { + /* With "nvptx" target, SIMT hardware takes care of non-uniform + * control flow. We trick ISPC to generate uniform control flow. + */ + testValue = ctx->ExtractInst(testValue, 0); + isUniform = true; + } +#endif + + if (isUniform) { ctx->StartUniformIf(); if (doAllCheck) @@ -696,7 +862,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, // Do any of the program instances want to run the 'true' // block? If not, jump ahead to bNext. +#if 1 llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask()); +#else + llvm::Value *maskAnyTrueQ = ctx->ExtractInst(ctx->GetFullMask(),0); +#endif ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ); // Emit statements for true @@ -713,7 +883,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, // Similarly, check to see if any of the instances want to // run the 'false' block... +#if 1 llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask()); +#else + llvm::Value *maskAnyFalseQ = ctx->ExtractInst(ctx->GetFullMask(),0); +#endif ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ); // Emit code for false @@ -1273,7 +1447,10 @@ static llvm::Value * lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, llvm::Value *uniformCounterPtr, llvm::Value *varyingCounterPtr, - const std::vector &spans) { + const std::vector &spans) +{ + if (g->target->getISA() != Target::NVPTX) + { // Smear the uniform counter value out to be varying llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); llvm::Value *smearCounter = ctx->BroadcastValue( @@ -1306,6 +1483,93 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, LLVMInt32Vector(delta), "iter_val"); ctx->StoreInst(varyingCounter, varyingCounterPtr); return varyingCounter; + } + else /* NVPTX == true */ + { + // Smear the uniform counter value out to be varying + llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); + llvm::Value *smearCounter = ctx->BroadcastValue( + counter, LLVMTypes::Int32VectorType, "smear_counter"); + + // Figure out the offsets; this is a little bit tricky. As an example, + // consider a 2D tiled foreach loop, where we're running 8-wide and + // where the inner dimension has a stride of 4 and the outer dimension + // has a stride of 2. For the inner dimension, we want the offsets + // (0,1,2,3,0,1,2,3), and for the outer dimension we want + // (0,0,0,0,1,1,1,1). + int32_t delta[ISPC_MAX_NVEC]; + const int vecWidth = 32; + std::vector constDeltaList; + for (int i = 0; i < vecWidth; ++i) + { + int d = i; + // First, account for the effect of any dimensions at deeper + // nesting levels than the current one. 
+ int prevDimSpanCount = 1; + for (int j = dim; j < nDims-1; ++j) + prevDimSpanCount *= spans[j+1]; + d /= prevDimSpanCount; + + // And now with what's left, figure out our own offset + delta[i] = d % spans[dim]; + constDeltaList.push_back(LLVMInt8(delta[i])); + } + + llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32); +// llvm::PointerType::get(ArrayDelta, 4); /* constant memory */ + + + llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable( + /*Module=*/*m->module, + /*Type=*/ArrayDelta, + /*isConstant=*/true, + /*Linkage=*/llvm::GlobalValue::PrivateLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/"constDeltaForeach"); +#if 0 + /*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal, + /*unsigned AddressSpace=*/4 /*constant*/); +#endif + + + llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList); + + globalDelta->setInitializer(constDelta); + llvm::Function *func_program_index = m->module->getFunction("__program_index"); + llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector(), "foreach__programIndex"); + + std::vector ptr_arrayidx_indices; + ptr_arrayidx_indices.push_back(LLVMInt32(0)); + ptr_arrayidx_indices.push_back(laneIdx); +#if 1 + llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock()); + llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock()); + llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type); + + llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1); + llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2); + + llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create( +// llvm::UndefValue(LLVMInt32Vector), + const_packed_41, + int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock()); +#endif + + + // Add the deltas to compute the varying counter values; store the + // result to memory and then return it directly as well. +#if 0 + llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + LLVMInt32Vector(delta), "iter_val"); +#else + llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + packed_43, "iter_val"); +#endif + ctx->StoreInst(varyingCounter, varyingCounterPtr); + return varyingCounter; + } } @@ -1383,7 +1647,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // This should be caught during typechecking AssertPos(pos, startExprs.size() == dimVariables.size() && - endExprs.size() == dimVariables.size()); + endExprs.size() == dimVariables.size()); int nDims = (int)dimVariables.size(); /////////////////////////////////////////////////////////////////////// @@ -1394,64 +1658,66 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { std::vector nExtras, alignedEnd, extrasMaskPtrs; std::vector span(nDims, 0); - lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]); + const int vectorWidth = + g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth(); + lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); for (int i = 0; i < nDims; ++i) { - // Basic blocks that we'll fill in later with the looping logic for - // this dimension. 
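The worked 2D example in the comment above (inner deltas 0,1,2,3,0,1,2,3 and outer deltas 0,0,0,0,1,1,1,1 for an 8-wide gang with spans 4 and 2) is easy to check in isolation. The following host-side sketch reproduces the delta[i] formula used to fill the constant per-lane offset table that the NVPTX path indexes with __program_index:

    #include <vector>

    // Reproduce the delta[] computation from lUpdateVaryingCounter(): for lane i
    // of dimension `dim`, divide out the spans of all deeper dimensions, then
    // take the remainder modulo this dimension's own span.
    std::vector<int> foreachDeltas(int dim, const std::vector<int> &spans,
                                   int vecWidth) {
        int nDims = (int)spans.size();
        std::vector<int> delta(vecWidth);
        for (int i = 0; i < vecWidth; ++i) {
            int d = i;
            int prevDimSpanCount = 1;
            for (int j = dim; j < nDims - 1; ++j)
                prevDimSpanCount *= spans[j + 1];
            d /= prevDimSpanCount;
            delta[i] = d % spans[dim];
        }
        return delta;
    }

    // foreachDeltas(1, {2, 4}, 8) -> 0 1 2 3 0 1 2 3   (inner dimension)
    // foreachDeltas(0, {2, 4}, 8) -> 0 0 0 0 1 1 1 1   (outer dimension)
    // which matches the worked example above; the NVPTX path evaluates this with
    // vecWidth = 32 and stores the result in a constant array indexed per lane.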
- bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); - if (i < nDims-1) - // stepping for the innermost dimension is handled specially - bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); - bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); + // Basic blocks that we'll fill in later with the looping logic for + // this dimension. + bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); + if (i < nDims-1) + // stepping for the innermost dimension is handled specially + bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); + bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); - // Start and end value for this loop dimension - llvm::Value *sv = startExprs[i]->GetValue(ctx); - llvm::Value *ev = endExprs[i]->GetValue(ctx); - if (sv == NULL || ev == NULL) - return; - startVals.push_back(sv); - endVals.push_back(ev); + // Start and end value for this loop dimension + llvm::Value *sv = startExprs[i]->GetValue(ctx); + llvm::Value *ev = endExprs[i]->GetValue(ctx); + if (sv == NULL || ev == NULL) + return; + startVals.push_back(sv); + endVals.push_back(ev); - // nItems = endVal - startVal - llvm::Value *nItems = - ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); + // nItems = endVal - startVal + llvm::Value *nItems = + ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); - // nExtras = nItems % (span for this dimension) - // This gives us the number of extra elements we need to deal with - // at the end of the loop for this dimension that don't fit cleanly - // into a vector width. - nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, - LLVMInt32(span[i]), "nextras")); + // nExtras = nItems % (span for this dimension) + // This gives us the number of extra elements we need to deal with + // at the end of the loop for this dimension that don't fit cleanly + // into a vector width. + nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, + LLVMInt32(span[i]), "nextras")); - // alignedEnd = endVal - nExtras - alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, - nExtras[i], "aligned_end")); + // alignedEnd = endVal - nExtras + alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, + nExtras[i], "aligned_end")); - /////////////////////////////////////////////////////////////////////// - // Each dimension has a loop counter that is a uniform value that - // goes from startVal to endVal, in steps of the span for this - // dimension. Its value is only used internally here for looping - // logic and isn't directly available in the user's program code. - uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, - "counter")); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + /////////////////////////////////////////////////////////////////////// + // Each dimension has a loop counter that is a uniform value that + // goes from startVal to endVal, in steps of the span for this + // dimension. Its value is only used internally here for looping + // logic and isn't directly available in the user's program code. + uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, + "counter")); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - // There is also a varying variable that holds the set of index - // values for each dimension in the current loop iteration; this is - // the value that is program-visible. 
- dimVariables[i]->storagePtr = - ctx->AllocaInst(LLVMTypes::Int32VectorType, - dimVariables[i]->name.c_str()); - dimVariables[i]->parentFunction = ctx->GetFunction(); - ctx->EmitVariableDebugInfo(dimVariables[i]); + // There is also a varying variable that holds the set of index + // values for each dimension in the current loop iteration; this is + // the value that is program-visible. + dimVariables[i]->storagePtr = + ctx->AllocaInst(LLVMTypes::Int32VectorType, + dimVariables[i]->name.c_str()); + dimVariables[i]->parentFunction = ctx->GetFunction(); + ctx->EmitVariableDebugInfo(dimVariables[i]); - // Each dimension also maintains a mask that represents which of - // the varying elements in the current iteration should be - // processed. (i.e. this is used to disable the lanes that have - // out-of-bounds offsets.) - extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + // Each dimension also maintains a mask that represents which of + // the varying elements in the current iteration should be + // processed. (i.e. this is used to disable the lanes that have + // out-of-bounds offsets.) + extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); } ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); @@ -1464,14 +1730,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // a given dimension in preparation for running through its loop again, // after the enclosing level advances its counter. for (int i = 0; i < nDims; ++i) { - ctx->SetCurrentBasicBlock(bbReset[i]); - if (i == 0) - ctx->BranchInst(bbExit); - else { - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - ctx->BranchInst(bbStep[i-1]); - } + ctx->SetCurrentBasicBlock(bbReset[i]); + if (i == 0) + ctx->BranchInst(bbExit); + else { + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + ctx->BranchInst(bbStep[i-1]); + } } /////////////////////////////////////////////////////////////////////////// @@ -1481,67 +1747,67 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // this for the innermost dimension, which has a more complex stepping // structure.. for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbStep[i]); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[i]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[i]); - ctx->BranchInst(bbTest[i]); + ctx->SetCurrentBasicBlock(bbStep[i]); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[i]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[i]); + ctx->BranchInst(bbTest[i]); } /////////////////////////////////////////////////////////////////////////// // foreach_test (for all dimensions other than the innermost...) 
std::vector inExtras; for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbTest[i]); + ctx->SetCurrentBasicBlock(bbTest[i]); - llvm::Value *haveExtras = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, - endVals[i], alignedEnd[i], "have_extras"); + llvm::Value *haveExtras = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, + endVals[i], alignedEnd[i], "have_extras"); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); - llvm::Value *atAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - counter, alignedEnd[i], "at_aligned_end"); - llvm::Value *inEx = - ctx->BinaryOperator(llvm::Instruction::And, haveExtras, - atAlignedEnd, "in_extras"); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); + llvm::Value *atAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + counter, alignedEnd[i], "at_aligned_end"); + llvm::Value *inEx = + ctx->BinaryOperator(llvm::Instruction::And, haveExtras, + atAlignedEnd, "in_extras"); - if (i == 0) - inExtras.push_back(inEx); - else - inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, - inExtras[i-1], "in_extras_all")); + if (i == 0) + inExtras.push_back(inEx); + else + inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, + inExtras[i-1], "in_extras_all")); - llvm::Value *varyingCounter = - lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], - dimVariables[i]->storagePtr, span); + llvm::Value *varyingCounter = + lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], + dimVariables[i]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[i], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[i], LLVMTypes::Int32VectorType, "smear_end"); - // Do a vector compare of its value to the end value to generate a - // mask for this last bit of work. - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); + // Do a vector compare of its value to the end value to generate a + // mask for this last bit of work. + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); - if (i == 0) - ctx->StoreInst(emask, extrasMaskPtrs[i]); - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->StoreInst(newMask, extrasMaskPtrs[i]); - } + if (i == 0) + ctx->StoreInst(emask, extrasMaskPtrs[i]); + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->StoreInst(newMask, extrasMaskPtrs[i]); + } - llvm::Value *notAtEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[i]); - ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); + llvm::Value *notAtEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[i]); + ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); } /////////////////////////////////////////////////////////////////////////// @@ -1578,18 +1844,18 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // (i.e. processing extra elements that don't exactly fit into a // vector). 
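The blocks being re-indented here implement the usual two-tier foreach structure: while the uniform counter is below alignedEnd the body runs with an all-on mask, and the nItems % span leftovers get one extra trip through the body under a partial mask built from the counter-vs-end compare. A scalar C++ rendering of that structure, purely for orientation (the real code emits LLVM basic blocks, not loops):

    #include <functional>

    // Scalar sketch of the 1D foreach lowering: full-width chunks first, then a
    // single masked pass over the remainder.
    void foreach1D(int start, int end, int span,
                   const std::function<void(int index, bool active)> &body) {
        int nItems = end - start;
        int nExtras = nItems % span;            // elements that don't fill a vector
        int alignedEnd = end - nExtras;         // last full-width iteration bound

        for (int counter = start; counter < alignedEnd; counter += span)
            for (int lane = 0; lane < span; ++lane)
                body(counter + lane, /*active=*/true);      // mask all-on

        if (nExtras > 0) {                      // one masked trip for the extras
            for (int lane = 0; lane < span; ++lane)
                body(alignedEnd + lane, /*active=*/(alignedEnd + lane) < end);
        }
    }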
llvm::BasicBlock *bbOuterInExtras = - ctx->CreateBasicBlock("outer_in_extras"); + ctx->CreateBasicBlock("outer_in_extras"); llvm::BasicBlock *bbOuterNotInExtras = - ctx->CreateBasicBlock("outer_not_in_extras"); + ctx->CreateBasicBlock("outer_not_in_extras"); ctx->SetCurrentBasicBlock(bbTest[nDims-1]); if (inExtras.size()) - ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, - inExtras.back()); + ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, + inExtras.back()); else - // for a 1D iteration domain, we certainly don't have any enclosing - // dimensions that are processing extra elements. - ctx->BranchInst(bbOuterNotInExtras); + // for a 1D iteration domain, we certainly don't have any enclosing + // dimensions that are processing extra elements. + ctx->BranchInst(bbOuterNotInExtras); /////////////////////////////////////////////////////////////////////////// // One or more outer dimensions in extras, so we need to mask for the loop @@ -1604,21 +1870,21 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // // run loop body with mask // } llvm::BasicBlock *bbAllInnerPartialOuter = - ctx->CreateBasicBlock("all_inner_partial_outer"); + ctx->CreateBasicBlock("all_inner_partial_outer"); llvm::BasicBlock *bbPartial = - ctx->CreateBasicBlock("both_partial"); + ctx->CreateBasicBlock("both_partial"); ctx->SetCurrentBasicBlock(bbOuterInExtras); { - // Update the varying counter value here, since all subsequent - // blocks along this path need it. - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); + // Update the varying counter value here, since all subsequent + // blocks along this path need it. + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); - // here we just check to see if counter < alignedEnd - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); + // here we just check to see if counter < alignedEnd + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); } // Below we have a basic block that runs the loop body code for the @@ -1637,53 +1903,53 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // should step the loop counter for the next enclosing dimension // instead. llvm::Value *stepIndexAfterMaskedBodyPtr = - ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); + ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); /////////////////////////////////////////////////////////////////////////// // We're in the inner loop part where the only masking is due to outer // dimensions but the innermost dimension fits fully into a vector's // width. Set the mask and jump to the masked loop body. 
ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { - llvm::Value *mask; - if (nDims == 1) - // 1D loop; we shouldn't ever get here anyway - mask = LLVMMaskAllOff; - else - mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *mask; + if (nDims == 1) + // 1D loop; we shouldn't ever get here anyway + mask = LLVMMaskAllOff; + else + mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - ctx->SetInternalMask(mask); + ctx->SetInternalMask(mask); - ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); + ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); } /////////////////////////////////////////////////////////////////////////// // We need to include the effect of the innermost dimension in the mask // for the final bits here ctx->SetCurrentBasicBlock(bbPartial); { - llvm::Value *varyingCounter = - ctx->LoadInst(dimVariables[nDims-1]->storagePtr); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *varyingCounter = + ctx->LoadInst(dimVariables[nDims-1]->storagePtr); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); - if (nDims == 1) { - ctx->SetInternalMask(emask); - } - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->SetInternalMask(newMask); - } + if (nDims == 1) { + ctx->SetInternalMask(emask); + } + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->SetInternalMask(newMask); + } - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); } /////////////////////////////////////////////////////////////////////////// @@ -1699,14 +1965,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // // run loop body with mask // } llvm::BasicBlock *bbPartialInnerAllOuter = - ctx->CreateBasicBlock("partial_inner_all_outer"); + ctx->CreateBasicBlock("partial_inner_all_outer"); ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, - beforeAlignedEnd); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, + beforeAlignedEnd); } /////////////////////////////////////////////////////////////////////////// @@ -1716,26 +1982,26 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // value of the varying loop counter and have the statements in the // loop body emit their code. 
llvm::BasicBlock *bbFullBodyContinue = - ctx->CreateBasicBlock("foreach_full_continue"); + ctx->CreateBasicBlock("foreach_full_continue"); ctx->SetCurrentBasicBlock(bbFullBody); { - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetBlockEntryMask(LLVMMaskAllOn); - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - ctx->SetContinueTarget(bbFullBodyContinue); - ctx->AddInstrumentationPoint("foreach loop body (all on)"); - stmts->EmitCode(ctx); - AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); - ctx->BranchInst(bbFullBodyContinue); + ctx->SetInternalMask(LLVMMaskAllOn); + ctx->SetBlockEntryMask(LLVMMaskAllOn); + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + ctx->SetContinueTarget(bbFullBodyContinue); + ctx->AddInstrumentationPoint("foreach loop body (all on)"); + stmts->EmitCode(ctx); + AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); + ctx->BranchInst(bbFullBodyContinue); } ctx->SetCurrentBasicBlock(bbFullBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterNotInExtras); + ctx->RestoreContinuedLanes(); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterNotInExtras); } /////////////////////////////////////////////////////////////////////////// @@ -1743,33 +2009,33 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // less than the end value, in which case we need to run the body one // more time to get the extra bits. 
llvm::BasicBlock *bbSetInnerMask = - ctx->CreateBasicBlock("partial_inner_only"); + ctx->CreateBasicBlock("partial_inner_only"); ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeFullEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[nDims-1], "before_full_end"); - ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeFullEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[nDims-1], "before_full_end"); + ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); } /////////////////////////////////////////////////////////////////////////// // The outer dimensions are all on, so the mask is just given by the // mask for the innermost dimension ctx->SetCurrentBasicBlock(bbSetInnerMask); { - llvm::Value *varyingCounter = - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - ctx->SetInternalMask(emask); - ctx->SetBlockEntryMask(emask); + llvm::Value *varyingCounter = + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + ctx->SetInternalMask(emask); + ctx->SetBlockEntryMask(emask); - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); } /////////////////////////////////////////////////////////////////////////// @@ -1779,34 +2045,34 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // mask known to be all-on, which in turn leads to more efficient code // for that case. 
llvm::BasicBlock *bbStepInnerIndex = - ctx->CreateBasicBlock("step_inner_index"); + ctx->CreateBasicBlock("step_inner_index"); llvm::BasicBlock *bbMaskedBodyContinue = - ctx->CreateBasicBlock("foreach_masked_continue"); + ctx->CreateBasicBlock("foreach_masked_continue"); ctx->SetCurrentBasicBlock(bbMaskedBody); { - ctx->AddInstrumentationPoint("foreach loop body (masked)"); - ctx->SetContinueTarget(bbMaskedBodyContinue); - ctx->DisableGatherScatterWarnings(); - ctx->SetBlockEntryMask(ctx->GetFullMask()); - stmts->EmitCode(ctx); - ctx->EnableGatherScatterWarnings(); - ctx->BranchInst(bbMaskedBodyContinue); + ctx->AddInstrumentationPoint("foreach loop body (masked)"); + ctx->SetContinueTarget(bbMaskedBodyContinue); + ctx->DisableGatherScatterWarnings(); + ctx->SetBlockEntryMask(ctx->GetFullMask()); + stmts->EmitCode(ctx); + ctx->EnableGatherScatterWarnings(); + ctx->BranchInst(bbMaskedBodyContinue); } ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); + ctx->RestoreContinuedLanes(); + llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); } /////////////////////////////////////////////////////////////////////////// // step the innermost index, for the case where we're doing the // innermost for loop over full vectors. ctx->SetCurrentBasicBlock(bbStepInnerIndex); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterInExtras); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterInExtras); } /////////////////////////////////////////////////////////////////////////// @@ -1993,7 +2259,8 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const { // math...) // Get the "program index" vector value - llvm::Value *programIndex = ctx->ProgramIndexVector(); + llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ? + ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector(); // And smear the current lane out to a vector llvm::Value *firstSet32 = @@ -2189,10 +2456,19 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const { // And load the corresponding element value from the temporary // memory storing the value of the varying expr. 
- llvm::Value *uniqueValuePtr = + llvm::Value *uniqueValue; + if (g->target->getISA() != Target::NVPTX) + { + llvm::Value *uniqueValuePtr = ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType, - "unique_index_ptr"); - llvm::Value *uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value"); + "unique_index_ptr"); + uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value"); + } + else /* in case of PTX target, use __shfl PTX intrinsics via __insert/__extract function */ + { + llvm::Value *firstSet32 = ctx->TruncInst(firstSet, LLVMTypes::Int32Type); + uniqueValue = ctx->Extract(exprValue, firstSet32); + } // If it's a varying pointer type, need to convert from the int // type we store in the vector to the actual pointer type @@ -3100,7 +3376,8 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const { } // Now we can emit code to call __do_print() - llvm::Function *printFunc = m->module->getFunction("__do_print"); + llvm::Function *printFunc = g->target->getISA() != Target::NVPTX ? + m->module->getFunction("__do_print") : m->module->getFunction("__do_print_nvptx"); AssertPos(pos, printFunc); llvm::Value *mask = ctx->GetFullMask(); diff --git a/test_static_cuda.cpp b/test_static_cuda.cpp new file mode 100644 index 00000000..4e69e298 --- /dev/null +++ b/test_static_cuda.cpp @@ -0,0 +1,440 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_WINDOWS +#include +#endif // ISPC_IS_WINDOWS + +#include +#include +#include +#include +#ifdef ISPC_IS_LINUX +#include +#endif + +/******************************/ + +#include +#include +#include +#include "drvapi_error_string.h" +#include "ispc_malloc.h" + +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) +// These are the inline versions for all of the SDK helper functions +void __checkCudaErrors(CUresult err, const char *file, const int line) { + if(CUDA_SUCCESS != err) { + std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" + << getCudaDrvErrorString(err) << "\" from file <" << file + << ", line " << line << "\n"; + exit(-1); + } +} + + +/******************************/ +/**** Basic CUDriver API ****/ +/******************************/ + +CUcontext context; + +static void createContext(const int deviceId = 0, const bool verbose = true) +{ + CUdevice device; + int devCount; + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&devCount)); + assert(devCount > 0); + checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + if (verbose) + std::cout << "Using CUDA Device [0]: " << name << "\n"; + + int devMajor, devMinor; + checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); + if (verbose) + std::cout << "Device Compute Capability: " + << devMajor << "." << devMinor << "\n"; + if (devMajor < 2) { + if (verbose) + std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; + exit(1); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); +} +static void destroyContext() +{ + checkCudaErrors(cuCtxDestroy(context)); +} + +static CUmodule loadModule( + const char * module, + const int maxrregcount = 64, + const char cudadevrt_lib[] = "libcudadevrt.a", + const size_t log_size = 32768, + const bool print_log = true + ) +{ + CUmodule cudaModule; + // in this branch we use compilation with parameters + + CUlinkState CUState; + CUlinkState *lState = &CUState; + const int nOptions = 8; + CUjit_option options[nOptions]; + void* optionVals[nOptions]; + float walltime; + size_t logSize = log_size; + char error_log[logSize], + info_log[logSize]; + void *cuOut; + size_t outSize; + int myErr = 0; + + // Setup linker options + // Return walltime from JIT compilation + options[0] = CU_JIT_WALL_TIME; + optionVals[0] = (void*) &walltime; + // Pass a buffer for info messages + options[1] = CU_JIT_INFO_LOG_BUFFER; + optionVals[1] = (void*) info_log; + // Pass the size of the info buffer + options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + optionVals[2] = (void*) logSize; + // Pass a buffer for error message + options[3] = CU_JIT_ERROR_LOG_BUFFER; + optionVals[3] = (void*) error_log; + // Pass the size of the error buffer + options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + optionVals[4] = (void*) logSize; + // Make the linker verbose + options[5] = CU_JIT_LOG_VERBOSE; + optionVals[5] = (void*) 1; + // Max # of registers/pthread + options[6] = CU_JIT_MAX_REGISTERS; + int jitRegCount = maxrregcount; + optionVals[6] = (void *)(size_t)jitRegCount; + // Caching + options[7] = CU_JIT_CACHE_MODE; + optionVals[7] = (void *)CU_JIT_CACHE_OPTION_CA; + // Create a pending linker invocation + + // Create a pending linker invocation + 
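  // For reference, the CUDA Driver API JIT sequence used below is:
  //   cuLinkCreate()             start a pending link with the options set up above
  //   cuLinkAddData(PTX)         add the PTX image read from __kernels.ptx
  //   cuLinkAddFile(LIBRARY)     add libcudadevrt.a so device-side runtime calls resolve
  //   cuLinkComplete()           JIT-compile and link, yielding a cubin image in memory
  //   cuModuleLoadData(cubin)    load the cubin into a CUmodule
  //   cuLinkDestroy()            release the linker state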
+  checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
+
+#if 0
+  if (sizeof(void *)==4)
+  {
+    // Load the PTX from the string myPtx32
+    printf("Loading myPtx32[] program\n");
+    // PTX may also be loaded from file, as per below.
+    myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
+  }
+  else
+#endif
+  {
+    // Load the PTX from the string myPtx (64-bit)
+    if (print_log)
+      fprintf(stderr, "Loading ptx..\n");
+    myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
+    myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0);
+    // PTX may also be loaded from file, as per below.
+    // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
+  }
+
+  // Complete the linker step
+  myErr = cuLinkComplete(*lState, &cuOut, &outSize);
+
+  if ( myErr != CUDA_SUCCESS )
+  {
+    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
+    fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
+    assert(0);
+  }
+
+  // Linker walltime and info_log were requested in options above.
+  if (print_log)
+    fprintf(stderr, "CUDA Link Completed in %fms. Linker Output:\n%s\n",walltime,info_log);
+
+  // Load resulting cuBin into module
+  checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
+
+  // Destroy the linker invocation
+  checkCudaErrors(cuLinkDestroy(*lState));
+  return cudaModule;
+}
+static void unloadModule(CUmodule &cudaModule)
+{
+  checkCudaErrors(cuModuleUnload(cudaModule));
+}
+
+static CUfunction getFunction(CUmodule &cudaModule, const char * function)
+{
+  CUfunction cudaFunction;
+  checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
+  return cudaFunction;
+}
+
+static CUdeviceptr deviceMalloc(const size_t size)
+{
+  CUdeviceptr d_buf;
+  checkCudaErrors(cuMemAlloc(&d_buf, size));
+  return d_buf;
+}
+static void deviceFree(CUdeviceptr d_buf)
+{
+  checkCudaErrors(cuMemFree(d_buf));
+}
+static void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
+{
+  checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
+}
+static void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
+{
+  checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
+}
+#define deviceLaunch(func,params) \
+  checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
+  checkCudaErrors( \
+      cuLaunchKernel( \
+        (func), \
+        1,1,1, \
+        32, 1, 1, \
+        0, NULL, (params), NULL \
+        ));
+
+typedef CUdeviceptr devicePtr;
+
+
+/**************/
+#include <vector>
+static std::vector<char> readBinary(const char * filename, const bool print_size = false)
+{
+  std::vector<char> buffer;
+  FILE *fp = fopen(filename, "rb");
+  if (!fp )
+  {
+    fprintf(stderr, "file %s not found\n", filename);
+    assert(0);
+  }
+  fseek(fp, 0, SEEK_END);
+  const unsigned long long size = ftell(fp); /*calc the size needed*/
+  fseek(fp, 0, SEEK_SET);
+  buffer.resize(size);
+
+  if (fp == NULL){ /*ERROR detection if file == empty*/
+    fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
+    exit(1);
+  }
+  else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
+    fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
+    exit(1);
+  }
+  if (print_size)
+    fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
+  return buffer;
+}
+
+static double CUDALaunch(
+    void **handlePtr,
+    const char * func_name,
+    void **func_args,
+    const bool print_log = true,
+    const int maxrregcount = 64,
+    const char kernel_file[] = "__kernels.ptx",
+    const char cudadevrt_lib[] = "libcudadevrt.a",
+    const int log_size = 32768)
+{
+  fprintf(stderr, " launching kernel: %s \n", func_name);
+  const std::vector<char> module_str = readBinary(kernel_file, print_log);
+  const char * module = &module_str[0];
+  CUmodule cudaModule = loadModule(module, maxrregcount, cudadevrt_lib, log_size, print_log);
+  CUfunction cudaFunction = getFunction(cudaModule, func_name);
+  deviceLaunch(cudaFunction, func_args);
+  checkCudaErrors(cuStreamSynchronize(0));
+  unloadModule(cudaModule);
+  return 0.0;
+}
+/******************************/
+
+
+extern "C" {
+// extern int width();
+  int width() { return 32; }
+  extern void f_v(float *result);
+  extern void f_f(float *result, float *a);
+  extern void f_fu(float *result, float *a, float b);
+  extern void f_fi(float *result, float *a, int *b);
+  extern void f_du(float *result, double *a, double b);
+  extern void f_duf(float *result, double *a, float b);
+  extern void f_di(float *result, double *a, int *b);
+  extern void result(float *val);
+}
+
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ALIGN
+#else
+#define ALIGN __attribute__((aligned(64)))
+#endif
+
+int main(int argc, char *argv[]) {
+  int w = width();
+  assert(w <= 64);
+
+  float returned_result[64] ALIGN;
+  float vfloat[64] ALIGN;
+  double vdouble[64] ALIGN;
+  int vint[64] ALIGN;
+  int vint2[64] ALIGN;
+
+  const int device = 0;
+#if 0
+  const bool verbose = true;
+#else
+  const bool verbose = false;
+#endif
+
+  /*******************/
+  createContext(device, verbose);
+  /*******************/
+
+  devicePtr d_returned_result = deviceMalloc(64*sizeof(float));
+  devicePtr d_vfloat = deviceMalloc(64*sizeof(float));
+  devicePtr d_vdouble = deviceMalloc(64*sizeof(double));
+  devicePtr d_vint = deviceMalloc(64*sizeof(int));
+  devicePtr d_vint2 = deviceMalloc(64*sizeof(int));
+
+
+  for (int i = 0; i < 64; ++i) {
+    returned_result[i] = -1e20;
+    vfloat[i] = i+1;
+    vdouble[i] = i+1;
+    vint[i] = 2*(i+1);
+    vint2[i] = i+5;
+  }
+
+  memcpyH2D(d_returned_result, returned_result, 64*sizeof(float));
+  memcpyH2D(d_vfloat , vfloat, 64*sizeof(float));
+  memcpyH2D(d_vdouble , vdouble, 64*sizeof(double));
+  memcpyH2D(d_vint , vint, 64*sizeof(int));
+  memcpyH2D(d_vint2 , vint2, 64*sizeof(int));
+
+
+  float b = 5.;
+
+  const bool print_log = false;
+  const int nreg = 64;
+#if (TEST_SIG == 0)
+  void *args[] = {&d_returned_result};
+  CUDALaunch(NULL, "f_v", args, print_log, nreg);
+#elif (TEST_SIG == 1)
+  void *args[] = {&d_returned_result, &d_vfloat};
+  CUDALaunch(NULL, "f_f", args, print_log, nreg);
+#elif (TEST_SIG == 2)
+  void *args[] = {&d_returned_result, &d_vfloat, &b};
+  CUDALaunch(NULL, "f_fu", args, print_log, nreg);
+#elif (TEST_SIG == 3)
+  void *args[] = {&d_returned_result, &d_vfloat, &d_vint};
+  CUDALaunch(NULL, "f_fi", args, print_log, nreg);
+#elif (TEST_SIG == 4)
+  double num = 5.;
+  void *args[] = {&d_returned_result, &d_vdouble, &num};
+  CUDALaunch(NULL, "f_du", args, print_log, nreg);
+#elif (TEST_SIG == 5)
+  float num = 5.0f;
+  void *args[] = {&d_returned_result, &d_vdouble, &num};
+  CUDALaunch(NULL, "f_duf", args, print_log, nreg);
+#elif (TEST_SIG == 6)
+  void *args[] = {&d_returned_result, &d_vdouble, &d_vint2};
+  CUDALaunch(NULL, "f_di", args, print_log, nreg);
+#else
+#error "Unknown or unset TEST_SIG value"
+#endif
+
+  float expected_result[64];
+
+  memset(expected_result, 0, 64*sizeof(float));
+  devicePtr d_expected_result = deviceMalloc(64*sizeof(float));
+  memcpyH2D(d_expected_result, expected_result,
64*sizeof(float)); + void *res_args[] = {&d_expected_result}; + CUDALaunch(NULL, "result", res_args, print_log, nreg); + memcpyD2H(expected_result, d_expected_result, 64*sizeof(float)); + memcpyD2H(returned_result, d_returned_result, 64*sizeof(float)); + + deviceFree(d_returned_result); + deviceFree(d_vfloat); + deviceFree(d_vdouble); + deviceFree(d_vint); + deviceFree(d_vint2); + deviceFree(d_expected_result); + + /*******************/ + destroyContext(); + /*******************/ + + int errors = 0; + for (int i = 0; i < w; ++i) { + if (returned_result[i] != expected_result[i]) { +#ifdef EXPECT_FAILURE + // bingo, failed + return 1; +#else + printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n", + argv[0], i, returned_result[i], returned_result[i], + expected_result[i], expected_result[i]); + ++errors; +#endif // EXPECT_FAILURE + } + } + +#ifdef EXPECT_FAILURE + // Don't expect to get here + return 0; +#else + return errors > 0; +#endif +} diff --git a/test_static_nvptx.cpp b/test_static_nvptx.cpp new file mode 100644 index 00000000..0d56d06c --- /dev/null +++ b/test_static_nvptx.cpp @@ -0,0 +1,133 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_WINDOWS +#include +#endif // ISPC_IS_WINDOWS + +#include +#include +#include +#include +#ifdef ISPC_IS_LINUX +#include +#endif + +#include "ispc_malloc.h" + +#define N 32 +extern "C" { + int width() { return N; } + extern void f_v(float *result); + extern void f_f(float *result, float *a); + extern void f_fu(float *result, float *a, float b); + extern void f_fi(float *result, float *a, int *b); + extern void f_du(float *result, double *a, double b); + extern void f_duf(float *result, double *a, float b); + extern void f_di(float *result, double *a, int *b); + extern void result(float *val); +} + +int main(int argc, char *argv[]) { + int w = width(); + assert(w <= N); + + float *returned_result = new float[N*4]; + float *vfloat = new float[N*4]; + double *vdouble = new double[N*4]; + int *vint = new int[N*4]; + int *vint2 = new int[N*4]; + + for (int i = 0; i < N*4; ++i) { + returned_result[i] = -1e20; + vfloat[i] = i+1; + vdouble[i] = i+1; + vint[i] = 2*(i+1); + vint2[i] = i+5; + } + + float b = 5.; + +#if (TEST_SIG == 0) + f_v(returned_result); +#elif (TEST_SIG == 1) + f_f(returned_result, vfloat); +#elif (TEST_SIG == 2) + f_fu(returned_result, vfloat, b); +#elif (TEST_SIG == 3) + f_fi(returned_result, vfloat, vint); +#elif (TEST_SIG == 4) + f_du(returned_result, vdouble, 5.); +#elif (TEST_SIG == 5) + f_duf(returned_result, vdouble, 5.f); +#elif (TEST_SIG == 6) + f_di(returned_result, vdouble, vint2); +#else +#error "Unknown or unset TEST_SIG value" +#endif + + float *expected_result = new float[N]; + memset(expected_result, 0, N*sizeof(float)); + result(expected_result); + + int errors = 0; + for (int i = 0; i < w; ++i) { + if (returned_result[i] != expected_result[i]) + { +#ifdef EXPECT_FAILURE + // bingo, failed + return 1; +#else + printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n", + argv[0], i, returned_result[i], returned_result[i], + expected_result[i], expected_result[i]); + ++errors; +#endif // EXPECT_FAILURE + } + } + +#ifdef EXPECT_FAILURE + // Don't expect to get here + return 0; +#else + return errors > 0; +#endif +} diff --git a/tests/array-mixed-unif-vary-indexing-3.ispc b/tests/array-mixed-unif-vary-indexing-3.ispc index ab3a7a7c..c6623cf6 100644 --- a/tests/array-mixed-unif-vary-indexing-3.ispc +++ b/tests/array-mixed-unif-vary-indexing-3.ispc @@ -5,7 +5,13 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; assert(programCount <= 64); +#ifdef __NVPTX__ + uniform float * uniform xarr = uniform new uniform float[70*70]; + uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr; +#define _SHMALLOC +#else uniform float x[70][70]; +#endif for (uniform int i = 0; i < 70; ++i) for (uniform int j = 0; j < 70; ++j) x[i][j] = 2+b-5; @@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { else x[b-1][a-1] = 1; RET[programIndex] = x[4][a]; + +#ifdef _SHMALLOC + delete xarr; +#endif } export void result(uniform float RET[]) { diff --git a/tests/broadcast.ispc b/tests/broadcast.ispc index 1df835ae..6dfa1a00 100644 --- a/tests/broadcast.ispc +++ b/tests/broadcast.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = 
aFOO[programIndex]; - float b = (programCount == 1) ? 3 : broadcast(a, 2); + float b = (programCount == 1) ? 4 : broadcast(a, 2); RET[programIndex] = b; } diff --git a/tests/c-test-64.ispc b/tests/c-test-64.ispc index 3429bf91..d2602bc7 100644 --- a/tests/c-test-64.ispc +++ b/tests/c-test-64.ispc @@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 2; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 5; - RET[3] = RET[7] = RET[11] = RET[15] = 6; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 2; + RET[i+1] = 3; + RET[i+2] = 5; + RET[i+3] = 6; + } } diff --git a/tests/c-test-65.ispc b/tests/c-test-65.ispc index 9a363864..15df6367 100644 --- a/tests/c-test-65.ispc +++ b/tests/c-test-65.ispc @@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) { export void result(uniform float RET[]) { RET[programIndex] = 3; - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+3] = 29; + } } diff --git a/tests/c-test-66.ispc b/tests/c-test-66.ispc index a6c35dc7..22511604 100644 --- a/tests/c-test-66.ispc +++ b/tests/c-test-66.ispc @@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { RET[programIndex] = 32; - RET[2] = RET[6] = RET[10] = RET[14] = 38; - RET[3] = RET[7] = RET[11] = RET[15] = 39; + for (int i = 0; i < programCount; i += 4) + { + RET[i+2] = 38; + RET[i+3] = 39; + } } diff --git a/tests/cfor-array-struct-gather.ispc b/tests/cfor-array-struct-gather.ispc index c320ad7c..d433b00d 100644 --- a/tests/cfor-array-struct-gather.ispc +++ b/tests/cfor-array-struct-gather.ispc @@ -4,14 +4,14 @@ export uniform int width() { return programCount; } struct Foo { - uniform float x[17]; + uniform float x[programCount+1]; }; export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; uniform Foo foo; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo.x[i] = i; if ((int)a & 1) diff --git a/tests/cfor-gs-double-improve-multidim-1.ispc b/tests/cfor-gs-double-improve-multidim-1.ispc index ed672bd8..62124e2a 100644 --- a/tests/cfor-gs-double-improve-multidim-1.ispc +++ b/tests/cfor-gs-double-improve-multidim-1.ispc @@ -4,9 +4,9 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - uniform double udx[25][25]; - cfor (uniform int i = 0; i < 25; ++i) - cfor (uniform int j = 0; j < 25; ++j) + uniform double udx[programCount+1][programCount+1]; + cfor (uniform int i = 0; i < programCount+1; ++i) + cfor (uniform int j = 0; j < programCount+1; ++j) udx[i][j] = 10*i+j; int x = 1; diff --git a/tests/cfor-gs-improve-multidim-1.ispc b/tests/cfor-gs-improve-multidim-1.ispc index b0893617..32482ced 100644 --- a/tests/cfor-gs-improve-multidim-1.ispc +++ b/tests/cfor-gs-improve-multidim-1.ispc @@ -5,9 +5,9 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - uniform float udx[20][20]; - cfor (uniform int i = 0; i < 20; ++i) - cfor (uniform int j = 0; j < 20; ++j) + uniform float udx[programCount+1][programCount+1]; + cfor (uniform 
int i = 0; i < programCount+1; ++i) + cfor (uniform int j = 0; j < programCount+1; ++j) udx[i][j] = 100*i+j; int x = 1; diff --git a/tests/cfor-gs-improve-multidim-struct-1.ispc b/tests/cfor-gs-improve-multidim-struct-1.ispc index d599ceb9..0d682f9a 100644 --- a/tests/cfor-gs-improve-multidim-struct-1.ispc +++ b/tests/cfor-gs-improve-multidim-struct-1.ispc @@ -4,19 +4,27 @@ export uniform int width() { return programCount; } struct Foo { - uniform float udx[25][25]; + uniform float udx[32][32]; }; export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; +#ifndef __NVPTX__ uniform Foo f[5]; +#else /* too much shared memory allocated, nvcc fails to link */ + uniform Foo * uniform f = uniform new uniform Foo[5]; +#define _UNMALLOC +#endif cfor (uniform int i = 0; i < 5; ++i) - cfor (uniform int j = 0; j < 25; ++j) - cfor (uniform int k = 0; k < 25; ++k) + cfor (uniform int j = 0; j < 32; ++j) + cfor (uniform int k = 0; k < 32; ++k) f[i].udx[j][k] = 1000*i+100*j+k; int x = 1; RET[programIndex] = f[x+1].udx[b-4][programIndex]; +#ifdef _UNMALLOC + delete f; +#endif } export void result(uniform float RET[]) { RET[programIndex] = 2100 +programIndex; } diff --git a/tests/cfor-struct-gather-2.ispc b/tests/cfor-struct-gather-2.ispc index 7c615139..75da4a3f 100644 --- a/tests/cfor-struct-gather-2.ispc +++ b/tests/cfor-struct-gather-2.ispc @@ -13,9 +13,9 @@ float func(Foo foo[], int offset) { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = func(foo, (int)a); } diff --git a/tests/cfor-struct-gather-3.ispc b/tests/cfor-struct-gather-3.ispc index 7c615139..75da4a3f 100644 --- a/tests/cfor-struct-gather-3.ispc +++ b/tests/cfor-struct-gather-3.ispc @@ -13,9 +13,9 @@ float func(Foo foo[], int offset) { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = func(foo, (int)a); } diff --git a/tests/cfor-struct-gather.ispc b/tests/cfor-struct-gather.ispc index 49928a6b..9265da32 100644 --- a/tests/cfor-struct-gather.ispc +++ b/tests/cfor-struct-gather.ispc @@ -9,9 +9,9 @@ struct Foo { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = foo[(int)a].f; } diff --git a/tests/cfor-struct-test-114.ispc b/tests/cfor-struct-test-114.ispc index 0ea2f65a..e7b83a79 100644 --- a/tests/cfor-struct-test-114.ispc +++ b/tests/cfor-struct-test-114.ispc @@ -10,9 +10,9 @@ struct Foo { export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) { float a = aFOO[programIndex]; int b = bFOO[programIndex]; - varying Foo myFoo[17]; + varying Foo myFoo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) { + cfor (i = 0; i < programCount+1; ++i) { myFoo[i].x = i; myFoo[i].f = 2*i; } diff --git a/tests/cfor-test-134.ispc b/tests/cfor-test-134.ispc index 96493dff..0e8af645 100644 --- a/tests/cfor-test-134.ispc +++ b/tests/cfor-test-134.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], 
uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-135.ispc b/tests/cfor-test-135.ispc index 5926ba30..9f17350e 100644 --- a/tests/cfor-test-135.ispc +++ b/tests/cfor-test-135.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-136.ispc b/tests/cfor-test-136.ispc index 62834f67..e7ac9f75 100644 --- a/tests/cfor-test-136.ispc +++ b/tests/cfor-test-136.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-64.ispc b/tests/cfor-test-64.ispc index 9c51c9b0..eb2cbec0 100644 --- a/tests/cfor-test-64.ispc +++ b/tests/cfor-test-64.ispc @@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 2; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 5; - RET[3] = RET[7] = RET[11] = RET[15] = 6; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 2; + RET[i+1] = 3; + RET[i+2] = 5; + RET[i+3] = 6; + } } diff --git a/tests/cfor-test-65.ispc b/tests/cfor-test-65.ispc index a3c11c6d..28f82225 100644 --- a/tests/cfor-test-65.ispc +++ b/tests/cfor-test-65.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-66.ispc b/tests/cfor-test-66.ispc index d3698ffe..e53d2b94 100644 --- a/tests/cfor-test-66.ispc +++ b/tests/cfor-test-66.ispc @@ -18,8 +18,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 32; - RET[1] = RET[5] = RET[9] = RET[13] = 32; - RET[2] = RET[6] = RET[10] = RET[14] = 38; - RET[3] = RET[7] = RET[11] = RET[15] = 39; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 32; + RET[i+1] = 32; + RET[i+2] = 38; + RET[i+3] = 39; + } } diff --git a/tests/cfor-unif-struct-test-114.ispc b/tests/cfor-unif-struct-test-114.ispc index 114e826d..59649fd0 100644 --- a/tests/cfor-unif-struct-test-114.ispc +++ b/tests/cfor-unif-struct-test-114.ispc @@ -8,9 +8,9 @@ struct Foo { }; export void f_fi(uniform 
float RET[], uniform float a[], uniform int bFOO[]) { int b = bFOO[programIndex]; - uniform struct Foo myFoo[17]; + uniform struct Foo myFoo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) { + cfor (i = 0; i < programCount+1; ++i) { myFoo[i].x = i; myFoo[i].f = 2*i; } diff --git a/tests/const-fold-1.ispc b/tests/const-fold-1.ispc index fc4717ce..95b46cea 100644 --- a/tests/const-fold-1.ispc +++ b/tests/const-fold-1.ispc @@ -6,7 +6,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; uniform int x = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2); - static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2); + const static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2); RET[programIndex] = (x == y) ? 1. : 0.; } diff --git a/tests/const-fold-2.ispc b/tests/const-fold-2.ispc index 88743d2f..4e0ea5b6 100644 --- a/tests/const-fold-2.ispc +++ b/tests/const-fold-2.ispc @@ -6,7 +6,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; uniform int x = (170 >> 4) % 5; - static uniform int y = (170 >> 4) % 5; + const static uniform int y = (170 >> 4) % 5; RET[programIndex] = (x == y) ? 1. : 0.; } diff --git a/tests/const-fold-3.ispc b/tests/const-fold-3.ispc index cf5bc915..15c49e92 100644 --- a/tests/const-fold-3.ispc +++ b/tests/const-fold-3.ispc @@ -6,7 +6,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; uniform int x = (17 < 2) || (6 >= 5) && (20 >= 20); - static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20); + const static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20); RET[programIndex] = ((x!=0) == (y!=0)) ? 1. 
: 0.; } diff --git a/tests/launch-8.ispc b/tests/launch-8.ispc index eacba673..9855a963 100644 --- a/tests/launch-8.ispc +++ b/tests/launch-8.ispc @@ -2,22 +2,23 @@ export uniform int width() { return programCount; } -#define N0 10 +#define N0 12 #define N1 20 #define N2 50 static uniform float array[N2][N1][N0]; -task void x(const float f) { +task void x(const uniform float farray[]) { + const float f = farray[programIndex]; uniform int j; - assert(taskCount == (int32)N0*N1*N2); - assert(taskCount0 == (int32)N0); - assert(taskCount1 == (int32)N1); - assert(taskCount2 == (int32)N2); - assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2)); - assert(taskIndex0 < (int32)N0); - assert(taskIndex1 < (int32)N1); - assert(taskIndex2 < (int32)N2); + assert(taskCount == (uniform int32)N0*N1*N2); + assert(taskCount0 == (uniform int32)N0); + assert(taskCount1 == (uniform int32)N1); + assert(taskCount2 == (uniform int32)N2); + assert(taskIndex == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2)); + assert(taskIndex0 < (uniform int32)N0); + assert(taskIndex1 < (uniform int32)N1); + assert(taskIndex2 < (uniform int32)N2); const uniform int i0 = taskIndex0; const uniform int i1 = taskIndex1; @@ -30,7 +31,7 @@ task void x(const float f) { array[i2][i1][i0] = i; } export void f_f(uniform float RET[], uniform float fFOO[]) { - float f = fFOO[programIndex]; + uniform float * uniform f = fFOO; launch[N2][N1][N0] x(f); sync; RET[programIndex] = array[N2-1][N1-1][N0-1]; @@ -38,5 +39,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) { export void result(uniform float RET[]) { - RET[programIndex] = 9999.000000; + RET[programIndex] = 11999.000000; } diff --git a/tests/launch-9.ispc b/tests/launch-9.ispc index 1952e8e7..dbbb9f80 100644 --- a/tests/launch-9.ispc +++ b/tests/launch-9.ispc @@ -2,12 +2,13 @@ export uniform int width() { return programCount; } -#define N0 10 +#define N0 12 #define N1 20 #define N2 50 static uniform float array[N2][N1][N0]; -task void x(const float f) { +task void x(const uniform float farray[]) { + const float f = farray[programIndex]; uniform int j; assert(taskCount == (int32)N0*N1*N2); @@ -30,13 +31,13 @@ task void x(const float f) { array[i2][i1][i0] = i; } export void f_f(uniform float RET[], uniform float fFOO[]) { - float f = fFOO[programIndex]; - launch[N0,N1,N2] x(f); + uniform float * uniform f = fFOO; + launch[N2][N1][N0] x(f); sync; RET[programIndex] = array[N2-1][N1-1][N0-1]; } export void result(uniform float RET[]) { - RET[programIndex] = 9999.000000; + RET[programIndex] = 11999.000000; } diff --git a/tests/operators2.ispc b/tests/operators2.ispc index b732b24a..daef4ec6 100644 --- a/tests/operators2.ispc +++ b/tests/operators2.ispc @@ -1,4 +1,9 @@ +#ifdef __NVPTX__ +uniform int _off[programCount]; +#define off _off[programIndex] +#else /* global varying data types are not yet supported with "nvptx" target */ int off; +#endif export uniform int width() { return programCount; } @@ -22,11 +27,11 @@ struct S operator/(struct S rr, struct S rv) { return c; } -struct S a; -struct S b; -struct S d; export void f_f(uniform float RET[], uniform float aFOO[]) { + struct S a; + struct S b; + struct S d; int T = programIndex; a.a = aFOO[programIndex]; b.a = -aFOO[programIndex]; diff --git a/tests/soa-16.ispc b/tests/soa-16.ispc index f23c39cb..3c6ff6c4 100644 --- a/tests/soa-16.ispc +++ b/tests/soa-16.ispc @@ -15,6 +15,16 @@ static void p(uniform float *uniform ptr) { } export void f_fu(uniform 
float RET[], uniform float aFOO[], uniform float b) { +#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */ + soa<4> Point pts[10]; + for (uniform int i = 0; i < 40; ++i) { + pts[i].x = b*i; + pts[i].y[0] = 2*b*i; + pts[i].y[1] = 2*b*i+1; + pts[i].y[2] = 2*b*i+2; + pts[i].z = 3*b*i; + } +#else soa<4> Point pts[30]; for (uniform int i = 0; i < 120; ++i) { pts[i].x = b*i; @@ -23,6 +33,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { pts[i].y[2] = 2*b*i+2; pts[i].z = 3*b*i; } +#endif float a = aFOO[programIndex]; a *= -1; diff --git a/tests/soa-17.ispc b/tests/soa-17.ispc index f25b85bd..5dc9ea2f 100644 --- a/tests/soa-17.ispc +++ b/tests/soa-17.ispc @@ -16,6 +16,16 @@ static void p(uniform float *uniform ptr) { } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { +#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */ + soa<4> Point pts[15]; + for (uniform int i = 0; i < 60; ++i) { + pts[i].x = b*i; + pts[i].y[0] = 2*b*i; + pts[i].y[1] = 2*b*i+1; + pts[i].y[2] = 2*b*i+2; + pts[i].z = 3*b*i; + } +#else soa<4> Point pts[40]; for (uniform int i = 0; i < 160; ++i) { pts[i].x = b*i; @@ -24,6 +34,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { pts[i].y[2] = 2*b*i+2; pts[i].z = 3*b*i; } +#endif float a = aFOO[programIndex]; a *= -1; diff --git a/tests/soa-22.ispc b/tests/soa-22.ispc index 60448694..ba3ffa0c 100644 --- a/tests/soa-22.ispc +++ b/tests/soa-22.ispc @@ -25,7 +25,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { } } } - + assert(programIndex < 80); RET[programIndex] = pts[programIndex].pts[programIndex % 3][programIndex % 4].z; } diff --git a/tests/soa-3.ispc b/tests/soa-3.ispc index 2cec07a5..86c7c57c 100644 --- a/tests/soa-3.ispc +++ b/tests/soa-3.ispc @@ -6,6 +6,17 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; +#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */ + soa<8> Point pts[4]; +//CO uniform Point pts[80]; + foreach (i = 0 ... 40) { + pts[i].x = b*i; + pts[i].y[0] = 2*b*i; + pts[i].y[1] = 2*b*i+1; + pts[i].y[2] = 2*b*i+2; + pts[i].z = 3*b*i; + } +#else soa<8> Point pts[10]; //CO uniform Point pts[80]; foreach (i = 0 ... 
80) { @@ -15,6 +26,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { pts[i].y[2] = 2*b*i+2; pts[i].z = 3*b*i; } +#endif assert(programCount < 80); RET[programIndex] = pts[programIndex].y[2]; diff --git a/tests/test-134.ispc b/tests/test-134.ispc index baa8ec37..9d4d0e94 100644 --- a/tests/test-134.ispc +++ b/tests/test-134.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/test-135.ispc b/tests/test-135.ispc index c350a524..bb9881e6 100644 --- a/tests/test-135.ispc +++ b/tests/test-135.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/test-136.ispc b/tests/test-136.ispc index ab6c6b5b..098ac456 100644 --- a/tests/test-136.ispc +++ b/tests/test-136.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/test-140.ispc b/tests/test-140.ispc index a983d528..997d558e 100644 --- a/tests/test-140.ispc +++ b/tests/test-140.ispc @@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { } export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0; - RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1; - RET[2] = RET[6] = RET[10] = RET[14] = 0x1.193ea8p+0; - RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 0x0.0p+0; + RET[i+1] = 0x1.62e43p-1; + RET[i+2] = 0x1.193ea8p+0; + RET[i+3] = 0x1.62e43p+0; + } } diff --git a/tests/test-141.ispc b/tests/test-141.ispc index b69be1fa..9045c081 100644 --- a/tests/test-141.ispc +++ b/tests/test-141.ispc @@ -5,7 +5,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; // calculation error 1e-6 is the same as in icc - RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0; + RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 
1 : 0; } export void result(uniform float RET[4]) { diff --git a/tests/test-142.ispc b/tests/test-142.ispc index 18053402..9ab8ff9f 100644 --- a/tests/test-142.ispc +++ b/tests/test-142.ispc @@ -4,7 +4,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - RET[programIndex] = round(a+.499999); + RET[programIndex] = round(a+.49999); } export void result(uniform float RET[]) { diff --git a/tests/test-144.ispc b/tests/test-144.ispc index 568bdc10..64e1817a 100644 --- a/tests/test-144.ispc +++ b/tests/test-144.ispc @@ -4,7 +4,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - RET[programIndex] = floor(a+.999999); + RET[programIndex] = floor(a+.99999); } export void result(uniform float RET[]) { diff --git a/tests/uniform-1.ispc b/tests/uniform-1.ispc new file mode 100644 index 00000000..dcf4eab0 --- /dev/null +++ b/tests/uniform-1.ispc @@ -0,0 +1,34 @@ + +export uniform int width() { return programCount; } + + +task void f_f_task(uniform float RET[], uniform float aFOO[]) { + uniform float val[programCount]; + for (uniform int i = 0; i < programCount; ++i) + val[i] = 0; + + foreach (i = 0 ... programCount) + val[i] += aFOO[programCount*taskIndex + i] - 1; + + uniform float sum = 0; + for (uniform int i = 0; i < programCount; ++i) + sum += val[i]; + + if (programIndex < 32/4) + RET[programCount/4*taskIndex + programIndex] = sum; +} + +export void f_f(uniform float RET[], uniform float aFOO[]) +{ + launch[4] f_f_task(RET, aFOO); +} +task void result_task(uniform float RET[]) +{ + const uniform float ret = reduce_add(programIndex + programCount*taskIndex); + if (programIndex < 32/4) + RET[programCount/4*taskIndex + programIndex] = ret; +} + +export void result(uniform float RET[]) { + launch[4] result_task(RET); +} diff --git a/type.cpp b/type.cpp index cf7ac85d..00795737 100644 --- a/type.cpp +++ b/type.cpp @@ -749,7 +749,7 @@ EnumType::Mangle() const { std::string ret; if (isConst) ret += "C"; ret += variability.MangleString(); - ret += std::string("enum[") + name + std::string("]"); + ret += std::string("enum_5B_") + name + std::string("_5C_"); return ret; } @@ -1420,7 +1420,7 @@ ArrayType::Mangle() const { sprintf(buf, "%d", numElements); else buf[0] = '\0'; - return s + "[" + buf + "]"; + return s + "_5B_" + buf + "_5C_"; } @@ -2058,12 +2058,12 @@ lMangleStruct(Variability variability, bool isConst, const std::string &name) { Assert(variability != Variability::Unbound); std::string ret; - ret += "s["; + ret += "s_5B_"; if (isConst) ret += "_c_"; ret += variability.MangleString(); - ret += name + std::string("]"); + ret += name + std::string("_5C_"); return ret; } @@ -3009,7 +3009,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { llvmArgTypes.push_back(LLVMTypes::MaskType); std::vector callTypes; - if (isTask) { + if (isTask && g->target->getISA() != Target::NVPTX) { // Tasks take three arguments: a pointer to a struct that holds the // actual task arguments, the thread index, and the total number of // threads the tasks system has running. (Task arguments are