diff --git a/LICENSE.txt b/LICENSE.txt index 93c4d816..0b3959a5 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -141,3 +141,46 @@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--------------------------------------------------------------------------- + +The ptxtools use parts of the PTX parser code from GPU Ocelot project +(https://code.google.com/p/gpuocelot/), which is covered by the following +license: + +Copyright 2011 +GEORGIA TECH RESEARCH CORPORATION +ALL RIGHTS RESERVED + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimers. + * Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the +distribution. + * Neither the name of GEORGIA TECH RESEARCH CORPORATION nor the +names of its contributors may be used to endorse or promote +products derived from this software without specific prior +written permission. + +THIS SOFTWARE IS PROVIDED BY GEORGIA TECH RESEARCH CORPORATION ''AS IS'' +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEORGIA TECH RESEARCH +CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You agree that the Software will not be shipped, transferred, exported, +or re-exported directly into any country prohibited by the United States +Export Administration Act and the regulations thereunder nor will be +used for any purpose prohibited by the Act. + + diff --git a/Makefile b/Makefile index b30cc50e..f04bc3de 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2010-2013, Intel Corporation +# Copyright (c) 2010-2014, Intel Corporation # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,15 +45,15 @@ Location of LLVM files in your PATH is different than path in LLVM_HOME \n variable (or LLVM_HOME is not set). The most likely this means that you are \n using default LLVM installation on your system, which is very bad sign. \n Note, that ISPC uses LLVM optimizer and is highly dependent on it. We recommend \n -using *patched* version of LLVM 3.3 or 3.4. Patches are availible in \n +using a *patched* version of LLVM 3.4 or 3.5. Patches are available in \n llvm_patches folder. You can build LLVM manually, or run our scripts, which \n will do all the work for you. Do the following: \n 1. Create a folder, where LLVM will reside and set LLVM_HOME variable to its \n path. \n 2. Set ISPC_HOME variable to your ISPC location (probably current folder). 3.
Run alloy.py tool to checkout and build LLVM: \n - alloy.py -b --version=3.4 \n -4. Add $$LLVM_HOME/bin-3.4/bin path to your PATH. \n + alloy.py -b --version=3.5 \n +4. Add $$LLVM_HOME/bin-3.5/bin path to your PATH. \n ============================================================================== endef @@ -73,6 +73,10 @@ endif # To enable: make ARM_ENABLED=1 ARM_ENABLED=0 +# Disable NVPTX by request +# To enable: make NVPTX_ENABLED=1 +NVPTX_ENABLED=0 + # Add llvm bin to the path so any scripts run will go to the right llvm-config LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir) export PATH:=$(LLVM_BIN):$(PATH) @@ -89,7 +93,7 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags) LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e 's/svn//' -e 's/\./_/' -e 's/\..*//') LLVM_VERSION_DEF=-D$(LLVM_VERSION) -LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker +LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker # Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step. # We check if it's available before adding it (to not break 3.2 and earlier). ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1) @@ -98,6 +102,9 @@ endif ifneq ($(ARM_ENABLED), 0) LLVM_COMPONENTS+=arm endif +ifneq ($(NVPTX_ENABLED), 0) + LLVM_COMPONENTS+=nvptx +endif LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS)) CLANG=clang @@ -160,12 +167,19 @@ endif ifneq ($(ARM_ENABLED), 0) CXXFLAGS+=-DISPC_ARM_ENABLED endif +ifneq ($(NVPTX_ENABLED), 0) + CXXFLAGS+=-DISPC_NVPTX_ENABLED +endif LDFLAGS= ifeq ($(ARCH_OS),Linux) # try to link everything statically under Linux (including libstdc++) so # that the binaries we generate will be portable across distributions... # LDFLAGS=-static + # Linking everything statically isn't easy (too many things are required), + # but statically linking libstdc++ and libgcc is necessary when building with a + # relatively new gcc and distributing the binaries to older systems. +# LDFLAGS=-static-libgcc -static-libstdc++ endif LEX=flex @@ -184,6 +198,9 @@ TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2- ifneq ($(ARM_ENABLED), 0) TARGETS+=neon-32 neon-16 neon-8 endif +ifneq ($(NVPTX_ENABLED), 0) + TARGETS+=nvptx +endif # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version.
@@ -289,15 +306,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) +objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ -objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ -objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ diff --git a/alloy.py b/alloy.py index 13d38660..f932b5f3 100755 --- a/alloy.py +++ b/alloy.py @@ -226,7 +226,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, try_do_LLVM("configure release version for selfbuild ", "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN_selfbuild + " --enable-optimized" + - " --enable-targets=x86,x86_64" + + " --enable-targets=x86,x86_64,nvptx" + ((" --with-gcc-toolchain=" + gcc_toolchain_path) if gcc_toolchain_path != "" else "") + mac_system_root, from_validation) @@ -244,7 +244,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, try_do_LLVM("configure release version ", "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN + " --enable-optimized" + selfbuild_compiler + - " --enable-targets=x86,x86_64" + + " --enable-targets=x86,x86_64,nvptx" + ((" --with-gcc-toolchain=" + gcc_toolchain_path) if gcc_toolchain_path != "" else "") + mac_system_root, from_validation) @@ -257,7 +257,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, try_do_LLVM("configure debug version ", "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN + " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler + - " --enable-targets=x86,x86_64" + + " --enable-targets=x86,x86_64,nvptx" + ((" --with-gcc-toolchain=" + gcc_toolchain_path) if gcc_toolchain_path != "" else "") + mac_system_root, from_validation) diff --git a/builtins.cpp b/builtins.cpp index 52fd000c..2a1df0eb 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -342,11 +342,17 @@ lSetInternalFunctions(llvm::Module *module) { "__all", "__any", "__aos_to_soa3_float", +//#ifdef ISPC_NVPTX_ENABLED + "__aos_to_soa3_float1", +//#endif /* ISPC_NVPTX_ENABLED */ "__aos_to_soa3_float16", "__aos_to_soa3_float4", "__aos_to_soa3_float8", "__aos_to_soa3_int32", "__aos_to_soa4_float", +//#ifdef ISPC_NVPTX_ENABLED + "__aos_to_soa4_float1", +//#endif /* ISPC_NVPTX_ENABLED */ "__aos_to_soa4_float16", "__aos_to_soa4_float4", "__aos_to_soa4_float8", @@ -395,6 +401,38 @@ 
lSetInternalFunctions(llvm::Module *module) { "__atomic_xor_int64_global", "__atomic_xor_uniform_int32_global", "__atomic_xor_uniform_int64_global", +//#ifdef ISPC_NVPTX_ENABLED + "__atomic_add_varying_int32_global", + "__atomic_add_varying_int64_global", + "__atomic_and_varying_int32_global", + "__atomic_and_varying_int64_global", + "__atomic_compare_exchange_varying_double_global", + "__atomic_compare_exchange_varying_float_global", + "__atomic_compare_exchange_varying_int32_global", + "__atomic_compare_exchange_varying_int64_global", + "__atomic_max_varying_int32_global", + "__atomic_max_varying_int64_global", + "__atomic_min_varying_int32_global", + "__atomic_min_varying_int64_global", + "__atomic_or_varying_int32_global", + "__atomic_or_varying_int64_global", + "__atomic_sub_varying_int32_global", + "__atomic_sub_varying_int64_global", + "__atomic_swap_varying_double_global", + "__atomic_swap_varying_float_global", + "__atomic_swap_varying_int32_global", + "__atomic_swap_varying_int64_global", + "__atomic_umax_varying_uint32_global", + "__atomic_umax_varying_uint64_global", + "__atomic_umin_varying_uint32_global", + "__atomic_umin_varying_uint64_global", + "__atomic_xor_uniform_int32_global", + "__atomic_xor_uniform_int64_global", + "__atomic_xor_varying_int32_global", + "__atomic_xor_varying_int64_global", + "__atomic_xor_varying_int32_global", + "__atomic_xor_varying_int64_global", +//#endif /* ISPC_NVPTX_ENABLED */ "__broadcast_double", "__broadcast_float", "__broadcast_i16", @@ -417,6 +455,9 @@ lSetInternalFunctions(llvm::Module *module) { "__do_assert_uniform", "__do_assert_varying", "__do_print", +//#ifdef ISPC_NVPTX_ENABLED + "__do_print_nvptx", +//#endif /* ISPC_NVPTX_ENABLED */ "__doublebits_uniform_int64", "__doublebits_varying_int64", "__exclusive_scan_add_double", @@ -431,6 +472,10 @@ lSetInternalFunctions(llvm::Module *module) { "__extract_int32", "__extract_int64", "__extract_int8", +//#ifdef ISPC_NVPTX_ENABLED + "__extract_float", + "__extract_double", +//#endif /* ISPC_NVPTX_ENABLED */ "__fastmath", "__float_to_half_uniform", "__float_to_half_varying", @@ -447,6 +492,10 @@ lSetInternalFunctions(llvm::Module *module) { "__insert_int32", "__insert_int64", "__insert_int8", +//#ifdef ISPC_NVPTX_ENABLED + "__insert_float", + "__insert_double", +//#endif /* ISPC_NVPTX_ENABLED */ "__intbits_uniform_double", "__intbits_uniform_float", "__intbits_varying_double", @@ -483,6 +532,9 @@ lSetInternalFunctions(llvm::Module *module) { "__min_varying_uint32", "__min_varying_uint64", "__movmsk", +//#ifdef ISPC_NVPTX_ENABLED + "__movmsk_ptx", +//#endif /* ISPC_NVPTX_ENABLED */ "__new_uniform_32rt", "__new_uniform_64rt", "__new_varying32_32rt", @@ -581,6 +633,10 @@ lSetInternalFunctions(llvm::Module *module) { "__soa_to_aos3_float8", "__soa_to_aos3_int32", "__soa_to_aos4_float", +//#ifdef ISPC_NVPTX_ENABLED + "__soa_to_aos3_float1", + "__soa_to_aos4_float1", +//#endif /* ISPC_NVPTX_ENABLED */ "__soa_to_aos4_float16", "__soa_to_aos4_float4", "__soa_to_aos4_float8", @@ -681,6 +737,26 @@ lSetInternalFunctions(llvm::Module *module) { "__vec4_add_float", "__vec4_add_int32", "__vselect_float", +//#ifdef ISPC_NVPTX_ENABLED + "__program_index", + "__program_count", + "__warp_index", + "__task_index0", + "__task_index1", + "__task_index2", + "__task_index", + "__task_count0", + "__task_count1", + "__task_count2", + "__task_count", + "__cvt_loc2gen", + "__cvt_loc2gen_var", + "__cvt_const2gen", + "__puts_nvptx", + "ISPCAlloc", + "ISPCLaunch", + "ISPCSync", +//#endif /* ISPC_NVPTX_ENABLED */ 
"__vselect_i32" }; @@ -759,6 +835,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, g->target->getISA() != Target::NEON16 && g->target->getISA() != Target::NEON8) #endif // !__arm__ +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() != Target::NVPTX) +#endif /* ISPC_NVPTX_ENABLED */ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || mTriple.getArch() == bcTriple.getArch()); @@ -954,6 +1033,19 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { +#ifdef ISPC_NVPTX_ENABLED + case Target::NVPTX: + { + if (runtime32) { + fprintf(stderr, "Unfortunatly 32bit targets are not supported at the moment .. \n"); + assert(0); + } + else { + EXPORT_MODULE(builtins_bitcode_nvptx_64bit); + } + break; + }; +#endif /* ISPC_NVPTX_ENABLED */ #ifdef ISPC_ARM_ENABLED case Target::NEON8: { @@ -1224,7 +1316,18 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } // define the 'programCount' builtin variable - lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + lDefineConstantInt("programCount", 32, module, symbolTable); + } + else + { +#endif /* ISPC_NVPTX_ENABLED */ + lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); +#ifdef ISPC_NVPTX_ENABLED + } +#endif /* ISPC_NVPTX_ENABLED */ // define the 'programIndex' builtin lDefineProgramIndex(module, symbolTable); @@ -1256,6 +1359,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(), module, symbolTable); +#ifdef ISPC_NVPTX_ENABLED + lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX), + module, symbolTable); +#else + lDefineConstantInt("__is_nvptx_target", (int)0, module, symbolTable); +#endif /* ISPC_NVPTX_ENABLED */ + if (g->forceAlignment != -1) { llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true); alignment->setInitializer(LLVMInt32(g->forceAlignment)); diff --git a/builtins/__do_print_nvptx.cu b/builtins/__do_print_nvptx.cu new file mode 100644 index 00000000..dc1bbcce --- /dev/null +++ b/builtins/__do_print_nvptx.cu @@ -0,0 +1,130 @@ +#include + +#define PRINT_BUF_SIZE 4096 +#define uint64_t unsigned long long + +static __device__ size_t d_strlen(const char *str) +{ + const char *s; + + for (s = str; *s; ++s) + ; + return (s - str); +} + +static __device__ char* d_strncat(char *dest, const char *src, size_t n) +{ + size_t dest_len = d_strlen(dest); + size_t i; + + for (i = 0 ; i < n && src[i] != '\0' ; i++) + dest[dest_len + i] = src[i]; + dest[dest_len + i] = '\0'; + + return dest; +} + +#define APPEND(str) \ + do { \ + int offset = bufp - &printString[0]; \ + *bufp = '\0'; \ + d_strncat(bufp, str, PRINT_BUF_SIZE-offset); \ + bufp += d_strlen(str); \ + if (bufp >= &printString[PRINT_BUF_SIZE]) \ + goto done; \ + } while (0) /* eat semicolon */ + + +#define PRINT_SCALAR(fmt, type) \ + sprintf(tmpBuf, fmt, *((type *)ptr)); \ + APPEND(tmpBuf); \ + break + +#define PRINT_VECTOR(fmt, type) \ + *bufp++ = '['; \ + if (bufp == &printString[PRINT_BUF_SIZE]) break; \ + for (int i = 0; i < width; ++i) { \ + /* only print the value if the current lane is executing */ \ + type val0 = *((type*)ptr); \ + type val = val0; \ + if 
(mask & (1ull< + %in0 = extractelement <2 x i32> %in, i32 0 + %in1 = extractelement <2 x i32> %in, i32 1 + %out0 = tail call i32 @$1_i32_nvptx(i32 %in0, i32 %1) + %out1 = tail call i32 @$1_i32_nvptx(i32 %in1, i32 %1) + %out2 = insertelement <2 x i32> undef, i32 %out0, i32 0 + %out = insertelement <2 x i32> %out2, i32 %out1, i32 1 + %ret = bitcast <2 x i32> %out to $2 + ret $2 %ret +} +') +shfl64(__shfl, i64) +shfl64(__shfl_xor, i64) +shfl64(__shfl, double) +shfl64(__shfl_xor, double) + +;;;;;;;;;;;;; +define internal i32 @__ballot_nvptx(i1) nounwind readnone alwaysinline +{ + %conv = zext i1 %0 to i32 + %res = tail call i32 asm sideeffect + "{ .reg .pred %p1; + setp.ne.u32 %p1, $1, 0; + vote.ballot.b32 $0, %p1; + }", "=r,r"(i32 %conv) + ret i32 %res +} +define internal i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline +{ + %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() + ret i32 %mask +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; tasking + +;; this call allocates a parameter buffer for a kernel launch +declare i64 @cudaGetParameterBuffer(i64, i64) nounwind +define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline +{ +entry: + %and = call i32 @__program_index() + %cmp = icmp eq i32 %and, 0 + %align = zext i32 %align32 to i64 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %ptri64tmp = call i64 @cudaGetParameterBuffer(i64 %align, i64 %size); + br label %if.end + +if.end: + %ptri64 = phi i64 [ %ptri64tmp, %if.then ], [ 0, %entry ] + %ptr = inttoptr i64 %ptri64 to i8* + ret i8* %ptr +} + +;; this actually launches a kernel +module asm " +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); +" +define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline +{ +entry: +;; only 1 lane must launch the kernel !!!
+ %func_i64 = ptrtoint i8* %func_ptr to i64 + %args_i64 = ptrtoint i8* %func_args to i64 + +;; nbx = (%ntx-1)/(blocksize/warpsize) + 1 for blocksize=128 & warpsize=32 + %ntxm1 = add nsw i32 %ntx, -1 +;; %ntxm1d4 = sdiv i32 %ntxm1, 4 + %ntxm1d4 = ashr i32 %ntxm1, 2 + %nbx = add nsw i32 %ntxm1d4, 1 + %and = call i32 @__program_index() +;; if (laneIdx == 0) + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + + %res_tmp = call i32 asm sideeffect "{ + .param .b64 param0; + st.param.b64 [param0+0], $1; + .param .b64 param1; + st.param.b64 [param1+0], $2; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], $3; + st.param.b32 [param2+4], $4; + st.param.b32 [param2+8], $5; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], $6; + st.param.b32 [param3+4], $7; + st.param.b32 [param3+8], $8; + .param .b32 param4; + st.param.b32 [param4+0], $9; + .param .b64 param5; + st.param.b64 [param5+0], $10; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 $0, [retval0+0]; + } + ", +"=r, l,l, r,r,r, r,r,r, r,l"( + i64 %func_i64,i64 %args_i64, + i32 %nbx,i32 %nty,i32 %ntz, + i32 128,i32 1,i32 1, i32 0,i64 0); + br label %if.end + +if.end: ; preds = %if.then, %entry +;; %res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ] + + ret void +} + +;; this synchronizes kernel +declare i32 @cudaDeviceSynchronize() nounwind +define void @ISPCSync(i8*) nounwind alwaysinline +{ + call i32 @cudaDeviceSynchronize() + ret void; +} + + +;;;;;;;;;;;;;; + + + +include(`util-nvptx.m4') + +stdlib_core() +packed_load_and_store() +int64minmax() +rdrand_decls() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; broadcast/rotate/shuffle + +define_shuffles() + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; aos/soa + +aossoa() + +;; dummy 1 wide vector ops +declare void +@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, + <1 x float> %v3, <1 x float> * noalias %out0, + <1 x float> * noalias %out1, <1 x float> * noalias %out2, + <1 x float> * noalias %out3) nounwind alwaysinline ; + +declare void +@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, + <1 x float> %v3, <1 x float> * noalias %out0, + <1 x float> * noalias %out1, <1 x float> * noalias %out2, + <1 x float> * noalias %out3) nounwind alwaysinline ; + +declare void +@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1, + <1 x float> * %out2); + +declare void +@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1, + <1 x float> * %out2); + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @llvm.convert.from.fp16(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16(float) nounwind readnone +define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline +{ + ;; %res = call float @llvm.convert.from.fp16(i16 %v) + %res = tail call float asm sideeffect + "{ .reg .f16 tmp; + mov.b16 tmp, $1; + cvt.f32.f16 $0, tmp; + }", "=f,h"(i16 %v) + ret float %res +} +define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline +{ + ;; this will break the compiler, use inline asm similarly to above case + ;; %half = call i16 @llvm.convert.to.fp16(float %v) + %half = tail call i16 asm sideeffect + "{ .reg .f16 
tmp; + cvt.rn.f16.f32 tmp, $1; + mov.b16 $0, tmp; + }", "=h,f"(float %v) + ret i16 %half +} +define @__half_to_float_varying( %v) nounwind readnone alwaysinline +{ + %el = extractelement <1 x i16> %v, i32 0 + %sf = call float @__half_to_float_uniform(i16 %el) + %vf = insertelement <1 x float> undef, float %sf, i32 0 + ret <1 x float> %vf; +} +define @__float_to_half_varying( %v) nounwind readnone alwaysinline +{ + %el = extractelement <1 x float> %v, i32 0 + %sh = call i16 @__float_to_half_uniform(float %el) + %vh = insertelement <1 x i16> undef, i16 %sh, i32 0 + ret <1 x i16> %vh; +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +declare void @__fastmath() nounwind + +;; round/floor/ceil + +define internal float @__round_uniform_float_ptx(float) nounwind readnone alwaysinline +{ + %2 = tail call float asm sideeffect + "{ .reg .pred p<3>; .reg .s32 r<4>; .reg .f32 f<10>; + mov.f32 f4, $1; + abs.f32 f5, f4; + mov.b32 r1, f4; + and.b32 r2, r1, -2147483648; + or.b32 r3, r2, 1056964608; + mov.b32 f6, r3; + add.f32 f7, f6, f4; + cvt.rzi.f32.f32 f8, f7; + setp.gt.f32 p1, f5, 0f4B000000; + selp.f32 f9, f4, f8, p1; + setp.geu.f32 p2, f5, 0f3F000000; + @p2 bra BB2_2; + cvt.rzi.f32.f32 f9, f4; +BB2_2: + mov.f32 $0, f9; + }", "=f,f"(float %0) + ret float %2 +} +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to <1 x i32> + %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, + %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float> + %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, + %binop21.i = fadd <1 x float> %binop.i, + %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32> + %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} +define float @__floor_uniform_float(float) nounwind readnone alwaysinline +{ + %2 = tail call float asm sideeffect "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) + ret float %2 +} +define float @__ceil_uniform_float(float) nounwind readnone alwaysinline +{ + %2 = tail call float asm sideeffect "cvt.rpi.f32.f32 $0, $1;", "=f,f"(float %0) + ret float %2 +} + +define double @__round_uniform_double(double) nounwind readnone alwaysinline +{ + %2 = tail call double asm sideeffect + "{ + .reg .pred p<3>; + .reg .s32 r<6>; + .reg .f64 fd<9>; + + mov.f64 fd8, $1 + abs.f64 fd1, fd8; + setp.ge.f64 p1, fd1, 0d4330000000000000; + @p1 bra BB5_2; + + add.f64 fd5, fd1, 0d3FE0000000000000; + cvt.rzi.f64.f64 fd6, fd5; + setp.lt.f64 p2, fd1, 0d3FE0000000000000; + selp.f64 fd7, 0d0000000000000000, fd6, p2; + { + .reg .b32 temp; + mov.b64 {r1, temp}, fd7; + } + { + .reg .b32 temp; + mov.b64 {temp, r2}, fd7; + } + { + .reg .b32 temp; + mov.b64 {temp, r3}, fd8; + } + and.b32 r4, r3, -2147483648; + or.b32 r5, r2, r4; + mov.b64 fd8, {r1, r5}; + +BB5_2: + mov.f64 $0, fd8; + }", "=d,d"(double %0) + ret double %2 +} +define double @__floor_uniform_double(double) nounwind readnone alwaysinline +{ + %2 = tail call double asm sideeffect "cvt.rmi.f64.f64 $0, $1;", "=f,f"(double %0) + ret double %2 +} +define double @__ceil_uniform_double(double) nounwind readnone alwaysinline +{ + %2 = tail call double asm sideeffect "cvt.rpi.f64.f64 $0, $1;", "=f,f"(double %0) + ret double %2 +} + +define internal <1 x float> @__floor_varying_floatX(<1 x 
float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind + %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32> + %bitop.i = and <1 x i32> %val_to_boolvec32.i, + %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float> + %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <1 x float> %binop.i +} + +define(`rfc_varying',` +define <1 x $2> @__$1_varying_$2(<1 x $2>) nounwind readonly alwaysinline +{ + %val = extractelement <1 x $2> %0, i32 0 + %res = call $2 @__$1_uniform_$2($2 %val) + %ret = insertelement <1 x $2> undef, $2 %res, i32 0 + ret <1 x $2> %ret +} +') +rfc_varying(round, float) +rfc_varying(floor, float) +rfc_varying(ceil, float) +rfc_varying(round, double) +rfc_varying(floor, double) +rfc_varying(ceil, double) + +;; min/max uniform + +;; declare float @__max_uniform_float(float, float) nounwind readnone +;; declare float @__min_uniform_float(float, float) nounwind readnone +define float @__max_uniform_float(float, float) nounwind readonly alwaysinline { + %d = fcmp ogt float %0, %1 + %r = select i1 %d, float %0, float %1 + ret float %r + +} +define float @__min_uniform_float(float, float) nounwind readonly alwaysinline { + %d = fcmp olt float %0, %1 + %r = select i1 %d, float %0, float %1 + ret float %r + +} + +;; declare i32 @__min_uniform_int32(i32, i32) nounwind readnone +;; declare i32 @__max_uniform_int32(i32, i32) nounwind readnone +define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp slt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} +define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp sgt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +;; declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone +;; declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone +define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + %c = icmp ult i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} +define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + %c = icmp ugt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +;; declare i64 @__min_uniform_int64(i64, i64) nounwind readnone +;; declare i64 @__max_uniform_int64(i64, i64) nounwind readnone +define internal i64 @__min_uniform_int64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp slt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} +define internal i64 @__max_uniform_int64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp sgt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +;; declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone +;; declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone +define internal i64 @__min_uniform_uint64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp ult i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} +define internal i64 @__max_uniform_uint64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp ugt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +define double @__max_uniform_double(double, double) nounwind readonly alwaysinline { + %d = fcmp ogt double %0, %1 + %r = select i1 %d, double %0, double %1 + ret double %r +} +define double @__min_uniform_double(double, double) nounwind readonly alwaysinline { + %d = fcmp olt double %0, %1 + %r = select i1 %d, double %0, double %1 
+ ret double %r +} + +;; min/max uniform + + +define(`minmax_vy',` +define <1 x $2> @__$1_varying_$3(<1 x $2>, <1 x $2>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x $2> %0, i32 0 + %v1 = extractelement <1 x $2> %1, i32 0 + %r = call $2 @__$1_uniform_$3($2 %v0, $2 %v1) + %ret = insertelement <1 x $2> undef, $2 %r, i32 0 + ret <1 x $2> %ret; +} +') +minmax_vy(min, i32, int32) +minmax_vy(max, i32, int32) +minmax_vy(min, i32, uint32) +minmax_vy(max, i32, uint32) +minmax_vy(min, float, float) +minmax_vy(max, float, float) +minmax_vy(min, double, double) +minmax_vy(max, double, double) + +;; sqrt/rsqrt/rcp + +define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { + %ret = fdiv float 1.,%0 + ret float %ret +} +declare double @__nv_drcp_rn(double) +define double @__rcp_uniform_double(double) nounwind readonly alwaysinline +{ + %ret = call double @__nv_drcp_rn(double %0) + ret double %ret +} +declare float @__nv_sqrtf(float) +define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline +{ + %ret = call float @__nv_sqrtf(float %0) + ret float %ret +} +declare double @__nv_sqrt(double) +define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline { + %ret = call double @__nv_sqrt(double %0) + ret double %ret +} +declare float @__nv_rsqrtf(float) +define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline +{ + %ret = call float @__nv_rsqrtf(float %0) + ret float %ret +} +declare double @__nv_rsqrt(double) +define double @__rsqrt_uniform_double(double) nounwind readonly alwaysinline +{ + %ret = call double @__nv_rsqrt(double %0) + ret double %ret +} + +;;;;;; varying +define @__rcp_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__rcp_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} +define @__rcp_varying_double() nounwind readnone alwaysinline +{ + %v = extractelement <1 x double> %0, i32 0 + %r = call double @__rcp_uniform_double(double %v) + %rv = insertelement <1 x double> undef, double %r, i32 0 + ret %rv +} +define @__rsqrt_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__rsqrt_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} +define @__rsqrt_varying_double() nounwind readnone alwaysinline +{ + %v = extractelement <1 x double> %0, i32 0 + %r = call double @__rsqrt_uniform_double(double %v) + %rv = insertelement <1 x double> undef, double %r, i32 0 + ret %rv +} +define @__sqrt_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__sqrt_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} +define @__sqrt_varying_double() nounwind readnone alwaysinline +{ + %v = extractelement <1 x double> %0, i32 0 + %r = call double @__sqrt_uniform_double(double %v) + %rv = insertelement <1 x double> undef, double %r, i32 0 + ret %rv +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; population count + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { + %call = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %call +;; %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0) + ;; ret i32 %res +} + +declare i64 @llvm.ctpop.i64(i64) nounwind readnone +define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline 
{ + %call = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; binary prefix sum + +define internal i64 @__warpBinExclusiveScan(i1 %p) nounwind readonly alwaysinline +{ +entry: + %call = call i32 @__ballot_nvptx(i1 zeroext %p) + %call1 = call i32 @__popcnt_int32(i32 %call) + %call2 = call i32 @__lanemask_lt_nvptx() + %and = and i32 %call2, %call + %call3 = call i32 @__popcnt_int32(i32 %and) + %retval.sroa.1.4.insert.ext.i = zext i32 %call3 to i64 + %retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %call1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i + ret i64 %retval.sroa.0.0.insert.insert.i +} + +ctlztz() + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... + +;; svml is not support in PTX, will generate linking error + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %v64 = zext i1 %v to i64 + ret i64 %v64 +} +define i64 @__movmsk_ptx(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %v0 = call i32 @__ballot_nvptx(i1 %v) + %v64 = zext i32 %v0 to i64 + ret i64 %v64 +} + +define i1 @__any(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %res = call i32 @__ballot_nvptx(i1 %v) + %cmp = icmp ne i32 %res, 0 + ret i1 %cmp +} + +define i1 @__all(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %res0 = call i32 @__ballot_nvptx(i1 %v) + %cmp = icmp eq i32 %res0, -1 + ret i1 %cmp +} + +define i1 @__none(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %res = call i32 @__ballot_nvptx(i1 %v) + %cmp = icmp eq i32 %res, 0 + ret i1 %cmp +} + +;;;;;;;;; reductions i8 +define i16 @__reduce_add_int8(<1 x i8> %v) nounwind readnone alwaysinline { + %value8 = extractelement <1 x i8> %v, i32 0 + %value = zext i8 %value8 to i16 + %call = tail call i16 @__shfl_xor_i16_nvptx(i16 %value, i32 16) + %call1 = add i16 %call, %value + %call.1 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1, i32 8) + %call1.1 = add i16 %call1, %call.1 + %call.2 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.1, i32 4) + %call1.2 = add i16 %call1.1, %call.2 + %call.3 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.2, i32 2) + %call1.3 = add i16 %call1.2, %call.3 + %call.4 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.3, i32 1) + %call1.4 = add i16 %call1.3, %call.4 + ret i16 %call1.4 +} +;;;;;;;;; reductions i16 +define i32 @__reduce_add_int16(<1 x i16> %v) nounwind readnone alwaysinline { + %value16 = extractelement <1 x i16> %v, i32 0 + %value = zext i16 %value16 to i32 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = add i32 %call, %value + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = add i32 %call1, %call.1 + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = add i32 %call1.1, %call.2 + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = add i32 %call1.2, %call.3 + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 
%call1.3, i32 1) + %call1.4 = add i32 %call1.3, %call.4 + ret i32 %call1.4 +} + +;;;;;;;;; reductions float +define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { + %value = extractelement <1 x float> %v, i32 0 + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = fadd float %call, %value + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = fadd float %call1, %call.1 + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = fadd float %call1.1, %call.2 + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = fadd float %call1.2, %call.3 + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = fadd float %call1.3, %call.4 + ret float %call1.4 +} +define float @__reduce_min_float(<1 x float>) nounwind readnone alwaysinline { + %value = extractelement <1 x float> %0, i32 0 + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = tail call float @__fminf_nvptx(float %value, float %call) + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = tail call float @__fminf_nvptx(float %call1, float %call.1) + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = tail call float @__fminf_nvptx(float %call1.1, float %call.2) + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = tail call float @__fminf_nvptx(float %call1.2, float %call.3) + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = tail call float @__fminf_nvptx(float %call1.3, float %call.4) + ret float %call1.4 +} +define float @__reduce_max_float(<1 x float>) nounwind readnone alwaysinline { + %value = extractelement <1 x float> %0, i32 0 + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = tail call float @__fmaxf_nvptx(float %value, float %call) + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = tail call float @__fmaxf_nvptx(float %call1, float %call.1) + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = tail call float @__fmaxf_nvptx(float %call1.1, float %call.2) + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = tail call float @__fmaxf_nvptx(float %call1.2, float %call.3) + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = tail call float @__fmaxf_nvptx(float %call1.3, float %call.4) + ret float %call1.4 +} + +;;;;;;;;; reductions int32 +define i32 @__reduce_add_int32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = add i32 %call, %value + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 =add i32 %call1, %call.1 + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = add i32 %call1.1, %call.2 + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = add i32 %call1.2, %call.3 + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = add i32 %call1.3, %call.4 + ret i32 %call1.4 +} +define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__min_i32_signed(i32 
%value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__min_i32_signed(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__min_i32_signed(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__min_i32_signed(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__min_i32_signed(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 +} +define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__max_i32_signed(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__max_i32_signed(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__max_i32_signed(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__max_i32_signed(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__max_i32_signed(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 +} + +;;;;;;;;; reductions uint32 +define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__min_i32_unsigned(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__min_i32_unsigned(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__min_i32_unsigned(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__min_i32_unsigned(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__min_i32_unsigned(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 +} +define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__max_i32_unsigned(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__max_i32_unsigned(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__max_i32_unsigned(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__max_i32_unsigned(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__max_i32_unsigned(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 + } + +;;;;;;;;; reductions double +define double @__reduce_add_double(<1 x double>) nounwind readnone alwaysinline { + %value = extractelement <1 x double> %0, i32 0 + %call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16) + %call1 = fadd double %call, %value + %call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8) + %call1.1 = fadd double %call1, %call.1 
+ %call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4) + %call1.2 = fadd double %call1.1, %call.2 + %call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2) + %call1.3 = fadd double %call1.2, %call.3 + %call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1) + %call1.4 = fadd double %call1.3, %call.4 + ret double %call1.4 +} +define double @__reduce_min_double(<1 x double>) nounwind readnone alwaysinline { + %value = extractelement <1 x double> %0, i32 0 + %call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16) + %call1 = tail call double @__min_double(double %value, double %call) + %call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8) + %call1.1 = tail call double @__min_double(double %call1, double %call.1) + %call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4) + %call1.2 = tail call double @__min_double(double %call1.1, double %call.2) + %call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2) + %call1.3 = tail call double @__min_double(double %call1.2, double %call.3) + %call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1) + %call1.4 = tail call double @__min_double(double %call1.3, double %call.4) + ret double %call1.4 +} +define double @__reduce_max_double(<1 x double>) nounwind readnone alwaysinline { + %value = extractelement <1 x double> %0, i32 0 + %call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16) + %call1 = tail call double @__max_double(double %value, double %call) + %call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8) + %call1.1 = tail call double @__max_double(double %call1, double %call.1) + %call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4) + %call1.2 = tail call double @__max_double(double %call1.1, double %call.2) + %call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2) + %call1.3 = tail call double @__max_double(double %call1.2, double %call.3) + %call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1) + %call1.4 = tail call double @__max_double(double %call1.3, double %call.4) + ret double %call1.4 +} + + +;;;;;;;;; reductions int64 +define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = add i64 %call, %value + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 =add i64 %call1, %call.1 + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = add i64 %call1.1, %call.2 + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = add i64 %call1.2, %call.3 + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = add i64 %call1.3, %call.4 + ret i64 %call1.4 +} +define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__min_i64_signed(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__min_i64_signed(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__min_i64_signed(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call 
i64 @__min_i64_signed(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__min_i64_signed(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} +define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__max_i64_signed(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__max_i64_signed(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__max_i64_signed(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__max_i64_signed(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__max_i64_signed(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} +define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__min_i64_unsigned(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__min_i64_unsigned(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__min_i64_unsigned(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__min_i64_unsigned(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__min_i64_unsigned(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} +define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__max_i64_unsigned(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__max_i64_unsigned(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__max_i64_unsigned(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__max_i64_unsigned(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__max_i64_unsigned(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} + +;;;; reduce equal, must be tested and may fail if data has -1 +define internal i32 @__shfl_reduce_and_step_i32_nvptx(i32, i32) nounwind readnone alwaysinline +{ + %shfl = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.bfly.b32 r0|p, $1, $2, 0; + @p and.b32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %0, i32 %1, i32 %0) + ret i32 %shfl +} +shfl64(__shfl_reduce_and_step, i64) + +define internal i32 @__reduce_and_i32(i32 %v0, i1 %mask) nounwind readnone alwaysinline +{ + %v = select i1 %mask, i32 %v0, i32 -1 + %s1 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %v, i32 16); + %s2 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s1, i32 8); + %s3 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s2, i32 4); + %s4 = tail call i32 
@__shfl_reduce_and_step_i32_nvptx(i32 %s3, i32 2); + %s5 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s4, i32 1); + ret i32 %s5 +} +define internal i64 @__reduce_and_i64(i64, i1) nounwind readnone alwaysinline +{ + %v = bitcast i64 %0 to <2 x i32> + %v0 = extractelement <2 x i32> %v, i32 0 + %v1 = extractelement <2 x i32> %v, i32 1 + %s0 = call i32 @__reduce_and_i32(i32 %v0, i1 %1) + %s1 = call i32 @__reduce_and_i32(i32 %v1, i1 %1) + %tmp = insertelement <2 x i32> undef, i32 %s0, i32 0 + %res = insertelement <2 x i32> %tmp, i32 %s1, i32 1 + %ret = bitcast <2 x i32> %res to i64 + ret i64 %ret; +} + +define(`reduce_equal',` +define i1 @__reduce_equal_$2(<1 x $1> %v0, $1 * %samevalue, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %vv = bitcast <1 x $1> %v0 to <1 x $3> + %sv = extractelement <1 x $3> %vv, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + + %s = call $3 @__reduce_and_$3($3 %sv, i1 %mask); + + ;; find last active lane + %nact = call i32 @__ballot_nvptx(i1 %mask) + %lane1 = call i32 @__count_leading_zeros_i32(i32 %nact) + %lane = sub i32 31, %lane1 + + ;; broadcast result from this lane + %r = tail call $3 @__shfl_$3_nvptx($3 %s, i32 %lane) + + ;; compare result to the original value + %c0 = icmp eq $3 %r, %sv + %c1 = and i1 %c0, %mask + %neq = call i32 @__ballot_nvptx(i1 %c1) + %cmp = icmp eq i32 %neq, %nact + + br i1 %cmp, label %all_equal, label %all_not_equal + +all_equal: + %vstore = bitcast $3 %r to $1 + store $1 %vstore, $1* %samevalue; + ret i1 true + +all_not_equal: + ret i1 false + +} +') +reduce_equal(i32, int32, i32); +reduce_equal(i64, int64, i64); +reduce_equal(float, float, i32); +reduce_equal(double, double, i64); + +;;;;;;;;;;; shuffle +define(`shuffle1', ` +define <1 x $1> @__shuffle_$1(<1 x $1>, <1 x i32>) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %lane = extractelement <1 x i32> %1, i32 0 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shuffle1(i8) +shuffle1(i16) +shuffle1(i32) +shuffle1(i64) +shuffle1(float) +shuffle1(double) + +define(`shuffle2',` +define <1 x $1> @__shuffle2_$1(<1 x $1>, <1 x $1>, <1 x i32>) nounwind readnone alwaysinline +{ + %val1 = extractelement <1 x $1> %0, i32 0 + %val2 = extractelement <1 x $1> %1, i32 0 + + ;; fetch both values + %lane = extractelement <1 x i32> %2, i32 0 + %lane_mask = and i32 %lane, 31 + %ret1 = tail call $1 @__shfl_$1_nvptx($1 %val1, i32 %lane_mask); + %ret2 = tail call $1 @__shfl_$1_nvptx($1 %val2, i32 %lane_mask); + + ;; select the correct one + %c = icmp slt i32 %lane, 32 + %rets = select i1 %c, $1 %ret1, $1 %ret2 + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shuffle2(i8) +shuffle2(i16) +shuffle2(i32) +shuffle2(i64) +shuffle2(float) +shuffle2(double) + +define(`shift',` +define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %lane = call i32 @__program_index() + %src = add i32 %lane, %1 + %ret = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %src) + %c1 = icmp sge i32 %src, 0 + %c2 = icmp slt i32 %src, 32 + %c = and i1 %c1, %c2 + %rets = select i1 %c, $1 %ret, $1 zeroinitializer + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shift(i8) +shift(i16) +shift(i32) +shift(i64) +shift(float) +shift(double) + +define(`rotate', ` +define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline +{ + %val = 
extractelement <1 x $1> %0, i32 0 + %tid = call i32 @__program_index() + %src = add i32 %tid, %1 + %lane = and i32 %src, 31 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +rotate(i8) +rotate(i16) +rotate(i32) +rotate(i64) +rotate(float) +rotate(double) + +define(`broadcast', ` +define <1 x $1> @__broadcast_$1(<1 x $1>, i32) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %1) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +broadcast(i8) +broadcast(i16) +broadcast(i32) +broadcast(i64) +broadcast(float) +broadcast(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefix sum stuff + +define internal i32 @__shfl_scan_add_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, $2, 0; + @p add.u32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) + ret i32 %result; +} +define <1 x i32> @__exclusive_scan_add_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i32> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, i32 %v0, i32 0 + + %s1 = tail call i32 @__shfl_scan_add_step_i32(i32 %v, i32 1); + %s2 = tail call i32 @__shfl_scan_add_step_i32(i32 %s1, i32 2); + %s3 = tail call i32 @__shfl_scan_add_step_i32(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_scan_add_step_i32(i32 %s3, i32 8); + %s5 = tail call i32 @__shfl_scan_add_step_i32(i32 %s4, i32 16); + %rets = sub i32 %s5, %v + %retv = insertelement <1 x i32> undef, i32 %rets, i32 0 + ret <1 x i32> %retv +} +;; +define internal i32 @__shfl_scan_or_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, $2, 0; + @p or.b32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) + ret i32 %result; +} +define <1 x i32> @__exclusive_scan_or_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i32> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v1 = select i1 %mask, i32 %v0, i32 0 + + ;; shfl-up by one for exclusive scan + %v = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, 1, 0; + @!p mov.u32 r0, 0; + mov.u32 $0, r0; + }","=r,r"(i32 %v1) + + %s1 = tail call i32 @__shfl_scan_or_step_i32(i32 %v, i32 1); + %s2 = tail call i32 @__shfl_scan_or_step_i32(i32 %s1, i32 2); + %s3 = tail call i32 @__shfl_scan_or_step_i32(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_scan_or_step_i32(i32 %s3, i32 8); + %s5 = tail call i32 @__shfl_scan_or_step_i32(i32 %s4, i32 16); + %retv = insertelement <1 x i32> undef, i32 %s5, i32 0 + ret <1 x i32> %retv +} +;; +define internal i32 @__shfl_scan_and_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = call i32 asm + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, $2, 0; + @p and.b32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) + ret i32 %result; +} +define <1 x i32> @__exclusive_scan_and_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i32> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v1 = 
select i1 %mask, i32 %v0, i32 -1 + + ;; shfl-up by one for exclusive scan + %v = call i32 asm + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, 1, 0; + @!p mov.u32 r0, -1; + mov.u32 $0, r0; + }","=r,r"(i32 %v1) + + %s1 = call i32 @__shfl_scan_and_step_i32(i32 %v, i32 1); + %s2 = call i32 @__shfl_scan_and_step_i32(i32 %s1, i32 2); + %s3 = call i32 @__shfl_scan_and_step_i32(i32 %s2, i32 4); + %s4 = call i32 @__shfl_scan_and_step_i32(i32 %s3, i32 8); + %s5 = call i32 @__shfl_scan_and_step_i32(i32 %s4, i32 16); + %retv = insertelement <1 x i32> undef, i32 %s5, i32 0 + ret <1 x i32> %retv +} + +define internal float @__shfl_scan_add_step_float(float %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call float asm sideeffect + "{.reg .f32 f0; + .reg .pred p; + shfl.up.b32 f0|p, $1, $2, 0; + @p add.f32 f0, f0, $3; + mov.f32 $0, f0; + }", "=f,f,r,f"(float %partial, i32 %up_offset, float %partial) + ret float %result; +} +define <1 x float> @__exclusive_scan_add_float(<1 x float>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x float> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, float %v0, float zeroinitializer + + %s1 = tail call float @__shfl_scan_add_step_float(float %v, i32 1); + %s2 = tail call float @__shfl_scan_add_step_float(float %s1, i32 2); + %s3 = tail call float @__shfl_scan_add_step_float(float %s2, i32 4); + %s4 = tail call float @__shfl_scan_add_step_float(float %s3, i32 8); + %s5 = tail call float @__shfl_scan_add_step_float(float %s4, i32 16); + %rets = fsub float %s5, %v + %retv = insertelement <1 x float> undef, float %rets, i32 0 + ret <1 x float> %retv +} +define internal double @__shfl_scan_add_step_double(double %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call double asm sideeffect + "{.reg .s32 r<10>; + .reg .f64 fd0; + .reg .pred p; + .reg .b32 temp; + mov.b64 {r1,temp}, $1; + mov.b64 {temp,r2}, $1; + shfl.up.b32 r3, r1, $2, 0; + shfl.up.b32 r4|p, r2, $2, 0; + mov.b64 fd0, {r3,r4}; + @p add.f64 fd0, fd0, $3; + mov.f64 $0, fd0; + }", "=d,d,r,d"(double %partial, i32 %up_offset, double %partial) + ret double %result; +} +define <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x double> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, double %v0, double zeroinitializer + + %s1 = tail call double @__shfl_scan_add_step_double(double %v, i32 1); + %s2 = tail call double @__shfl_scan_add_step_double(double %s1, i32 2); + %s3 = tail call double @__shfl_scan_add_step_double(double %s2, i32 4); + %s4 = tail call double @__shfl_scan_add_step_double(double %s3, i32 8); + %s5 = tail call double @__shfl_scan_add_step_double(double %s4, i32 16); + %rets = fsub double %s5, %v + %retv = bitcast double %rets to <1 x double> + ret <1 x double> %retv +} + +define internal i64 @__shfl_scan_add_step_i64(i64 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i64 asm sideeffect + "{.reg .s32 r<10>; + .reg .s64 rl0; + .reg .pred p; + .reg .b32 temp; + mov.b64 {r1,temp}, $1; + mov.b64 {temp,r2}, $1; + shfl.up.b32 r3, r1, $2, 0; + shfl.up.b32 r4|p, r2, $2, 0; + mov.b64 rl0, {r3,r4}; + @p add.s64 rl0, rl0, $3; + mov.s64 $0, rl0; + }", "=l,l,r,l"(i64 %partial, i32 %up_offset, i64 %partial) + ret i64 %result; +} +define <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i64> 
%0, i32 0
+  %mask = extractelement <1 x i1> %1, i32 0
+  %v = select i1 %mask, i64 %v0, i64 0
+
+  %s1 = tail call i64 @__shfl_scan_add_step_i64(i64 %v, i32 1);
+  %s2 = tail call i64 @__shfl_scan_add_step_i64(i64 %s1, i32 2);
+  %s3 = tail call i64 @__shfl_scan_add_step_i64(i64 %s2, i32 4);
+  %s4 = tail call i64 @__shfl_scan_add_step_i64(i64 %s3, i32 8);
+  %s5 = tail call i64 @__shfl_scan_add_step_i64(i64 %s4, i32 16);
+  %rets = sub i64 %s5, %v
+  %retv = bitcast i64 %rets to <1 x i64>
+  ret <1 x i64> %retv
+}
+
+define(`exclusive_scan_i64',`
+define <1 x i64> @__exclusive_scan_$1_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %v = bitcast <1 x i64> %0 to <2 x i32>
+  %v0 = extractelement <2 x i32> %v, i32 0
+  %v1 = extractelement <2 x i32> %v, i32 1
+  %inp0 = bitcast i32 %v0 to <1 x i32>
+  %inp1 = bitcast i32 %v1 to <1 x i32>
+  %res0 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp0, <1 x i1> %1);
+  %res1 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp1, <1 x i1> %1);
+  %r0 = bitcast <1 x i32> %res0 to i32
+  %r1 = bitcast <1 x i32> %res1 to i32
+  %ret0 = insertelement <2 x i32> undef, i32 %r0, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %r1, i32 1
+  %ret = bitcast <2 x i32> %ret1 to <1 x i64>
+  ret <1 x i64> %ret
+}
+')
+exclusive_scan_i64(or)
+exclusive_scan_i64(and)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+
+masked_load(i8, 1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(float)
+gen_masked_store(i64)
+gen_masked_store(double)
+
+define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
+                                     <WIDTH x MASK>) nounwind alwaysinline {
+  %v = load <WIDTH x i8> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
+  store <WIDTH x i8> %v1, <WIDTH x i8> * %0
+  ret void
+}
+
+define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
+                                      <WIDTH x MASK>) nounwind alwaysinline {
+  %v = load <WIDTH x i16> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
+  store <WIDTH x i16> %v1, <WIDTH x i16> * %0
+  ret void
+}
+
+define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
+                                      <WIDTH x MASK>) nounwind alwaysinline {
+  %v = load <WIDTH x i32> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
+  store <WIDTH x i32> %v1, <WIDTH x i32> * %0
+  ret void
+}
+
+define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
+                                        <WIDTH x MASK>) nounwind alwaysinline {
+  %v = load <WIDTH x float> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x float> %1, <WIDTH x float> %v
+  store <WIDTH x float> %v1, <WIDTH x float> * %0
+  ret void
+}
+
+define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
+                                      <WIDTH x i64>, <WIDTH x MASK>) nounwind alwaysinline {
+  %v = load <WIDTH x i64> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
+  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
+  ret void
+}
+
+define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
+                                         <WIDTH x double>, <WIDTH x MASK>) nounwind alwaysinline {
+  %v = load <WIDTH x double> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x double> %1, <WIDTH x double> %v
+  store <WIDTH x double> %v1, <WIDTH x double> * %0
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; define these with the macros from stdlib.m4
+
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
+
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefetch
+define_prefetches()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int8/int16 builtins
+
+define_avgs()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; vector ops
+
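+;; For illustration only: instantiated with $1 = i32 and $2 = int32, the
+;; extract_insert macro below expands to roughly the following (a sketch,
+;; relying on the __shfl_i32_nvptx warp-shuffle helper used throughout this
+;; file):
+;;
+;;   define i32 @__extract_int32(<1 x i32>, i32) nounwind readnone alwaysinline {
+;;     ;; read this program instance's slot, then fetch the value held by lane %1
+;;     %val = extractelement <1 x i32> %0, i32 0
+;;     %extract = tail call i32 @__shfl_i32_nvptx(i32 %val, i32 %1)
+;;     ret i32 %extract
+;;   }
+;;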
+define(`extract_insert',`
+define $1 @__extract_$2(<1 x $1>, i32) nounwind readnone alwaysinline {
+  %val = extractelement <1 x $1> %0, i32 0
+  %extract = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %1)
+  ret $1 %extract
+}
+
+define <1 x $1> @__insert_$2(<1 x $1>, i32,
+                             $1) nounwind readnone alwaysinline {
+  %orig = extractelement <1 x $1> %0, i32 0
+  %lane = call i32 @__program_index()
+  %c = icmp eq i32 %lane, %1
+  %val = select i1 %c, $1 %2, $1 %orig
+  %insert = insertelement <1 x $1> %0, $1 %val, i32 0
+  ret <1 x $1> %insert
+}
+')
+
+extract_insert(i8, int8)
+extract_insert(i16, int16)
+extract_insert(i32, int32)
+extract_insert(i64, int64)
+extract_insert(float, float)
+extract_insert(double, double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; assert
+
+declare void @__assertfail(i64,i64,i32,i64,i64) noreturn;
+declare i32 @vprintf(i64,i64)
+define i32 @__puts_nvptx(i8*) alwaysinline
+{
+  %str = ptrtoint i8* %0 to i64
+  %parm = or i64 0, 0
+  %call = call i32 @vprintf(i64 %str, i64 %parm)
+;;  %cr = alloca <3 x i8>
+;;  store <3 x i8> , <3 x i8>* %cr
+;;  %cr1 = ptrtoint <3 x i8>* %cr to i64
+;;  %call1 = call i32 @vprintf(i64 %cr1, i64 %parm)
+  ret i32 %call;
+}
+define internal void @__abort_nvptx(i8* %str) noreturn
+{
+  %tmp1 = alloca <3 x i8>
+  store <3 x i8> , <3 x i8>* %tmp1
+  %tmp2 = alloca <2 x i8>
+  store <2 x i8> , <2 x i8>* %tmp2
+
+  %param1 = ptrtoint <2 x i8>* %tmp2 to i64
+  %param3 = or i32 0, 0
+  %string = ptrtoint i8* %str to i64
+  %param4 = ptrtoint <3 x i8>* %tmp1 to i64
+  %param5 = or i64 1, 1
+  call void @__assertfail(i64 %param1, i64 %string, i32 %param3, i64 %param4, i64 %param5);
+  ret void
+}
+
+define void @__do_assert_uniform(i8 *%str, i1 %test, <WIDTH x MASK> %mask) {
+  br i1 %test, label %ok, label %fail
+
+fail:
+  %lane = call i32 @__program_index()
+  %cmp = icmp eq i32 %lane, 0
+  br i1 %cmp, label %fail_print, label %fail_void;
+
+fail_print:
+  call void @__abort_nvptx(i8* %str) noreturn
+  unreachable
+
+fail_void:
+  unreachable
+
+ok:
+  ret void
+}
+
+
+define void @__do_assert_varying(i8 *%str, <WIDTH x MASK> %test,
+                                 <WIDTH x MASK> %mask) {
+  %nottest = xor <WIDTH x MASK> %test,
+              < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 >
+  %nottest_and_mask = and <WIDTH x MASK> %nottest, %mask
+  %mm = call i64 @__movmsk(<WIDTH x MASK> %nottest_and_mask)
+  %all_ok = icmp eq i64 %mm, 0
+  br i1 %all_ok, label %ok, label %fail
+
+fail:
+  call void @__abort_nvptx(i8* %str) noreturn
+  unreachable
+
+ok:
+  ret void
+}
+
+define i64 @__clock() nounwind alwaysinline {
+  %r = call i64 asm sideeffect "mov.b64 $0, %clock64;", "=l"();
+  ret i64 %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; atomics and memory barriers
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_associative
+;; More efficient implementation for atomics that are associative (e.g.,
+;; add, and, ...).  If a basic implementation would do something like:
+;;  result0 = atomic_op(ptr, val0)
+;;  result1 = atomic_op(ptr, val1)
+;;  ..
+;; Then instead we can do:
+;;  tmp = (val0 op val1 op ...)
+;;  result0 = atomic_op(ptr, tmp)
+;;  result1 = (result0 op val0)
+;;  ..
+;; And more efficiently compute the same result
+;;
+;; Takes five parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;;     (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
+;; $5: identity value for the operator (e.g.
0 for add, -1 for AND, ...) +;; add +define <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} +;; sub +define <1 x i32> @__atomic_sub_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %nvalv = sub <1 x i32> , %valv + %ret = call <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %nvalv, <1 x i1> %maskv); + ret <1 x i32> %ret; +} +;; and +define <1 x i32> @__atomic_and_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} +;; or +define <1 x i32> @__atomic_or_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} +;; xor +define <1 x i32> @__atomic_xor_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} + +;;;;;;;;; int64 +define <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} +define <1 x i64> @__atomic_sub_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %nvalv = sub <1 x i64> , %valv + %ret = call <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %nvalv, <1 x i1> %maskv); + ret <1 x i64> %ret; +} + +;; and +define <1 x i64> @__atomic_and_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %andr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %andr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} + +;; or +define <1 x i64> @__atomic_or_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> 
%maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %orr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %orr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} + +;; xor +define <1 x i64> @__atomic_xor_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %xorr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %xorr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; global_atomic_uniform +;; Defines the implementation of a function that handles the mapping from +;; an ispc atomic function to the underlying LLVM intrinsics. This variant +;; just calls the atomic once, for the given uniform value +;; +;; Takes four parameters: +;; $1: vector width of the target +;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names) +;; (add, sub...) +;; $3: return type of the LLVM atomic (e.g. i32) +;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) + +define internal i32 @__get_first_active_lane() +{ + %nact = call i32 @__ballot_nvptx(i1 true); + %lane1 = call i32 @__count_leading_zeros_i32(i32 %nact) + %lane = sub i32 31, %lane1 + ret i32 %lane +} + +define internal i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_sub_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %nval = sub i32 0, %val; + %old = tail call i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %nval); + ret i32 %old; +} +define internal i32 @__atomic_and_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_or_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_xor_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_min_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.min.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_max_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.max.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 
@__atomic_umin_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.min.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_umax_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.max.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} + + +define internal i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_sub_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %nval = sub i64 0, %val; + %old = tail call i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %nval); + ret i64 %old; +} +define internal i64 @__atomic_and_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_or_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_xor_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_min_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.min.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_max_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.max.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_umin_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.min.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_umax_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.max.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} + +define(`global_atomic',` +define <1 x $3> @__atomic_$2_$4_global($3* %ptr, <1 x $3> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x $3> %valv to $3 + br i1 %mask, label %exec, label %pass +exec: + %old = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val); + %oldv = bitcast $3 %old to <1 x $3> + ret <1 x $3> %oldv +pass: + ret <1 x $3> %valv +} +') +define(`global_atomic_uniform',` +define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline +{ +entry: + %addr = ptrtoint $3 * %ptr to i64 + %active = call i32 @__get_first_active_lane(); + %lane = call i32 @__program_index(); + %c = icmp eq i32 
%lane, %active + br i1 %c, label %p1, label %p2 + +p1: + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val); + br label %p2; + +p2: + %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry] + %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active) + ret $3 %old; +} +') +define(`global_atomic_varying',` +define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %addr = bitcast <1 x i64> %ptr to i64 + %c = bitcast <1 x i1> %maskv to i1 + br i1 %c, label %p1, label %p2 + +p1: + %sv = bitcast <1 x $3> %val to $3 + %sptr = inttoptr i64 %addr to $3* + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sv); + %t0v = bitcast $3 %t0 to <1 x $3> + ret < 1x $3> %t0v + +p2: + ret <1 x $3> %val +} +') + + +global_atomic_uniform(1, add, i32, int32) +global_atomic_uniform(1, sub, i32, int32) +global_atomic_uniform(1, and, i32, int32) +global_atomic_uniform(1, or, i32, int32) +global_atomic_uniform(1, xor, i32, int32) +global_atomic_uniform(1, min, i32, int32) +global_atomic_uniform(1, max, i32, int32) +global_atomic_uniform(1, umin, i32, uint32) +global_atomic_uniform(1, umax, i32, uint32) + +global_atomic_uniform(1, add, i64, int64) +global_atomic_uniform(1, sub, i64, int64) +global_atomic_uniform(1, and, i64, int64) +global_atomic_uniform(1, or, i64, int64) +global_atomic_uniform(1, xor, i64, int64) +global_atomic_uniform(1, min, i64, int64) +global_atomic_uniform(1, max, i64, int64) +global_atomic_uniform(1, umin, i64, uint64) +global_atomic_uniform(1, umax, i64, uint64) + +global_atomic_varying(1, add, i32, int32) +global_atomic_varying(1, sub, i32, int32) +global_atomic_varying(1, and, i32, int32) +global_atomic_varying(1, or, i32, int32) +global_atomic_varying(1, xor, i32, int32) +global_atomic_varying(1, min, i32, int32) +global_atomic_varying(1, max, i32, int32) +global_atomic_varying(1, umin, i32, uint32) +global_atomic_varying(1, umax, i32, uint32) + +global_atomic_varying(1, add, i64, int64) +global_atomic_varying(1, sub, i64, int64) +global_atomic_varying(1, and, i64, int64) +global_atomic_varying(1, or, i64, int64) +global_atomic_varying(1, xor, i64, int64) +global_atomic_varying(1, min, i64, int64) +global_atomic_varying(1, max, i64, int64) +global_atomic_varying(1, umin, i64, uint64) +global_atomic_varying(1, umax, i64, uint64) + +;; Macro to declare the function that implements the swap atomic. +;; Takes three parameters: +;; $1: vector width of the target +;; $2: llvm type of the vector elements (e.g. i32) +;; $3: ispc type of the elements (e.g. 
int32) + +define internal i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.exch.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i64 @__atomic_swap_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.exch.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal float @__atomic_swap_uniform_float_global_nvptx(float* %ptr, float %val) nounwind alwaysinline +{ + %ptrI = bitcast float* %ptr to i32* + %valI = bitcast float %val to i32 + %retI = call i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %ptrI, i32 %valI) + %ret = bitcast i32 %retI to float + ret float %ret +} +define internal double @__atomic_swap_uniform_double_global_nvptx(double* %ptr, double %val) nounwind alwaysinline +{ + %ptrI = bitcast double* %ptr to i64* + %valI = bitcast double %val to i64 + %retI = call i64 @__atomic_swap_uniform_int64_global_nvptx(i64* %ptrI, i64 %valI) + %ret = bitcast i64 %retI to double + ret double %ret +} +global_atomic_uniform(1, swap, i32, int32) +global_atomic_uniform(1, swap, i64, int64) +global_atomic_uniform(1, swap, float, float) +global_atomic_uniform(1, swap, double, double) +global_atomic_varying(1, swap, i32, int32) +global_atomic_varying(1, swap, i64, int64) +global_atomic_varying(1, swap, float, float) +global_atomic_varying(1, swap, double, double) + + +;; Similarly, macro to declare the function that implements the compare/exchange +;; atomic. Takes three parameters: +;; $1: vector width of the target +;; $2: llvm type of the vector elements (e.g. i32) +;; $3: ispc type of the elements (e.g. 
int32) + +define internal i32 @__atomic_compare_exchange_uniform_int32_global_nvptx(i32* %ptr, i32 %cmp, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.cas.b32 $0, [$1], $2, $3;", "=r,l,r,r"(i64 %addr, i32 %cmp, i32 %val); + ret i32 %old; +} +define internal i64 @__atomic_compare_exchange_uniform_int64_global_nvptx(i64* %ptr, i64 %cmp, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.cas.b64 $0, [$1], $2, $3;", "=l,l,l,l"(i64 %addr, i64 %cmp, i64 %val); + ret i64 %old; +} +define internal float @__atomic_compare_exchange_uniform_float_global_nvptx(float* %ptr, float %cmp, float %val) nounwind alwaysinline +{ + %ptrI = bitcast float* %ptr to i32* + %cmpI = bitcast float %cmp to i32 + %valI = bitcast float %val to i32 + %retI = call i32 @__atomic_compare_exchange_uniform_int32_global_nvptx(i32* %ptrI, i32 %cmpI, i32 %valI) + %ret = bitcast i32 %retI to float + ret float %ret +} +define internal double @__atomic_compare_exchange_uniform_double_global_nvptx(double* %ptr, double %cmp, double %val) nounwind alwaysinline +{ + %ptrI = bitcast double* %ptr to i64* + %cmpI = bitcast double %cmp to i64 + %valI = bitcast double %val to i64 + %retI = call i64 @__atomic_compare_exchange_uniform_int64_global_nvptx(i64* %ptrI, i64 %cmpI, i64 %valI) + %ret = bitcast i64 %retI to double + ret double %ret +} + +;;;;;;;;;;;; +define(`global_atomic_cas',` +define <1 x $3> @__atomic_$2_$4_global($3* %ptr, <1 x $3> %cmpv, <1 x $3> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %cmp = bitcast <1 x $3> %cmpv to $3 + %val = bitcast <1 x $3> %valv to $3 + br i1 %mask, label %exec, label %pass +exec: + %old = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %cmp, $3 %val); + %oldv = bitcast $3 %old to <1 x $3> + ret <1 x $3> %oldv +pass: + ret <1 x $3> %valv +} +') +define(`global_atomic_cas_uniform',` +define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %cmp, $3 %val) nounwind alwaysinline +{ +entry: + %addr = ptrtoint $3 * %ptr to i64 + %active = call i32 @__get_first_active_lane(); + %lane = call i32 @__program_index(); + %c = icmp eq i32 %lane, %active + br i1 %c, label %p1, label %p2 + +p1: + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %cmp, $3 %val); + br label %p2; + +p2: + %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry] + %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active) + ret $3 %old; +} +') +define(`global_atomic_cas_varying',` +define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %cmp, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %addr = bitcast <1 x i64> %ptr to i64 + %c = bitcast <1 x i1> %maskv to i1 + br i1 %c, label %p1, label %p2 + +p1: + %sv = bitcast <1 x $3> %val to $3 + %sc = bitcast <1 x $3> %cmp to $3 + %sptr = inttoptr i64 %addr to $3* + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sc, $3 %sv); + %t0v = bitcast $3 %t0 to <1 x $3> + ret < 1x $3> %t0v + +p2: + ret <1 x $3> %val +} +') + +global_atomic_cas_uniform(1, compare_exchange, i32, int32) +global_atomic_cas_uniform(1, compare_exchange, i64, int64) +global_atomic_cas_uniform(1, compare_exchange, float, float) +global_atomic_cas_uniform(1, compare_exchange, double, double) +global_atomic_cas_varying(1, compare_exchange, i32, int32) +global_atomic_cas_varying(1, compare_exchange, i64, int64) +global_atomic_cas_varying(1, compare_exchange, float, float) 
+global_atomic_cas_varying(1, compare_exchange, double, double) +global_atomic_cas(1, compare_exchange, i32, int32) +global_atomic_cas(1, compare_exchange, i64, int64) +global_atomic_cas(1, compare_exchange, float, float) +global_atomic_cas(1, compare_exchange, double, double) + + + + +declare void @llvm.nvvm.membar.gl() +declare void @llvm.nvvm.membar.sys() +declare void @llvm.nvvm.membar.cta() + +define void @__memory_barrier() nounwind readnone alwaysinline { + ;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we + ;; only get an MFENCE on x86 if "device" is true, but IMHO we should + ;; in the case where the first 4 args are true but it is false. + ;; So we just always set that to true... + call void @llvm.nvvm.membar.gl() + ret void +} + +saturation_arithmetic_novec(); + +;;;;;;;;;;;;;;;;;;;; +;; trigonometry + + +define(`transcendetals_decl',` + declare float @__log_uniform_float(float) nounwind readnone + declare @__log_varying_float() nounwind readnone + declare float @__exp_uniform_float(float) nounwind readnone + declare @__exp_varying_float() nounwind readnone + declare float @__pow_uniform_float(float, float) nounwind readnone + declare @__pow_varying_float(, ) nounwind readnone + + declare double @__log_uniform_double(double) nounwind readnone + declare @__log_varying_double() nounwind readnone + declare double @__exp_uniform_double(double) nounwind readnone + declare @__exp_varying_double() nounwind readnone + declare double @__pow_uniform_double(double, double) nounwind readnone + declare @__pow_varying_double(, ) nounwind readnone +') + +;; 1 - function call, e.g. __nv_fast_logf +;; 2 - data-type, float/double +;; 3 - local function name, e.g. __log, __exp, .. +define(`transcendentals1',` +declare $2 @$1($2) +define $2 @$3_uniform_$2($2) nounwind readnone alwaysinline +{ + %ret = call $2 @$1($2 %0) + ret $2 %ret +} +define <1 x $2> @$3_varying_$2(<1 x $2>) nounwind readnone alwaysinline +{ + %v = bitcast <1 x $2> %0 to $2 + %r = call $2 @$3_uniform_$2($2 %v); + %ret = bitcast $2 %r to <1 x $2> + ret <1 x $2> %ret +} +') + + +define(`transcendentals2',` +declare $2 @$1($2, $2) +define $2 @$3_uniform_$2($2, $2) nounwind readnone alwaysinline +{ + %ret = call $2 @$1($2 %0, $2 %1) + ret $2 %ret +} +define <1 x $2> @$3_varying_$2(<1 x $2>, <1x $2>) nounwind readnone alwaysinline +{ + %v0 = bitcast <1 x $2> %0 to $2 + %v1 = bitcast <1 x $2> %1 to $2 + %r = call $2 @$3_uniform_$2($2 %v0, $2 %v1); + %ret = bitcast $2 %r to <1 x $2> + ret <1 x $2> %ret +} +') +transcendentals1(__nv_fast_logf, float, __log) +transcendentals1(__nv_fast_expf, float, __exp) +transcendentals2(__nv_fast_powf, float, __pow) + +transcendentals1(__nv_log, double, __log) +transcendentals1(__nv_exp, double, __exp) +transcendentals2(__nv_pow, double, __pow) + + +transcendentals1(__nv_fast_sinf, float, __sin) +transcendentals1(__nv_fast_cosf, float, __cos) +transcendentals1(__nv_fast_tanf, float, __tan) +transcendentals1(__nv_asinf, float, __asin) +transcendentals1(__nv_acosf, float, __acos) +transcendentals1(__nv_atanf, float, __atan) +transcendentals2(__nv_atan2f, float, __atan2) + +transcendentals1(__nv_sin, double, __sin) +transcendentals1(__nv_cos, double, __cos) +transcendentals1(__nv_tan, double, __tan) +transcendentals1(__nv_asin, double, __asin) +transcendentals1(__nv_acos, double, __acos) +transcendentals1(__nv_atan, double, __atan) +transcendentals2(__nv_atan2, double, __atan2) + +declare void @__sincos_uniform_float(float, float*, float*) nounwind readnone +declare void 
@__sincos_varying_float(, *, *) nounwind readnone +declare void @__sincos_uniform_double(double, double*, double*) nounwind readnone +declare void @__sincos_varying_double(, *, *) nounwind readnone + diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index 77a5c551..bf59b230 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { define_avgs() +declare_nvptx() diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 50dd0582..e1f9b2c8 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { %call = call i64 @llvm.ctpop.i64(i64 %0) ret i64 %call } + +declare_nvptx() diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4 new file mode 100644 index 00000000..764872a2 --- /dev/null +++ b/builtins/util-nvptx.m4 @@ -0,0 +1,3492 @@ +;; Copyright (c) 2010-2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +;; This file provides a variety of macros used to generate LLVM bitcode +;; parametrized in various ways. Implementations of the standard library +;; builtins for various targets can use macros from this file to simplify +;; generating code for their implementations of those builtins. + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; It is a bit of a pain to compute this in m4 for 32 and 64-wide targets... 
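+;; For example (illustration only): an 8-wide target needs ALL_ON_MASK to
+;; evaluate to (1 << 8) - 1 == 255, which eval() handles directly; the 32- and
+;; 64-wide cases are special-cased below (4294967295 and -1), presumably
+;; because the shifted value no longer fits in m4's eval() arithmetic.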
+define(`ALL_ON_MASK', +`ifelse(WIDTH, `64', `-1', + WIDTH, `32', `4294967295', + `eval((1< $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector +define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Helper macro for calling various SSE instructions for scalar values +;; but where the instruction takes a vector parameter. +;; $1 : name of variable to put the final value in +;; $2 : vector width of the target +;; $3 : scalar type of the operand +;; $4 : SSE intrinsic name +;; $5 : variable name that has the scalar value +;; For example, the following call causes the variable %ret to have +;; the result of a call to sqrtss with the scalar value in %0 +;; sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0) + +define(`sse_unary_scalar', ` + %$1_vec = insertelement <$2 x $3> undef, $3 $5, i32 0 + %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_vec) + %$1 = extractelement <$2 x $3> %$1_val, i32 0 +') + +;; Similar to `sse_unary_scalar', this helper macro is for calling binary +;; SSE instructions with scalar values, +;; $1: name of variable to put the result in +;; $2: vector width of the target +;; $3: scalar type of the operand +;; $4 : SSE intrinsic name +;; $5 : variable name that has the first scalar operand +;; $6 : variable name that has the second scalar operand + +define(`sse_binary_scalar', ` + %$1_veca = insertelement <$2 x $3> undef, $3 $5, i32 0 + %$1_vecb = insertelement <$2 x $3> undef, $3 $6, i32 0 + %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_veca, <$2 x $3> %$1_vecb) + %$1 = extractelement <$2 x $3> %$1_val, i32 0 +') + +;; Do a reduction over a 4-wide vector +;; $1: type of final scalar result +;; $2: 4-wide function that takes 2 4-wide operands and returns the +;; element-wise reduction +;; $3: scalar function that takes two scalar operands and returns +;; the final reduction + +define(`reduce4', ` + %v1 = shufflevector <4 x $1> %0, <4 x $1> undef, + <4 x i32> + %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %0) + %m1a = extractelement <4 x $1> %m1, i32 0 + %m1b = extractelement <4 x $1> %m1, i32 1 + %m = call $1 $3($1 %m1a, $1 %m1b) + ret $1 %m +' +) + +;; Similar to `reduce4', do a reduction over an 8-wide vector +;; $1: type of final scalar result +;; $2: 8-wide function that takes 2 8-wide operands and returns the +;; element-wise 
reduction +;; $3: scalar function that takes two scalar operands and returns +;; the final reduction + +define(`reduce8', ` + %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, + <8 x i32> + %m1 = call <8 x $1> $2(<8 x $1> %v1, <8 x $1> %0) + %v2 = shufflevector <8 x $1> %m1, <8 x $1> undef, + <8 x i32> + %m2 = call <8 x $1> $2(<8 x $1> %v2, <8 x $1> %m1) + %m2a = extractelement <8 x $1> %m2, i32 0 + %m2b = extractelement <8 x $1> %m2, i32 1 + %m = call $1 $3($1 %m2a, $1 %m2b) + ret $1 %m +' +) + +define(`reduce16', ` + %v1 = shufflevector <16 x $1> %0, <16 x $1> undef, + <16 x i32> + %m1 = call <16 x $1> $2(<16 x $1> %v1, <16 x $1> %0) + %v2 = shufflevector <16 x $1> %m1, <16 x $1> undef, + <16 x i32> + %m2 = call <16 x $1> $2(<16 x $1> %v2, <16 x $1> %m1) + %v3 = shufflevector <16 x $1> %m2, <16 x $1> undef, + <16 x i32> + %m3 = call <16 x $1> $2(<16 x $1> %v3, <16 x $1> %m2) + + %m3a = extractelement <16 x $1> %m3, i32 0 + %m3b = extractelement <16 x $1> %m3, i32 1 + %m = call $1 $3($1 %m3a, $1 %m3b) + ret $1 %m +' +) + +;; Do an reduction over an 8-wide vector, using a vector reduction function +;; that only takes 4-wide vectors +;; $1: type of final scalar result +;; $2: 4-wide function that takes 2 4-wide operands and returns the +;; element-wise reduction +;; $3: scalar function that takes two scalar operands and returns +;; the final reduction + +define(`reduce8by4', ` + v8tov4($1, %0, %v1, %v2) + %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) + %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, + <4 x i32> + %m2 = call <4 x $1> $2(<4 x $1> %v3, <4 x $1> %m1) + %m2a = extractelement <4 x $1> %m2, i32 0 + %m2b = extractelement <4 x $1> %m2, i32 1 + %m = call $1 $3($1 %m2a, $1 %m2b) + ret $1 %m +' +) + + +;; Apply a unary function to the 4-vector in %0, return the vector result. 
+;; $1: scalar type of result +;; $2: name of scalar function to call + +define(`unary1to4', ` + %v_0 = extractelement <4 x $1> %0, i32 0 + %r_0 = call $1 $2($1 %v_0) + %ret_0 = insertelement <4 x $1> undef, $1 %r_0, i32 0 + %v_1 = extractelement <4 x $1> %0, i32 1 + %r_1 = call $1 $2($1 %v_1) + %ret_1 = insertelement <4 x $1> %ret_0, $1 %r_1, i32 1 + %v_2 = extractelement <4 x $1> %0, i32 2 + %r_2 = call $1 $2($1 %v_2) + %ret_2 = insertelement <4 x $1> %ret_1, $1 %r_2, i32 2 + %v_3 = extractelement <4 x $1> %0, i32 3 + %r_3 = call $1 $2($1 %v_3) + %ret_3 = insertelement <4 x $1> %ret_2, $1 %r_3, i32 3 + ret <4 x $1> %ret_3 +') + +define(`unary1to8', ` + %v_0 = extractelement <8 x $1> %0, i32 0 + %r_0 = call $1 $2($1 %v_0) + %ret_0 = insertelement <8 x $1> undef, $1 %r_0, i32 0 + %v_1 = extractelement <8 x $1> %0, i32 1 + %r_1 = call $1 $2($1 %v_1) + %ret_1 = insertelement <8 x $1> %ret_0, $1 %r_1, i32 1 + %v_2 = extractelement <8 x $1> %0, i32 2 + %r_2 = call $1 $2($1 %v_2) + %ret_2 = insertelement <8 x $1> %ret_1, $1 %r_2, i32 2 + %v_3 = extractelement <8 x $1> %0, i32 3 + %r_3 = call $1 $2($1 %v_3) + %ret_3 = insertelement <8 x $1> %ret_2, $1 %r_3, i32 3 + %v_4 = extractelement <8 x $1> %0, i32 4 + %r_4 = call $1 $2($1 %v_4) + %ret_4 = insertelement <8 x $1> %ret_3, $1 %r_4, i32 4 + %v_5 = extractelement <8 x $1> %0, i32 5 + %r_5 = call $1 $2($1 %v_5) + %ret_5 = insertelement <8 x $1> %ret_4, $1 %r_5, i32 5 + %v_6 = extractelement <8 x $1> %0, i32 6 + %r_6 = call $1 $2($1 %v_6) + %ret_6 = insertelement <8 x $1> %ret_5, $1 %r_6, i32 6 + %v_7 = extractelement <8 x $1> %0, i32 7 + %r_7 = call $1 $2($1 %v_7) + %ret_7 = insertelement <8 x $1> %ret_6, $1 %r_7, i32 7 + ret <8 x $1> %ret_7 +') + +;; Given a unary function that takes a 2-wide vector and a 4-wide vector +;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide +;; vector, apply it, and return the corresponding 4-wide vector result +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide unary vector function to apply +;; $4: 4-wide operand value + +define(`unary2to4', ` + %$1_0 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> +' +) + +;; Similar to `unary2to4', this applies a 2-wide binary function to two 4-wide +;; vector operands +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide binary vector function to apply +;; $4: First 4-wide operand value +;; $5: Second 4-wide operand value + +define(`binary2to4', ` +%$1_0a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> +%$1_0b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> +%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) +%$1_1a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> +%$1_1b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> +%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) +%$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> +' +) + +;; Similar to `unary2to4', this maps a 4-wide unary function to an 8-wide +;; vector operand +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 4-wide unary vector function to apply +;; $4: 8-wide operand value + +define(`unary4to8', ` + %__$1_0 = 
shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> +' +) + +define(`unary4to16', ` + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) + + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b, + <16 x i32> +' +) + +;; And so forth... 
+;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 8-wide unary vector function to apply +;; $4: 16-wide operand value + +define(`unary8to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> + %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> + %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1) + %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, + <16 x i32> +' +) + +;; And along the lines of `binary2to4', this maps a 4-wide binary function to +;; two 8-wide vector operands +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 4-wide unary vector function to apply +;; $4: First 8-wide operand value +;; $5: Second 8-wide operand value + +define(`binary4to8', ` +%$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> +%$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> +%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) +%$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> +%$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> +%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) +%$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + <8 x i32> +' +) + +define(`binary8to16', ` +%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> +%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, + <8 x i32> +%v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b) +%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> +%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, + <8 x i32> +%v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b) +%$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, + <16 x i32> +' +) + +define(`binary4to16', ` +%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) + +%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) + +%$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_2 = call <4 x $2> $3(<4 x $2> %$1_2a, <4 x $2> %$1_2b) + +%$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_3 = call <4 x $2> $3(<4 x $2> %$1_3a, <4 x $2> %$1_3b) + +%r$1_01 = shufflevector <4 x $2> %r$1_0, <4 x $2> %r$1_1, + <8 x i32> +%r$1_23 = shufflevector <4 x $2> %r$1_2, <4 x $2> %r$1_3, + <8 x i32> + +%$1 = shufflevector <8 x $2> %r$1_01, <8 x $2> %r$1_23, + <16 x i32> +') + +;; Maps a 2-wide unary function to an 8-wide vector operand, returning an +;; 8-wide vector result +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide unary vector function to apply +;; $4: 8-wide operand value + +define(`unary2to8', ` + %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> 
$3(<2 x $2> %$1_3) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + +;; Maps an 2-wide binary function to two 8-wide vector operands +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide unary vector function to apply +;; $4: First 8-wide operand value +;; $5: Second 8-wide operand value + +define(`binary2to8', ` + %$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_2b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> 
undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + +;; The unary SSE round intrinsic takes a second argument that encodes the +;; rounding mode. This macro makes it easier to apply the 4-wide roundps +;; to 8-wide vector operands +;; $1: value to be rounded +;; $2: integer encoding of rounding mode +;; FIXME: this just has a ret statement at the end to return the result, +;; which is inconsistent with the macros above + +define(`round4to8', ` +%v0 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> +%v1 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%ret = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +ret <8 x float> %ret +' +) + +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + +define(`round8to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, + <8 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, + <8 x i32> +%r0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v0, i32 $2) +%r1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v1, i32 $2) +%ret = shufflevector <8 x float> %r0, <8 x float> %r1, + <16 x i32> +ret <16 x float> %ret +' +) + +define(`round4to8double', ` +%v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> +%v1 = 
shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> +%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) +%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) +%ret = shufflevector <4 x double> %r0, <4 x double> %r1, + <8 x i32> +ret <8 x double> %ret +' +) + +; and similarly for doubles... + +define(`round2to4double', ` +%v0 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> +%v1 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> +%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) +%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) +%ret = shufflevector <2 x double> %r0, <2 x double> %r1, + <4 x i32> +ret <4 x double> %ret +' +) + +define(`round2to8double', ` +%v0 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%v1 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%v2 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%v3 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) +%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) +%r2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v2, i32 $2) +%r3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v3, i32 $2) +%ret0 = shufflevector <2 x double> %r0, <2 x double> %r1, + <4 x i32> +%ret1 = shufflevector <2 x double> %r2, <2 x double> %r3, + <4 x i32> +%ret = shufflevector <4 x double> %ret0, <4 x double> %ret1, + <8 x i32> +ret <8 x double> %ret +' +) + +define(`round4to16double', ` +%v0 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%v1 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%v2 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%v3 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) +%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) +%r2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v2, i32 $2) +%r3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v3, i32 $2) +%ret0 = shufflevector <4 x double> %r0, <4 x double> %r1, + <8 x i32> +%ret1 = shufflevector <4 x double> %r2, <4 x double> %r3, + <8 x i32> +%ret = shufflevector <8 x double> %ret0, <8 x double> %ret1, + <16 x i32> +ret <16 x double> %ret +' +) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; forloop macro + +divert(`-1') +# forloop(var, from, to, stmt) - improved version: +# works even if VAR is not a strict macro name +# performs sanity check that FROM is larger than TO +# allows complex numerical expressions in TO and FROM +define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1', + `pushdef(`$1', eval(`$2'))_$0(`$1', + eval(`$3'), `$4')popdef(`$1')')') +define(`_forloop', + `$3`'ifelse(indir(`$1'), `$2', `', + `define(`$1', incr(indir(`$1')))$0($@)')') +divert`'dnl + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; stdlib_core +;; +;; This macro defines a bunch of helper routines that depend on the +;; target's vector width +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define(`shuffles', ` +') + +define(`define_shuffles',` +shuffles(i8, 1) +shuffles(i16, 2) +shuffles(float, 4) +shuffles(i32, 4) +shuffles(double, 8) +shuffles(i64, 8) +') + + +define(`mask_converts', ` +define 
internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i16_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) { + ret <$1 x i64> %0 +} +') + +mask_converts(WIDTH) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; count trailing and leading zeros + +define(`ctlztz', ` +declare_count_zeros() + +define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.cttz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.cttz.i64(i64 %0) + ret i64 %c +} + +define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.ctlz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.ctlz.i64(i64 %0) + ret i64 %c +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetching + +define(`define_prefetches', ` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, + i32 %cachetype) ; cachetype == 1 is dcache + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline {
+ call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) + ret void +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; AOS/SOA conversion primitives + +;; take 4 4-wide vectors laid out like ... +;; and reorder them to ... + +define(`aossoa', ` +declare void +@__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> %v3, <4 x float> * noalias %out0, + <4 x float> * noalias %out1, <4 x float> * noalias %out2, + <4 x float> * noalias %out3) nounwind alwaysinline ; + +;; Do the reverse of __aos_to_soa4_float4--reorder .. +;; to ... +;; This is the exact same set of operations that __aos_to_soa4_float4 does +;; (a 4x4 transpose), so just call that... + +declare void +@__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> %v3, <4 x float> * noalias %out0, + <4 x float> * noalias %out1, <4 x float> * noalias %out2, + <4 x float> * noalias %out3) nounwind alwaysinline; + +;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors +;; , transpose to +;; . + +declare void +@__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> * noalias %out0, <4 x float> * noalias %out1, + <4 x float> * noalias %out2) nounwind alwaysinline +;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors +;; to +;; . + +declare void +@__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> * noalias %out0, <4 x float> * noalias %out1, + <4 x float> * noalias %out2) nounwind alwaysinline +;; 8-wide +;; These functions implement the 8-wide variants of the AOS/SOA conversion +;; routines above. These implementations are all built on top of the 4-wide +;; vector versions.
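+
+;; Illustrative sketch (an assumption added for clarity, not taken from this
+;; file): the value names %t0..%t3 and %x..%w below are hypothetical and only
+;; show one possible shufflevector sequence for the 4x4 transpose referred to
+;; above; the declared routines may be implemented differently elsewhere. Given
+;;   v0 = (x0 y0 z0 w0)  v1 = (x1 y1 z1 w1)  v2 = (x2 y2 z2 w2)  v3 = (x3 y3 z3 w3)
+;; the transpose produces
+;;   out0 = (x0 x1 x2 x3)  out1 = (y0 y1 y2 y3)  out2 = (z0 z1 z2 z3)  out3 = (w0 w1 w2 w3)
+;; for example via:
+;;   %t0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; x0 x1 y0 y1
+;;   %t1 = shufflevector <4 x float> %v2, <4 x float> %v3, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; x2 x3 y2 y3
+;;   %t2 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; z0 z1 w0 w1
+;;   %t3 = shufflevector <4 x float> %v2, <4 x float> %v3, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; z2 z3 w2 w3
+;;   %x  = shufflevector <4 x float> %t0, <4 x float> %t1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+;;   %y  = shufflevector <4 x float> %t0, <4 x float> %t1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+;;   %z  = shufflevector <4 x float> %t2, <4 x float> %t3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+;;   %w  = shufflevector <4 x float> %t2, <4 x float> %t3, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+;; The 8- and 16-wide variants declared below repeat the same transpose for
+;; each group of four lanes.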
+ +declare void +@__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> %v3, <8 x float> * noalias %out0, + <8 x float> * noalias %out1, <8 x float> * noalias %out2, + <8 x float> * noalias %out3) nounwind alwaysinline + +declare void +@__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> %v3, <8 x float> * noalias %out0, + <8 x float> * noalias %out1, <8 x float> * noalias %out2, + <8 x float> * noalias %out3) nounwind alwaysinline + +declare void +@__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> * noalias %out0, <8 x float> * noalias %out1, + <8 x float> * noalias %out2) nounwind alwaysinline ; + + +declare void +@__soa_to_aos3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> * noalias %out0, <8 x float> * noalias %out1, + <8 x float> * noalias %out2) nounwind alwaysinline ; + +;; 16-wide + +declare void +@__aos_to_soa4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> %v3, <16 x float> * noalias %out0, + <16 x float> * noalias %out1, <16 x float> * noalias %out2, + <16 x float> * noalias %out3) nounwind alwaysinline ; + + +declare void +@__soa_to_aos4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> %v3, <16 x float> * noalias %out0, + <16 x float> * noalias %out1, <16 x float> * noalias %out2, + <16 x float> * noalias %out3) nounwind alwaysinline ; + +declare void +@__aos_to_soa3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> * noalias %out0, <16 x float> * noalias %out1, + <16 x float> * noalias %out2) nounwind alwaysinline ; + +declare void +@__soa_to_aos3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> * noalias %out0, <16 x float> * noalias %out1, + <16 x float> * noalias %out2) nounwind alwaysinline ; + +;; versions to be called from stdlib + +declare void +@__aos_to_soa4_float(float * noalias %p, + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) + nounwind alwaysinline ; + + +declare void +@__soa_to_aos4_float( %v0, %v1, %v2, + %v3, float * noalias %p) nounwind alwaysinline ; + + +declare void +@__aos_to_soa3_float(float * noalias %p, + * %out0, * %out1, + * %out2) nounwind alwaysinline ; + + +declare void +@__soa_to_aos3_float( %v0, %v1, %v2, + float * noalias %p) nounwind alwaysinline ; +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define(`masked_load_float_double', ` +define @__masked_load_float(i8 * %ptr, + %mask) readonly alwaysinline { + %v32 = call @__masked_load_i32(i8 * %ptr, %mask) + %vf = bitcast %v32 to + ret %vf +} + +define @__masked_load_double(i8 * %ptr, + %mask) readonly alwaysinline { + %v64 = call @__masked_load_i64(i8 * %ptr, %mask) + %vd = bitcast %v64 to + ret %vd +} + +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define(`masked_store_float_double', ` +define void @__masked_store_float( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_i32( * %ptr, %val, %2) + ret void +} + + +define void @__masked_store_double( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_i64( * %ptr, %val, %2) + ret void +} + +define void @__masked_store_blend_float( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_blend_i32( * %ptr, %val, 
%2) + ret void +} + + +define void @__masked_store_blend_double( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_blend_i64( * %ptr, %val, %2) + ret void +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +define(`stdlib_core', ` + +declare i32 @__fast_masked_vload() + +declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind + +declare i1 @__is_compile_time_constant_mask( %mask) +declare i1 @__is_compile_time_constant_uniform_int32(i32) +declare i1 @__is_compile_time_constant_varying_int32() + +; These declarations provide placeholder masked store functions for the +; front-end to use. +; +; void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask) +; void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask) +; void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask) +; void __pseudo_masked_store_float(uniform float *ptr, varying float values, mask) +; void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask) +; void __pseudo_masked_store_double(uniform double *ptr, varying double values, mask) +; +; These in turn are converted to native masked stores or to regular +; stores (if the mask is all on) by the MaskedStoreOptPass optimization +; pass. + +declare void @__pseudo_masked_store_i8( * nocapture, , ) +declare void @__pseudo_masked_store_i16( * nocapture, , ) +declare void @__pseudo_masked_store_i32( * nocapture, , ) +declare void @__pseudo_masked_store_float( * nocapture, , ) +declare void @__pseudo_masked_store_i64( * nocapture, , ) +declare void @__pseudo_masked_store_double( * nocapture, , ) + +; Declare the pseudo-gather functions. When the ispc front-end needs +; to perform a gather, it generates a call to one of these functions, +; which ideally have these signatures: +; +; varying int8 __pseudo_gather_i8(varying int8 *, mask) +; varying int16 __pseudo_gather_i16(varying int16 *, mask) +; varying int32 __pseudo_gather_i32(varying int32 *, mask) +; varying float __pseudo_gather_float(varying float *, mask) +; varying int64 __pseudo_gather_i64(varying int64 *, mask) +; varying double __pseudo_gather_double(varying double *, mask) +; +; However, vectors of pointers were not legal in LLVM until recently, so +; instead, it emits calls to functions that either take vectors of int32s +; or int64s, depending on the compilation target. + +declare @__pseudo_gather32_i8(, ) nounwind readonly +declare @__pseudo_gather32_i16(, ) nounwind readonly +declare @__pseudo_gather32_i32(, ) nounwind readonly +declare @__pseudo_gather32_float(, ) nounwind readonly +declare @__pseudo_gather32_i64(, ) nounwind readonly +declare @__pseudo_gather32_double(, ) nounwind readonly + +declare @__pseudo_gather64_i8(, ) nounwind readonly +declare @__pseudo_gather64_i16(, ) nounwind readonly +declare @__pseudo_gather64_i32(, ) nounwind readonly +declare @__pseudo_gather64_float(, ) nounwind readonly +declare @__pseudo_gather64_i64(, ) nounwind readonly +declare @__pseudo_gather64_double(, ) nounwind readonly + +; The ImproveMemoryOps optimization pass finds these calls and then +; tries to convert them to be calls to gather functions that take a uniform +; base pointer and then a varying integer offset, when possible.
+; +; For targets without a native gather instruction, it is best to factor the +; integer offsets like "{1/2/4/8} * varying_offset + constant_offset", +; where varying_offset includes non-compile time constant values, and +; constant_offset includes compile-time constant values. (The scalar loads +; generated in turn can then take advantage of the free offsetting and scale by +; 1/2/4/8 that is offered by the x86 addressing modes.) +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; int{32,64} offsets, uniform int32 offset_scale, +; int{32,64} offset_delta, mask) +; +; For targets with a gather instruction, it is better to just factor them into +; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the +; offsets are int32/64 vectors. +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; uniform int32 offset_scale, int{32,64} offsets, mask) + + +declare +@__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets32_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_double(i8 *, i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets64_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_double(i8 *, i32, , + ) nounwind readonly + +; Similarly to the pseudo-gathers defined above, we also declare undefined +; pseudo-scatter instructions with signatures: +; +; void __pseudo_scatter_i8 (varying int8 *, varying int8 values, mask) +; void __pseudo_scatter_i16(varying int16 *, varying int16 values, mask) +; void __pseudo_scatter_i32(varying int32 *, varying int32 values, mask) +; void __pseudo_scatter_float(varying float *,
varying float values, mask) +; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask) +; void __pseudo_scatter_double(varying double *, varying double values, mask) +; + +declare void @__pseudo_scatter32_i8(, , ) nounwind +declare void @__pseudo_scatter32_i16(, , ) nounwind +declare void @__pseudo_scatter32_i32(, , ) nounwind +declare void @__pseudo_scatter32_float(, , ) nounwind +declare void @__pseudo_scatter32_i64(, , ) nounwind +declare void @__pseudo_scatter32_double(, , ) nounwind + +declare void @__pseudo_scatter64_i8(, , ) nounwind +declare void @__pseudo_scatter64_i16(, , ) nounwind +declare void @__pseudo_scatter64_i32(, , ) nounwind +declare void @__pseudo_scatter64_float(, , ) nounwind +declare void @__pseudo_scatter64_i64(, , ) nounwind +declare void @__pseudo_scatter64_double(, , ) nounwind + +; And the ImproveMemoryOps optimization pass also finds these and +; either transforms them to scatters like: +; +; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, +; varying int32 offsets, uniform int32 offset_scale, +; varying int{32,64} offset_delta, varying int8 values, mask) +; (and similarly for 16/32/64 bit values) +; +; Or, if the target has a native scatter instruction: +; +; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base, +; uniform int32 offset_scale, varying int{32,64} offsets, +; varying int8 values, mask) +; (and similarly for 16/32/64 bit values) + +declare void +@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_float(i8 * 
nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , + , ) nounwind + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +declare void @__use8() +declare void @__use16() +declare void @__use32() +declare void @__usefloat() +declare void @__use64() +declare void @__usedouble() + +;; This is a temporary function that will be removed at the end of +;; compilation--the idea is that it calls out to all of the various +;; functions / pseudo-function declarations that we need to keep around +;; so that they are available to the various optimization passes. This +;; then prevents those functions from being removed as dead code when +;; we do early DCE... + +define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, + %v32, %v64, + %mask) { + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; loads + %ml8 = call @__masked_load_i8(i8 * %ptr, %mask) + call void @__use8( %ml8) + %ml16 = call @__masked_load_i16(i8 * %ptr, %mask) + call void @__use16( %ml16) + %ml32 = call @__masked_load_i32(i8 * %ptr, %mask) + call void @__use32( %ml32) + %mlf = call @__masked_load_float(i8 * %ptr, %mask) + call void @__usefloat( %mlf) + %ml64 = call @__masked_load_i64(i8 * %ptr, %mask) + call void @__use64( %ml64) + %mld = call @__masked_load_double(i8 * %ptr, %mask) + call void @__usedouble( %mld) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; stores + %pv8 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i8( * %pv8, %v8, + %mask) + %pv16 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i16( * %pv16, %v16, + %mask) + %pv32 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i32( * %pv32, %v32, + %mask) + %vf = bitcast %v32 to + %pvf = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_float( * %pvf, %vf, + %mask) + %pv64 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i64( * %pv64, %v64, + %mask) + %vd = bitcast %v64 to + %pvd = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_double( * %pvd, %vd, + %mask) + + call void @__masked_store_i8( * %pv8, %v8, %mask) + call void @__masked_store_i16( * %pv16, %v16, %mask) + call void @__masked_store_i32( * %pv32, %v32, %mask) + call void @__masked_store_float( * %pvf, %vf, %mask) + call void @__masked_store_i64( * %pv64, %v64, %mask) + call void @__masked_store_double( * %pvd, %vd, %mask) + + call void @__masked_store_blend_i8( * %pv8, %v8, + %mask) + call void @__masked_store_blend_i16( * %pv16, %v16, + %mask) + call void @__masked_store_blend_i32( * %pv32, %v32, + %mask) + call void @__masked_store_blend_float( * %pvf, %vf, + %mask) + call void @__masked_store_blend_i64( * %pv64, %v64, + %mask) + call void @__masked_store_blend_double( * %pvd, %vd, + %mask) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; gathers + + %pg32_8 = call @__pseudo_gather32_i8( %v32, + %mask) + call void @__use8( %pg32_8) + %pg32_16 = call @__pseudo_gather32_i16( %v32, + %mask) + call void @__use16( %pg32_16) + %pg32_32 = call @__pseudo_gather32_i32( %v32, + %mask) + call void @__use32( %pg32_32) + %pg32_f = call @__pseudo_gather32_float( %v32, + %mask) + call void @__usefloat( %pg32_f) + %pg32_64 = call @__pseudo_gather32_i64( %v32, + %mask) + call void @__use64( %pg32_64) + %pg32_d = call @__pseudo_gather32_double( %v32, + %mask) + call void @__usedouble( %pg32_d) + + 
%pg64_8 = call @__pseudo_gather64_i8( %v64, + %mask) + call void @__use8( %pg64_8) + %pg64_16 = call @__pseudo_gather64_i16( %v64, + %mask) + call void @__use16( %pg64_16) + %pg64_32 = call @__pseudo_gather64_i32( %v64, + %mask) + call void @__use32( %pg64_32) + %pg64_f = call @__pseudo_gather64_float( %v64, + %mask) + call void @__usefloat( %pg64_f) + %pg64_64 = call @__pseudo_gather64_i64( %v64, + %mask) + call void @__use64( %pg64_64) + %pg64_d = call @__pseudo_gather64_double( %v64, + %mask) + call void @__usedouble( %pg64_d) + + %g32_8 = call @__gather32_i8( %v32, + %mask) + call void @__use8( %g32_8) + %g32_16 = call @__gather32_i16( %v32, + %mask) + call void @__use16( %g32_16) + %g32_32 = call @__gather32_i32( %v32, + %mask) + call void @__use32( %g32_32) + %g32_f = call @__gather32_float( %v32, + %mask) + call void @__usefloat( %g32_f) + %g32_64 = call @__gather32_i64( %v32, + %mask) + call void @__use64( %g32_64) + %g32_d = call @__gather32_double( %v32, + %mask) + call void @__usedouble( %g32_d) + + %g64_8 = call @__gather64_i8( %v64, + %mask) + call void @__use8( %g64_8) + %g64_16 = call @__gather64_i16( %v64, + %mask) + call void @__use16( %g64_16) + %g64_32 = call @__gather64_i32( %v64, + %mask) + call void @__use32( %g64_32) + %g64_f = call @__gather64_float( %v64, + %mask) + call void @__usefloat( %g64_f) + %g64_64 = call @__gather64_i64( %v64, + %mask) + call void @__use64( %g64_64) + %g64_d = call @__gather64_double( %v64, + %mask) + call void @__usedouble( %g64_d) + +ifelse(HAVE_GATHER, `1', +` + %nfpgbo32_8 = call + @__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfpgbo32_8) + %nfpgbo32_16 = call + @__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfpgbo32_16) + %nfpgbo32_32 = call + @__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfpgbo32_32) + %nfpgbo32_f = call + @__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfpgbo32_f) + %nfpgbo32_64 = call + @__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfpgbo32_64) + %nfpgbo32_d = call + @__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfpgbo32_d) + + %nfpgbo64_8 = call + @__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfpgbo64_8) + %nfpgbo64_16 = call + @__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfpgbo64_16) + %nfpgbo64_32 = call + @__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfpgbo64_32) + %nfpgbo64_f = call + @__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfpgbo64_f) + %nfpgbo64_64 = call + @__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfpgbo64_64) + %nfpgbo64_d = call + @__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfpgbo64_d) + + %nfgbo32_8 = call + @__gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfgbo32_8) + %nfgbo32_16 = call + @__gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfgbo32_16) + %nfgbo32_32 = call + @__gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfgbo32_32) + %nfgbo32_f = call + @__gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void 
@__usefloat( %nfgbo32_f) + %nfgbo32_64 = call + @__gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfgbo32_64) + %nfgbo32_d = call + @__gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfgbo32_d) + + %nfgbo64_8 = call + @__gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfgbo64_8) + %nfgbo64_16 = call + @__gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfgbo64_16) + %nfgbo64_32 = call + @__gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfgbo64_32) + %nfgbo64_f = call + @__gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfgbo64_f) + %nfgbo64_64 = call + @__gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfgbo64_64) + %nfgbo64_d = call + @__gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfgbo64_d) +', +` + %pgbo32_8 = call + @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %pgbo32_8) + %pgbo32_16 = call + @__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %pgbo32_16) + %pgbo32_32 = call + @__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %pgbo32_32) + %pgbo32_f = call + @__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usefloat( %pgbo32_f) + %pgbo32_64 = call + @__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %pgbo32_64) + %pgbo32_d = call + @__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %pgbo32_d) + + %pgbo64_8 = call + @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use8( %pgbo64_8) + %pgbo64_16 = call + @__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use16( %pgbo64_16) + %pgbo64_32 = call + @__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use32( %pgbo64_32) + %pgbo64_f = call + @__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usefloat( %pgbo64_f) + %pgbo64_64 = call + @__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use64( %pgbo64_64) + %pgbo64_d = call + @__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usedouble( %pgbo64_d) + + %gbo32_8 = call + @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %gbo32_8) + %gbo32_16 = call + @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %gbo32_16) + %gbo32_32 = call + @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %gbo32_32) + %gbo32_f = call + @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usefloat( %gbo32_f) + %gbo32_64 = call + @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %gbo32_64) + %gbo32_d = call + @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %gbo32_d) + + %gbo64_8 = call + @__gather_factored_base_offsets64_i8(i8 * %ptr, 
%v64, i32 0, + %v64, %mask) + call void @__use8( %gbo64_8) + %gbo64_16 = call + @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use16( %gbo64_16) + %gbo64_32 = call + @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use32( %gbo64_32) + %gbo64_f = call + @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usefloat( %gbo64_f) + %gbo64_64 = call + @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use64( %gbo64_64) + %gbo64_d = call + @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usedouble( %pgbo64_d) +') + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; scatters + + call void @__pseudo_scatter32_i8( %v32, %v8, %mask) + call void @__pseudo_scatter32_i16( %v32, %v16, %mask) + call void @__pseudo_scatter32_i32( %v32, %v32, %mask) + call void @__pseudo_scatter32_float( %v32, %vf, %mask) + call void @__pseudo_scatter32_i64( %v32, %v64, %mask) + call void @__pseudo_scatter32_double( %v32, %vd, %mask) + + call void @__pseudo_scatter64_i8( %v64, %v8, %mask) + call void @__pseudo_scatter64_i16( %v64, %v16, %mask) + call void @__pseudo_scatter64_i32( %v64, %v32, %mask) + call void @__pseudo_scatter64_float( %v64, %vf, %mask) + call void @__pseudo_scatter64_i64( %v64, %v64, %mask) + call void @__pseudo_scatter64_double( %v64, %vd, %mask) + + call void @__scatter32_i8( %v32, %v8, %mask) + call void @__scatter32_i16( %v32, %v16, %mask) + call void @__scatter32_i32( %v32, %v32, %mask) + call void @__scatter32_float( %v32, %vf, %mask) + call void @__scatter32_i64( %v32, %v64, %mask) + call void @__scatter32_double( %v32, %vd, %mask) + + call void @__scatter64_i8( %v64, %v8, %mask) + call void @__scatter64_i16( %v64, %v16, %mask) + call void @__scatter64_i32( %v64, %v32, %mask) + call void @__scatter64_float( %v64, %vf, %mask) + call void @__scatter64_i64( %v64, %v64, %mask) + call void @__scatter64_double( %v64, %vd, %mask) + +ifelse(HAVE_SCATTER, `1', +` + call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) + + call void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__scatter_base_offsets32_i64(i8 
* %ptr, i32 0, %v32, + %v64, %mask) + call void @__scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) +', +` + call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + %v32, %mask) + call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + %vd, %mask) + + call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + %v8, %mask) + call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + %v16, %mask) + call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + %v32, %mask) + call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + %vf, %mask) + call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + %v64, %mask) + call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + %vd, %mask) + + call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + %v8, %mask) + call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + %v16, %mask) + call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + %v32, %mask) + call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + %vf, %mask) + call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + %v64, %mask) + call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + %vd, %mask) +') + + ret void +} + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; various bitcasts from one type to another + +define @__intbits_varying_float() nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast %0 to + ret %float_to_int_bitcast +} + +define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast float %0 to i32 + ret i32 %float_to_int_bitcast +} + +define @__intbits_varying_double() nounwind readnone alwaysinline { + 
%double_to_int_bitcast = bitcast %0 to + ret %double_to_int_bitcast +} + +define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast double %0 to i64 + ret i64 %double_to_int_bitcast +} + +define @__floatbits_varying_int32() nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast %0 to + ret %int_to_float_bitcast +} + +define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast i32 %0 to float + ret float %int_to_float_bitcast +} + +define @__doublebits_varying_int64() nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast %0 to + ret %int_to_double_bitcast +} + +define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast i64 %0 to double + ret double %int_to_double_bitcast +} + +define @__undef_varying() nounwind readnone alwaysinline { + ret undef +} + +define float @__undef_uniform() nounwind readnone alwaysinline { + ret float undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sign extension + +define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { + %r = sext i1 %0 to i32 + ret i32 %r +} + +define @__sext_varying_bool() nounwind readnone alwaysinline { +;; ifelse(MASK,i32, `ret %0', +;; `%se = sext %0 to +;; ret %se') + ifelse(MASK,i32, `%se = bitcast %0 to ', + MASK,i64, `%se = trunc %0 to ', + `%se = sext %0 to ') + ret %se +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; memcpy/memmove/memset + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, + i32 %len, i32 %align, i1 %isvolatile) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, + i64 %len, i32 %align, i1 %isvolatile) + +declare void @__memcpy32(i8 * %dst, i8 * %src, i32 %len) alwaysinline; +declare void @__memcpy64(i8 * %dst, i8 * %src, i64 %len) alwaysinline; + +declare void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, + i32 %len, i32 %align, i1 %isvolatile) +declare void @llvm.memmove.p0i8.p0i8.i64(i8* %dest, i8* %src, + i64 %len, i32 %align, i1 %isvolatile) + +declare void @__memmove32(i8 * %dst, i8 * %src, i32 %len) alwaysinline; +declare void @__memmove64(i8 * %dst, i8 * %src, i64 %len) alwaysinline + +declare void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 %len, i32 %align, + i1 %isvolatile) +declare void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %len, i32 %align, + i1 %isvolatile) + +declare void @__memset32(i8 * %dst, i8 %val, i32 %len) alwaysinline ; +declare void @__memset64(i8 * %dst, i8 %val, i64 %len) alwaysinline; + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; new/delete + +;; Set of functions for 32 bit runtime. +;; They are different for Windows and Unix (Linux/MacOS), +;; on Windows we have to use _aligned_malloc/_aligned_free, +;; while on Unix we use posix_memalign/free +;; +;; Note that this should be really two different libraries for 32 and 64 +;; environment and it should happen sooner or later + +ifelse(WIDTH, 1, `define(`ALIGNMENT', `16')', `define(`ALIGNMENT', `eval(WIDTH*4)')') + +@memory_alignment = internal constant i32 ALIGNMENT + +ifelse(BUILD_OS, `UNIX', +` + +ifelse(RUNTIME, `32', +` + +;; Unix 32 bit environment. 
+;; Use: posix_memalign and free +;; Define: +;; - __new_uniform_32rt +;; - __new_varying32_32rt +;; - __delete_uniform_32rt +;; - __delete_varying_32rt + +declare i8* @malloc(i32) +declare i32 @posix_memalign(i8**, i32, i32) +declare void @free(i8 *) + +declare noalias i8 * @__new_uniform_32rt(i64 %size); +declare @__new_varying32_32rt( %size, %mask); +declare void @__delete_uniform_32rt(i8 * %ptr); +declare void @__delete_varying_32rt( %ptr, %mask); + +', +RUNTIME, `64', +` + +;; Unix 64 bit environment. +;; Use: posix_memalign and free +;; Define: +;; - __new_uniform_64rt +;; - __new_varying32_64rt +;; - __new_varying64_64rt +;; - __delete_uniform_64rt +;; - __delete_varying_64rt + +declare i8* @malloc(i64) +declare void @free(i8 *) + +define noalias i8 * @__new_uniform_64rt(i64 %size) +{ +entry: +;; compute laneIdx = __tid_x() & (__warpsize() - 1) + %and = call i32 @__program_index() +;; if (laneIdx == 0) + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call2 = tail call noalias i8* @malloc(i64 %size) + %phitmp = ptrtoint i8* %call2 to i64 + br label %if.end + +if.end: ; preds = %if.then, %entry + %ptr.0 = phi i64 [ %phitmp, %if.then ], [ undef, %entry ] + %val.sroa.0.0.extract.trunc = trunc i64 %ptr.0 to i32 + %call3 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.0.extract.trunc, i32 0) + %val.sroa.0.0.insert.ext = zext i32 %call3 to i64 + %val.sroa.0.4.extract.shift = lshr i64 %ptr.0, 32 + %val.sroa.0.4.extract.trunc = trunc i64 %val.sroa.0.4.extract.shift to i32 + %call8 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.4.extract.trunc, i32 0) + %val.sroa.0.4.insert.ext = zext i32 %call8 to i64 + %val.sroa.0.4.insert.shift = shl nuw i64 %val.sroa.0.4.insert.ext, 32 + %val.sroa.0.4.insert.insert = or i64 %val.sroa.0.4.insert.shift, %val.sroa.0.0.insert.ext + %0 = inttoptr i64 %val.sroa.0.4.insert.insert to i8* + ret i8* %0 +} +define void @__delete_uniform_64rt(i8 * %ptr) +{ +entry: + %and = call i32 @__program_index() + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @free(i8* %ptr) + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +define <1 x i64> @__new_varying32_64rt(<1 x i32> %sizev, <1 x i1> %maskv) +{ +entry: + %size32 = extractelement <1 x i32> %sizev, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + %size64 = zext i32 %size32 to i64 + br i1 %mask, label %alloc, label %skip + +alloc: + %ptr = tail call noalias i8* @malloc(i64 %size64) + %addr1 = ptrtoint i8* %ptr to i64 + br label %skip + +skip: + %addr64 = phi i64 [ %addr1, %alloc], [ 0, %entry ] + %addr = insertelement <1 x i64> undef, i64 %addr64, i32 0 + ret <1 x i64> %addr +} + +define <1 x i64> @__new_varying64_64rt(<1 x i64> %sizev, <1 x i1> %maskv) +{ +entry: + %size64 = extractelement <1 x i64> %sizev, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + br i1 %mask, label %alloc, label %skip + +alloc: + %ptr = tail call noalias i8* @malloc(i64 %size64) + %addr1 = ptrtoint i8* %ptr to i64 + br label %skip + +skip: + %addr64 = phi i64 [ %addr1, %alloc], [ 0, %entry ] + %addr = insertelement <1 x i64> undef, i64 %addr64, i32 0 + ret <1 x i64> %addr +} + +define void @__delete_varying_64rt(<1 x i64> %ptrv, <1 x i1> %maskv) +{ +entry: + %addr64 = extractelement <1 x i64> %ptrv, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + br i1 %mask, label %free, label %skip + +free: + %ptr = inttoptr i64 %addr64 to i8* + tail call void @free(i8* %ptr) + br label 
%skip + +skip: + ret void +} +', ` +errprint(`RUNTIME should be defined to either 32 or 64 +') +m4exit(`1') +') + +', +BUILD_OS, `WINDOWS', +` + +ifelse(RUNTIME, `32', +` + +;; Windows 32 bit environment. +;; Use: _aligned_malloc and _aligned_free +;; Define: +;; - __new_uniform_32rt +;; - __new_varying32_32rt +;; - __delete_uniform_32rt +;; - __delete_varying_32rt + +declare i8* @_aligned_malloc(i32, i32) +declare void @_aligned_free(i8 *) + +define noalias i8 * @__new_uniform_32rt(i64 %size) { + %conv = trunc i64 %size to i32 + %alignment = load i32* @memory_alignment + %ptr = tail call i8* @_aligned_malloc(i32 %conv, i32 %alignment) + ret i8* %ptr +} + +define @__new_varying32_32rt( %size, %mask) { + %ret = alloca + store zeroinitializer, * %ret + %ret64 = bitcast * %ret to i64 * + %alignment = load i32* @memory_alignment + + per_lane(WIDTH, %mask, ` + %sz_LANE_ID = extractelement %size, i32 LANE + %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i32 %sz_LANE_ID, i32 %alignment) + %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 + %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') + + %r = load * %ret + ret %r +} + +define void @__delete_uniform_32rt(i8 * %ptr) { + call void @_aligned_free(i8 * %ptr) + ret void +} + +define void @__delete_varying_32rt( %ptr, %mask) { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 * + call void @_aligned_free(i8 * %ptr_LANE_ID) + ') + ret void +} + +', +RUNTIME, `64', +` + +;; Windows 64 bit environment. +;; Use: _aligned_malloc and _aligned_free +;; Define: +;; - __new_uniform_64rt +;; - __new_varying32_64rt +;; - __new_varying64_64rt +;; - __delete_uniform_64rt +;; - __delete_varying_64rt + +declare i8* @_aligned_malloc(i64, i64) +declare void @_aligned_free(i8 *) + +define noalias i8 * @__new_uniform_64rt(i64 %size) { + %alignment = load i32* @memory_alignment + %alignment64 = sext i32 %alignment to i64 + %ptr = tail call i8* @_aligned_malloc(i64 %size, i64 %alignment64) + ret i8* %ptr +} + +define @__new_varying32_64rt( %size, %mask) { + %ret = alloca + store zeroinitializer, * %ret + %ret64 = bitcast * %ret to i64 * + %alignment = load i32* @memory_alignment + %alignment64 = sext i32 %alignment to i64 + + per_lane(WIDTH, %mask, ` + %sz_LANE_ID = extractelement %size, i32 LANE + %sz64_LANE_ID = zext i32 %sz_LANE_ID to i64 + %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) + %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 + %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') + + %r = load * %ret + ret %r +} + +define @__new_varying64_64rt( %size, %mask) { + %ret = alloca + store zeroinitializer, * %ret + %ret64 = bitcast * %ret to i64 * + %alignment = load i32* @memory_alignment + %alignment64 = sext i32 %alignment to i64 + + per_lane(WIDTH, %mask, ` + %sz64_LANE_ID = extractelement %size, i32 LANE + %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) + %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 + %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') + + %r = load * %ret + ret %r +} + +define void @__delete_uniform_64rt(i8 * %ptr) { + call void @_aligned_free(i8 * %ptr) + ret void +} + +define void @__delete_varying_64rt( %ptr, %mask) { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptr, i32 LANE + 
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 * + call void @_aligned_free(i8 * %ptr_LANE_ID) + ') + ret void +} + +', ` +errprint(`RUNTIME should be defined to either 32 or 64 +') +m4exit(`1') +') + +', +` +errprint(`BUILD_OS should be defined to either UNIX or WINDOWS +') +m4exit(`1') +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; read hw clock + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; stdlib transcendentals +;; +;; These functions provide entrypoints that call out to the libm +;; implementations of the transcendental functions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +declare float @sinf(float) nounwind readnone +declare float @cosf(float) nounwind readnone +declare void @sincosf(float, float *, float *) nounwind readnone +declare float @asinf(float) nounwind readnone +declare float @acosf(float) nounwind readnone +declare float @tanf(float) nounwind readnone +declare float @atanf(float) nounwind readnone +declare float @atan2f(float, float) nounwind readnone +declare float @expf(float) nounwind readnone +declare float @logf(float) nounwind readnone +declare float @powf(float, float) nounwind readnone + +define float @__stdlib_sinf(float) nounwind readnone alwaysinline { + %r = call float @sinf(float %0) + ret float %r +} + +define float @__stdlib_cosf(float) nounwind readnone alwaysinline { + %r = call float @cosf(float %0) + ret float %r +} + +define void @__stdlib_sincosf(float, float *, float *) nounwind readnone alwaysinline { + call void @sincosf(float %0, float *%1, float *%2) + ret void +} + +define float @__stdlib_asinf(float) nounwind readnone alwaysinline { + %r = call float @asinf(float %0) + ret float %r +} + +define float @__stdlib_acosf(float) nounwind readnone alwaysinline { + %r = call float @acosf(float %0) + ret float %r +} + +define float @__stdlib_tanf(float) nounwind readnone alwaysinline { + %r = call float @tanf(float %0) + ret float %r +} + +define float @__stdlib_atanf(float) nounwind readnone alwaysinline { + %r = call float @atanf(float %0) + ret float %r +} + +define float @__stdlib_atan2f(float, float) nounwind readnone alwaysinline { + %r = call float @atan2f(float %0, float %1) + ret float %r +} + +define float @__stdlib_logf(float) nounwind readnone alwaysinline { + %r = call float @logf(float %0) + ret float %r +} + +define float @__stdlib_expf(float) nounwind readnone alwaysinline { + %r = call float @expf(float %0) + ret float %r +} + +define float @__stdlib_powf(float, float) nounwind readnone alwaysinline { + %r = call float @powf(float %0, float %1) + ret float %r +} + +declare double @sin(double) nounwind readnone +declare double @asin(double) nounwind readnone +declare double @cos(double) nounwind readnone +declare void @sincos(double, double *, double *) nounwind readnone +declare double @tan(double) nounwind readnone +declare double @atan(double) nounwind readnone +declare double @atan2(double, double) nounwind readnone +declare double @exp(double) nounwind readnone +declare double @log(double) nounwind readnone +declare double @pow(double, double) nounwind readnone + +define double @__stdlib_sin(double) nounwind readnone alwaysinline { + %r = call double @sin(double %0) + ret double %r +} + +define double @__stdlib_asin(double) nounwind readnone alwaysinline { + %r = call double @asin(double %0) + ret double %r +} + +define double @__stdlib_cos(double) nounwind readnone alwaysinline { + %r = call double @cos(double %0) + ret 
double %r +} + +define void @__stdlib_sincos(double, double *, double *) nounwind readnone alwaysinline { + call void @sincos(double %0, double *%1, double *%2) + ret void +} + +define double @__stdlib_tan(double) nounwind readnone alwaysinline { + %r = call double @tan(double %0) + ret double %r +} + +define double @__stdlib_atan(double) nounwind readnone alwaysinline { + %r = call double @atan(double %0) + ret double %r +} + +define double @__stdlib_atan2(double, double) nounwind readnone alwaysinline { + %r = call double @atan2(double %0, double %1) + ret double %r +} + +define double @__stdlib_log(double) nounwind readnone alwaysinline { + %r = call double @log(double %0) + ret double %r +} + +define double @__stdlib_exp(double) nounwind readnone alwaysinline { + %r = call double @exp(double %0) + ret double %r +} + +define double @__stdlib_pow(double, double) nounwind readnone alwaysinline { + %r = call double @pow(double %0, double %1) + ret double %r +} + + +') + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 64-bit integer min and max functions + +;; utility function used by int64minmax below. This shouldn't be called by +;; target .ll files directly. +;; $1: target vector width +;; $2: {min,max} (used in constructing function names) +;; $3: {int64,uint64} (used in constructing function names) +;; $4: {slt,sgt,ult,ugt} comparison operator to use + +define(`i64minmax', ` +define i64 @__$2_uniform_$3(i64, i64) nounwind alwaysinline readnone { + %c = icmp $4 i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline readnone { + %rptr = alloca <$1 x i64> + %r64ptr = bitcast <$1 x i64> * %rptr to i64 * + + forloop(i, 0, eval($1-1), ` + %v0_`'i = extractelement <$1 x i64> %0, i32 i + %v1_`'i = extractelement <$1 x i64> %1, i32 i + %c_`'i = icmp $4 i64 %v0_`'i, %v1_`'i + %v_`'i = select i1 %c_`'i, i64 %v0_`'i, i64 %v1_`'i + %ptr_`'i = getelementptr i64 * %r64ptr, i32 i + store i64 %v_`'i, i64 * %ptr_`'i +') + + %ret = load <$1 x i64> * %rptr + ret <$1 x i64> %ret +} +') + +;; this is the function that target .ll files should call; it just takes the target +;; vector width as a parameter + +define(`int64minmax', ` +i64minmax(WIDTH,min,int64,slt) +i64minmax(WIDTH,max,int64,sgt) +i64minmax(WIDTH,min,uint64,ult) +i64minmax(WIDTH,max,uint64,ugt) +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Emit general-purpose code to do a masked load for targets that don't have +;; an instruction to do that. Parameters: +;; $1: element type for which to emit the function (i32, i64, ...) (and suffix for function name) +;; $2: alignment for elements of type $1 (4, 8, ...) + +define(`masked_load', ` +define @__masked_load_$1(i8 *, %mask) nounwind alwaysinline { +entry: + %mm = call i64 @__movmsk( %mask) + + ; if the first lane and the last lane are on, then it is safe to do a vector load + ; of the whole thing--what the lanes in the middle want turns out to not matter...
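+
+  ; Illustrative example (an assumption added for clarity, not from the
+  ; original file): on an 8-wide target a mask of 0x81 has lane 0 and lane 7
+  ; active, so the program already touches both the first and the last element
+  ; at this address; a full 8-element vector load covers only the bytes between
+  ; those two elements and therefore cannot fault, regardless of the middle
+  ; lanes. The and/lshr/trunc sequence below extracts exactly those two mask
+  ; bits and requires both to be set before taking the fast vector-load path.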
+ %mm_and_low = and i64 %mm, 1 + %mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON + %mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1) + %mm_and_low_i1 = trunc i64 %mm_and_low to i1 + %mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1 + %can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1 + + %fast32 = call i32 @__fast_masked_vload() + %fast_i1 = trunc i32 %fast32 to i1 + %can_vload_maybe_fast = or i1 %fast_i1, %can_vload + + ; if we are not able to do a singe vload, we will accumulate lanes in this memory.. + %retptr = alloca + %retptr32 = bitcast * %retptr to $1 * + br i1 %can_vload_maybe_fast, label %load, label %loop + +load: + %ptr = bitcast i8 * %0 to * + %valall = load * %ptr, align $2 + ret %valall + +loop: + ; loop over the lanes and see if each one is on... + %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ] + %lane64 = zext i32 %lane to i64 + %lanemask = shl i64 1, %lane64 + %mask_and = and i64 %mm, %lanemask + %do_lane = icmp ne i64 %mask_and, 0 + br i1 %do_lane, label %load_lane, label %lane_done + +load_lane: + ; yes! do the load and store the result into the appropriate place in the + ; allocaed memory above + %ptr32 = bitcast i8 * %0 to $1 * + %lane_ptr = getelementptr $1 * %ptr32, i32 %lane + %val = load $1 * %lane_ptr + %store_ptr = getelementptr $1 * %retptr32, i32 %lane + store $1 %val, $1 * %store_ptr + br label %lane_done + +lane_done: + %next_lane = add i32 %lane, 1 + %done = icmp eq i32 %lane, eval(WIDTH-1) + br i1 %done, label %return, label %loop + +return: + %r = load * %retptr + ret %r +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store +;; emit code to do masked store as a set of per-lane scalar stores +;; parameters: +;; $1: llvm type of elements (and suffix for function name) + +define(`gen_masked_store', ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` + %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE + %storeval_LANE_ID = extractelement %1, i32 LANE + store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') + ret void +} +') + +define(`masked_store_blend_8_16_by_4', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i32>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i32> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 %mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i32> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i32>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i32> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i32> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + 
') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + +define(`masked_store_blend_8_16_by_4_mask64', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i64> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 %mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i64> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + +define(`masked_store_blend_8_16_by_8', ` +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x i32>) nounwind alwaysinline { + %old = load <8 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <8 x i8> %old to i64 + %new64 = bitcast <8 x i8> %1 to i64 + + %mask8 = trunc <8 x i32> %2 to <8 x i8> + %mask64 = bitcast <8 x i8> %mask8 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <8 x i8> + ',` + %m = trunc <8 x i32> %2 to <8 x i1> + %resultvec = select <8 x i1> %m, <8 x i8> %1, <8 x i8> %old + ') + store <8 x i8> %resultvec, <8 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x i32>) nounwind alwaysinline { + %old = load <8 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old128 = bitcast <8 x i16> %old to i128 + %new128 = bitcast <8 x i16> %1 to i128 + + %mask16 = trunc <8 x i32> %2 to <8 x i16> + %mask128 = bitcast <8 x i16> %mask16 to i128 + %notmask128 = xor i128 %mask128, -1 + + %newmasked = and i128 %new128, %mask128 + %oldmasked = and i128 %old128, %notmask128 + %result = or i128 %newmasked, %oldmasked + + %resultvec = bitcast i128 %result to <8 x i16> + ',` + %m = trunc <8 x i32> %2 to <8 x i1> + %resultvec = select <8 x i1> %m, <8 x i16> %1, <8 x i16> %old + ') + store <8 x i16> %resultvec, <8 x i16> * %0, align 2 + ret void +} +') + + +define(`masked_store_blend_8_16_by_16', ` +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x i32>) nounwind alwaysinline { + %old = load <16 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old128 = bitcast <16 x i8> %old to i128 + %new128 = bitcast <16 x i8> %1 to i128 + + %mask8 = trunc <16 x i32> %2 to <16 x i8> + %mask128 = bitcast <16 x i8> %mask8 to i128 + %notmask128 
= xor i128 %mask128, -1 + + %newmasked = and i128 %new128, %mask128 + %oldmasked = and i128 %old128, %notmask128 + %result = or i128 %newmasked, %oldmasked + + %resultvec = bitcast i128 %result to <16 x i8> + ',` + %m = trunc <16 x i32> %2 to <16 x i1> + %resultvec = select <16 x i1> %m, <16 x i8> %1, <16 x i8> %old + ') + store <16 x i8> %resultvec, <16 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x i32>) nounwind alwaysinline { + %old = load <16 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old256 = bitcast <16 x i16> %old to i256 + %new256 = bitcast <16 x i16> %1 to i256 + + %mask16 = trunc <16 x i32> %2 to <16 x i16> + %mask256 = bitcast <16 x i16> %mask16 to i256 + %notmask256 = xor i256 %mask256, -1 + + %newmasked = and i256 %new256, %mask256 + %oldmasked = and i256 %old256, %notmask256 + %result = or i256 %newmasked, %oldmasked + + %resultvec = bitcast i256 %result to <16 x i16> + ',` + %m = trunc <16 x i32> %2 to <16 x i1> + %resultvec = select <16 x i1> %m, <16 x i16> %1, <16 x i16> %old + ') + store <16 x i16> %resultvec, <16 x i16> * %0, align 2 + ret void +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; packed load and store functions +;; +;; These define functions to emulate those nice packed load and packed store +;; instructions. For packed store, given a pointer to destination array and +;; an offset into the array, for each lane where the mask is on, the +;; corresponding value for that lane is stored into packed locations in the +;; destination array. For packed load, each lane that has an active mask +;; loads a sequential value from the array. +;; +;; $1: vector width of the target +;; +;; FIXME: use the per_lane macro, defined below, to implement these! 
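+;; +;; For example, with a 4-wide gang, values (a, b, c, d) and mask (on, off, on, on), +;; a packed store writes a, c, d into the first three slots starting at the given +;; offset in the destination array, and the matching packed load fills the three +;; active lanes with consecutive values read from the array.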
+ +define(`packed_load_and_store', ` + +define i32 @__packed_load_active(i32 * %startptr, <1 x i32> * %val_ptr, + <1 x i1> %full_mask) nounwind alwaysinline { +entry: + %active = extractelement <1 x i1> %full_mask, i32 0 + %call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active) + %res.sroa.0.0.extract.trunc = trunc i64 %call to i32 + br i1 %active, label %if.then, label %if.end + +if.then: ; preds = %entry + %idxprom = ashr i64 %call, 32 + %arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom + %val = load i32* %arrayidx, align 4 + %valvec = insertelement <1 x i32> undef, i32 %val, i32 0 + store <1 x i32> %valvec, <1 x i32>* %val_ptr, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret i32 %res.sroa.0.0.extract.trunc +} + +define i32 @__packed_store_active(i32 * %startptr, %vals, + %full_mask) nounwind alwaysinline +{ +entry: + %active = extractelement <1 x i1> %full_mask, i32 0 + %call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active) + %res.sroa.0.0.extract.trunc = trunc i64 %call to i32 + br i1 %active, label %if.then, label %if.end + +if.then: ; preds = %entry + %idxprom = ashr i64 %call, 32 + %arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom + %val = extractelement <1 x i32> %vals, i32 0 + store i32 %val, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret i32 %res.sroa.0.0.extract.trunc +} + +define i32 @__packed_store_active2(i32 * %startptr, <1 x i32> %vals, + <1 x i1> %full_mask) nounwind alwaysinline +{ + %ret = call i32 @__packed_store_active(i32* %startptr, + <1 x i32> %vals, <1 x i1> %full_mask); + ret i32 %ret +} +') + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reduce_equal + +;; count leading/trailing zeros +;; Macros declares set of count-trailing and count-leading zeros. +;; Macros behaves as a static functon - it works only at first invokation +;; to avoid redifinition. +define(`declare_count_zeros', ` +ifelse(count_zeros_are_defined, true, `', +` +declare i32 @llvm.ctlz.i32(i32) +declare i64 @llvm.ctlz.i64(i64) +declare i32 @llvm.cttz.i32(i32) +declare i64 @llvm.cttz.i64(i64) + +define(`count_zeros_are_defined', true) +') + +') + +define(`reduce_equal_aux', ` +declare_count_zeros() + +define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue, + <$1 x MASK> %mask) nounwind alwaysinline { +entry: + %mm = call i64 @__movmsk(<$1 x MASK> %mask) + %allon = icmp eq i64 %mm, ALL_ON_MASK + br i1 %allon, label %check_neighbors, label %domixed + +domixed: + ; First, figure out which lane is the first active one + %first = call i64 @llvm.cttz.i64(i64 %mm) + %first32 = trunc i64 %first to i32 + %baseval = extractelement <$1 x $2> %v, i32 %first32 + %basev1 = insertelement <$1 x $2> undef, $2 %baseval, i32 0 + ; get a vector that is that value smeared across all elements + %basesmear = shufflevector <$1 x $2> %basev1, <$1 x $2> undef, + <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 > + + ; now to a blend of that vector with the original vector, such that the + ; result will be the original value for the active lanes, and the value + ; from the first active lane for the inactive lanes. Given that, we can + ; just unconditionally check if the lanes are all equal in check_neighbors + ; below without worrying about inactive lanes... 
+ %ptr = alloca <$1 x $2> + store <$1 x $2> %basesmear, <$1 x $2> * %ptr + %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> * + %castv = bitcast <$1 x $2> %v to <$1 x $4> + call void @__masked_store_blend_i$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask) + %blendvec = load <$1 x $2> * %ptr + br label %check_neighbors + +check_neighbors: + %vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ] + ifelse($6, `32', ` + ; For 32-bit elements, we rotate once and compare with the vector, which ends + ; up comparing each element to its neighbor on the right. Then see if + ; all of those values are true; if so, then all of the elements are equal.. + %castvec = bitcast <$1 x $2> %vec to <$1 x $4> + %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) + %vr = bitcast <$1 x $4> %castvr to <$1 x $2> + %eq = $5 $7 <$1 x $2> %vec, %vr + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') + %alleq = icmp eq i64 %eqmm, ALL_ON_MASK + br i1 %alleq, label %all_equal, label %not_all_equal + ', ` + ; But for 64-bit elements, it turns out to be more efficient to just + ; scalarize and do a individual pairwise comparisons and AND those + ; all together.. + forloop(i, 0, eval($1-1), ` + %v`'i = extractelement <$1 x $2> %vec, i32 i') + + forloop(i, 0, eval($1-2), ` + %eq`'i = $5 $7 $2 %v`'i, %v`'eval(i+1)') + + %and0 = and i1 %eq0, %eq1 + forloop(i, 1, eval($1-3), ` + %and`'i = and i1 %and`'eval(i-1), %eq`'eval(i+1)') + + br i1 %and`'eval($1-3), label %all_equal, label %not_all_equal + ') + +all_equal: + %the_value = extractelement <$1 x $2> %vec, i32 0 + store $2 %the_value, $2 * %samevalue + ret i1 true + +not_all_equal: + ret i1 false +} +') + +define(`reduce_equal', ` +reduce_equal_aux($1, i32, int32, i32, icmp, 32, eq) +reduce_equal_aux($1, float, float, i32, fcmp, 32, oeq) +reduce_equal_aux($1, i64, int64, i64, icmp, 64, eq) +reduce_equal_aux($1, double, double, i64, fcmp, 64, oeq) +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; per_lane +;; +;; The scary macro below encapsulates the 'scalarization' idiom--i.e. 
we have +;; some operation that we'd like to perform only for the lanes where the +;; mask is on +;; $1: vector width of the target +;; $2: variable that holds the mask +;; $3: block of code to run for each lane that is on +;; Inside this code, any instances of the text "LANE" are replaced +;; with an i32 value that represents the current lane number + +; num lanes, mask, code block to do per lane +define(`per_lane', ` + br label %pl_entry + +pl_entry: + %pl_mask = call i64 @__movmsk($2) + %pl_mask_known = call i1 @__is_compile_time_constant_mask($2) + br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask + +pl_known_mask: + ;; the mask is known at compile time; see if it is something we can + ;; handle more efficiently + %pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK + br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask + +pl_all_on: + ;; the mask is all on--just expand the code for each lane sequentially + forloop(i, 0, eval($1-1), + `patsubst(`$3', `LANE', i)') + br label %pl_done + +pl_unknown_mask: + ;; we just run the general case, though we could + ;; try to be smart and just emit the code based on what it actually is, + ;; for example by emitting the code straight-line without a loop and doing + ;; the lane tests explicitly, leaving later optimization passes to eliminate + ;; the stuff that is definitely not needed. Not clear if we will frequently + ;; encounter a mask that is known at compile-time but is not either all on or + ;; all off... + br label %pl_loop + +pl_loop: + ;; Loop over each lane and see if we want to do the work for this lane + %pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ] + %pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ] + + ; is the current lane on? if so, goto do work, otherwise to end of loop + %pl_and = and i64 %pl_mask, %pl_lanemask + %pl_doit = icmp eq i64 %pl_and, %pl_lanemask + br i1 %pl_doit, label %pl_dolane, label %pl_loopend + +pl_dolane: + ;; If so, substitute in the code from the caller and replace the LANE + ;; stuff with the current lane number + patsubst(`patsubst(`$3', `LANE_ID', `_id')', `LANE', `%pl_lane') + br label %pl_loopend + +pl_loopend: + %pl_nextlane = add i32 %pl_lane, 1 + %pl_nextlanemask = mul i64 %pl_lanemask, 2 + + ; are we done yet? 
+ %pl_test = icmp ne i32 %pl_nextlane, $1 + br i1 %pl_test, label %pl_loop, label %pl_done + +pl_done: +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather +;; +;; $1: scalar type for which to generate functions to do gathers + +define(`gen_gather_general', ` +; fully general 32-bit gather, takes array of pointers encoded as vector of i32s +define @__gather32_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} + +; fully general 64-bit gather, takes array of pointers encoded as vector of i32s +define @__gather64_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} +') + +; vec width, type +define(`gen_gather_factored', ` +;; Define the utility function to do the gather operation for a single element +;; of the type +define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %ret, + i32 %lane) nounwind readonly alwaysinline { + ; compute address for this one from the base + %offset32 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %offset64 = sext i32 %offset32 to i64 + %scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta = extractelement %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + ; load value and insert into returned value + %ptrcast = bitcast i8 * %finalptr to $1 * + %val = load $1 *%ptrcast + %updatedret = insertelement %ret, $1 %val, i32 %lane + ret %updatedret +} + +define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %ret, + i32 %lane) nounwind readonly alwaysinline { + ; compute address for this one from the base + %offset64 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %offset_scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %offset_scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta64 = extractelement %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + ; load value and insert into returned value + %ptrcast = bitcast i8 * %finalptr to $1 * + %val = load $1 *%ptrcast + %updatedret = insertelement %ret, $1 %val, i32 %lane + ret %updatedret +} + + +define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, + %vecmask) nounwind readonly alwaysinline { + ; We can be clever and avoid the per-lane stuff for gathers if we are willing + ; to require that the 0th element of the array being gathered from is always + ; legal 
to read from (and we do indeed require that, given the benefits!) + ; + ; Set the offset to zero for lanes that are off + %offsetsPtr = alloca + store zeroinitializer, * %offsetsPtr + call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, + %vecmask) + %newOffsets = load * %offsetsPtr + + %deltaPtr = alloca + store zeroinitializer, * %deltaPtr + call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, + %vecmask) + %newDelta = load * %deltaPtr + + %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, + i32 %offset_scale, %newDelta, + undef, i32 0) + forloop(lane, 1, eval(WIDTH-1), + `patsubst(patsubst(`%retLANE = call @__gather_elt32_$1(i8 * %ptr, + %newOffsets, i32 %offset_scale, %newDelta, + %retPREV, i32 LANE) + ', `LANE', lane), `PREV', eval(lane-1))') + ret %ret`'eval(WIDTH-1) +} + +define @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, + %vecmask) nounwind readonly alwaysinline { + ; We can be clever and avoid the per-lane stuff for gathers if we are willing + ; to require that the 0th element of the array being gathered from is always + ; legal to read from (and we do indeed require that, given the benefits!) + ; + ; Set the offset to zero for lanes that are off + %offsetsPtr = alloca + store zeroinitializer, * %offsetsPtr + call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, + %vecmask) + %newOffsets = load * %offsetsPtr + + %deltaPtr = alloca + store zeroinitializer, * %deltaPtr + call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, + %vecmask) + %newDelta = load * %deltaPtr + + %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, + i32 %offset_scale, %newDelta, + undef, i32 0) + forloop(lane, 1, eval(WIDTH-1), + `patsubst(patsubst(`%retLANE = call @__gather_elt64_$1(i8 * %ptr, + %newOffsets, i32 %offset_scale, %newDelta, + %retPREV, i32 LANE) + ', `LANE', lane), `PREV', eval(lane-1))') + ret %ret`'eval(WIDTH-1) +} + +gen_gather_general($1) +' +) + +; vec width, type +define(`gen_gather', ` + +gen_gather_factored($1) + +define +@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale_vec = bitcast i32 %offset_scale to <1 x i32> + %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, + zeroinitializer, %vecmask) + ret %v +} + +define +@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale64 = zext i32 %offset_scale to i64 + %scale_vec = bitcast i64 %scale64 to <1 x i64> + %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, + i32 1, zeroinitializer, %vecmask) + ret %v +} + +' +) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gen_scatter +;; Emit a function declaration for a scalarized scatter. 
+;; +;; $1: scalar type for which we want to generate code to scatter + +define(`gen_scatter', ` +;; Define the function that descripes the work to do to scatter a single +;; value +define void @__scatter_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %values, + i32 %lane) nounwind alwaysinline { + %offset32 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %offset64 = sext i32 %offset32 to i64 + %scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta = extractelement %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $1 * + %storeval = extractelement %values, i32 %lane + store $1 %storeval, $1 * %ptrcast + ret void +} + +define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %values, + i32 %lane) nounwind alwaysinline { + %offset64 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta64 = extractelement %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $1 * + %storeval = extractelement %values, i32 %lane + store $1 %storeval, $1 * %ptrcast + ret void +} + +define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, + %offset_delta, %values, + %mask) nounwind alwaysinline { + ;; And use the `per_lane' macro to do all of the per-lane work for scatter... + per_lane(WIDTH, %mask, ` + call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 LANE)') + ret void +} + +define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, + %offset_delta, %values, + %mask) nounwind alwaysinline { + ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
+ per_lane(WIDTH, %mask, ` + call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 LANE)') + ret void +} + +; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s +define void @__scatter32_$1( %ptrs, %values, + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = extractelement %values, i32 LANE + store $1 %val_LANE_ID, $1 * %ptr_LANE_ID + ') + ret void +} + +; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s +define void @__scatter64_$1( %ptrs, %values, + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = extractelement %values, i32 LANE + store $1 %val_LANE_ID, $1 * %ptr_LANE_ID + ') + ret void +} + +' +) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +define(`rdrand_decls', ` +declare i1 @__rdrand_i16(i16 * nocapture) +declare i1 @__rdrand_i32(i32 * nocapture) +declare i1 @__rdrand_i64(i64 * nocapture) +') + +define(`rdrand_definition', ` +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +declare {i16, i32} @llvm.x86.rdrand.16() +declare {i32, i32} @llvm.x86.rdrand.32() +declare {i64, i32} @llvm.x86.rdrand.64() + +define i1 @__rdrand_i16(i16 * %ptr) { + %v = call {i16, i32} @llvm.x86.rdrand.16() + %v0 = extractvalue {i16, i32} %v, 0 + %v1 = extractvalue {i16, i32} %v, 1 + store i16 %v0, i16 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i32(i32 * %ptr) { + %v = call {i32, i32} @llvm.x86.rdrand.32() + %v0 = extractvalue {i32, i32} %v, 0 + %v1 = extractvalue {i32, i32} %v, 1 + store i32 %v0, i32 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i64(i64 * %ptr) { + %v = call {i64, i32} @llvm.x86.rdrand.64() + %v0 = extractvalue {i64, i32} %v, 0 + %v1 = extractvalue {i64, i32} %v, 1 + store i64 %v0, i64 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define 
@__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() +define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') + +;;;;;;;;;;;;;;;;;;;; + +define(`const_vector', `<$1 $2>') +define(`saturation_arithmetic_novec_universal', ` +define @__p$1s_vi8(, ) { + %v0_i16 = sext %0 to + %v1_i16 = sext %1 to + %res = $1 %v0_i16, %v1_i16 + %over_mask = icmp sgt %res, const_vector(i16, 127) + %over_res = select %over_mask, const_vector(i16, 127), %res + %under_mask = icmp slt %res, const_vector(i16, -128) + %ret_i16 = select %under_mask, const_vector(i16, -128), %over_res + %ret = trunc %ret_i16 to + ret %ret +} + +define @__p$1s_vi16(, ) { + %v0_i32 = sext %0 to + %v1_i32 = sext %1 to + %res = $1 %v0_i32, %v1_i32 + %over_mask = icmp sgt %res, const_vector(i32, 32767) + %over_res = select %over_mask, const_vector(i32, 32767), %res + %under_mask = icmp slt %res, const_vector(i32, -32768) + %ret_i32 = select %under_mask, const_vector(i32, -32768), %over_res + %ret = trunc %ret_i32 to + ret %ret +} + +define @__p$1us_vi8(, ) { + %v0_i16 = zext %0 to + %v1_i16 = zext %1 to + %res = $1 %v0_i16, %v1_i16 + %over_mask = icmp ugt %res, const_vector(i16, 255) + %over_res = select %over_mask, const_vector(i16, 255), %res + %under_mask = icmp slt %res, const_vector(i16, 0) + %ret_i16 = select %under_mask, const_vector(i16, 0), %over_res + %ret = trunc %ret_i16 to + ret %ret +} + +define @__p$1us_vi16(, ) { + %v0_i32 = zext %0 to + %v1_i32 = zext %1 to + %res = $1 %v0_i32, %v1_i32 + %over_mask = icmp ugt %res, const_vector(i32, 65535) + %over_res = select %over_mask, const_vector(i32, 65535), %res + %under_mask = icmp slt %res, const_vector(i32, 0) + %ret_i32 = select %under_mask, const_vector(i32, 0), %over_res + %ret = trunc %ret_i32 to + ret %ret +} +') + +define(`saturation_arithmetic_novec', ` +saturation_arithmetic_novec_universal(sub) +saturation_arithmetic_novec_universal(add) +') + +declare void @__pseudo_prefetch_read_varying_1(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_1_native(i8 *, i32, , + ) nounwind + +declare void @__pseudo_prefetch_read_varying_2(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_2_native(i8 *, i32, , + ) nounwind + +declare void @__pseudo_prefetch_read_varying_3(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_3_native(i8 *, i32, , + ) nounwind + +declare void @__pseudo_prefetch_read_varying_nt(, ) nounwind + +declare void 
+@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, , + ) nounwind diff --git a/builtins/util.m4 b/builtins/util.m4 index fda60891..b265add8 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4964,6 +4964,62 @@ declare double @__rcp_uniform_double(double) declare @__rcp_varying_double() ') +define(`declare_nvptx', +` +declare i32 @__program_index() nounwind readnone alwaysinline +declare i32 @__program_count() nounwind readnone alwaysinline +declare i32 @__warp_index() nounwind readnone alwaysinline +declare i32 @__task_index0() nounwind readnone alwaysinline +declare i32 @__task_index1() nounwind readnone alwaysinline +declare i32 @__task_index2() nounwind readnone alwaysinline +declare i32 @__task_index() nounwind readnone alwaysinline +declare i32 @__task_count0() nounwind readnone alwaysinline +declare i32 @__task_count1() nounwind readnone alwaysinline +declare i32 @__task_count2() nounwind readnone alwaysinline +declare i32 @__task_count() nounwind readnone alwaysinline +declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline +declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline +declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline +declare i64 @__movmsk_ptx() nounwind readnone alwaysinline; +') + +define(`global_atomic_varying',` +declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline +') + +define(`global_atomic_cas_varying',` +declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline +') + +global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32) +global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64) +global_atomic_cas_varying(WIDTH, compare_exchange, float, float) +global_atomic_cas_varying(WIDTH, compare_exchange, double, double) + +global_atomic_varying(WIDTH, swap, i32, int32) +global_atomic_varying(WIDTH, swap, i64, int64) +global_atomic_varying(WIDTH, swap, float, float) +global_atomic_varying(WIDTH, swap, double, double) + +global_atomic_varying(WIDTH, add, i32, int32) +global_atomic_varying(WIDTH, sub, i32, int32) +global_atomic_varying(WIDTH, and, i32, int32) +global_atomic_varying(WIDTH, or, i32, int32) +global_atomic_varying(WIDTH, xor, i32, int32) +global_atomic_varying(WIDTH, min, i32, int32) +global_atomic_varying(WIDTH, max, i32, int32) +global_atomic_varying(WIDTH, umin, i32, uint32) +global_atomic_varying(WIDTH, umax, i32, uint32) + +global_atomic_varying(WIDTH, add, i64, int64) +global_atomic_varying(WIDTH, sub, i64, int64) +global_atomic_varying(WIDTH, and, i64, int64) +global_atomic_varying(WIDTH, or, i64, int64) +global_atomic_varying(WIDTH, xor, i64, int64) +global_atomic_varying(WIDTH, min, i64, int64) +global_atomic_varying(WIDTH, max, i64, int64) +global_atomic_varying(WIDTH, umin, i64, uint64) +global_atomic_varying(WIDTH, umax, i64, uint64) define(`transcendetals_decl',` declare float @__log_uniform_float(float) nounwind readnone diff --git a/ctx.cpp b/ctx.cpp index 7abf1e68..f09002c1 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -57,6 +57,10 @@ #include #include #endif +#ifdef ISPC_NVPTX_ENABLED +#include +#include +#endif /* ISPC_NVPTX_ENABLED */ /** This is a small utility structure that records information related to one level of nested control flow. 
It's mostly used in correctly restoring @@ -1383,10 +1387,17 @@ FunctionEmitContext::None(llvm::Value *mask) { llvm::Value * FunctionEmitContext::LaneMask(llvm::Value *v) { +#ifdef ISPC_NVPTX_ENABLED + /* this makes mandelbrot example slower with "nvptx" target. + * Needs further investigation. */ + const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk"; +#else + const char *__movmsk = "__movmsk"; +#endif // Call the target-dependent movmsk function to turn the vector mask // into an i64 value std::vector mm; - m->symbolTable->LookupFunction("__movmsk", &mm); + m->symbolTable->LookupFunction(__movmsk, &mm); if (g->target->getMaskBitCount() == 1) AssertPos(currentPos, mm.size() == 1); else @@ -1398,13 +1409,78 @@ FunctionEmitContext::LaneMask(llvm::Value *v) { return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk")); } +#ifdef ISPC_NVPTX_ENABLED +bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName) +{ + llvm::Type *type = vector->getType(); + if (type == LLVMTypes::Int8VectorType) + funcName += "_int8"; + else if (type == LLVMTypes::Int16VectorType) + funcName += "_int16"; + else if (type == LLVMTypes::Int32VectorType) + funcName += "_int32"; + else if (type == LLVMTypes::Int64VectorType) + funcName += "_int64"; + else if (type == LLVMTypes::FloatVectorType) + funcName += "_float"; + else if (type == LLVMTypes::DoubleVectorType) + funcName += "_double"; + else + return false; + return true; +} + +llvm::Value* +FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar) +{ + std::string funcName = "__insert"; + assert(lAppendInsertExtractName(vector, funcName)); + assert(lane->getType() == LLVMTypes::Int32Type); + + llvm::Function *func = m->module->getFunction(funcName.c_str()); + assert(func != NULL); + std::vector args; + args.push_back(vector); + args.push_back(lane); + args.push_back(scalar); + llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock()); + return ret; +} + +llvm::Value* +FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane) +{ + std::string funcName = "__extract"; + assert(lAppendInsertExtractName(vector, funcName)); + assert(lane->getType() == LLVMTypes::Int32Type); + + llvm::Function *func = m->module->getFunction(funcName.c_str()); + assert(func != NULL); + std::vector args; + args.push_back(vector); + args.push_back(lane); + llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock()); + return ret; +} +#endif /* ISPC_NVPTX_ENABLED */ + llvm::Value * FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + // Compare the two masks to get a vector of i1s + llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + v1, v2, "v1==v2"); + return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? 
*/ + } +#endif /* ISPC_NVPTX_ENABLED */ + #if 0 // Compare the two masks to get a vector of i1s llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - v1, v2, "v1==v2"); + v1, v2, "v1==v2"); // Turn that into a bool vector type (often i32s) cmp = I1VecToBoolVec(cmp); // And see if it's all on @@ -1413,7 +1489,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { llvm::Value *mm1 = LaneMask(v1); llvm::Value *mm2 = LaneMask(v2); return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2, - LLVMGetName("equal", v1, v2)); + LLVMGetName("equal", v1, v2)); #endif } @@ -1421,8 +1497,8 @@ llvm::Value * FunctionEmitContext::ProgramIndexVector(bool is32bits) { llvm::SmallVector array; for (int i = 0; i < g->target->getVectorWidth() ; ++i) { - llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i); - array.push_back(C); + llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i); + array.push_back(C); } llvm::Constant* index = llvm::ConstantVector::get(array); @@ -1430,6 +1506,20 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) { return index; } +#ifdef ISPC_NVPTX_ENABLED +llvm::Value * +FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) { + llvm::Function *func_program_index = m->module->getFunction("__program_index"); + llvm::Value *__program_index = CallInst(func_program_index, NULL, std::vector(), "foreach__program_indexS"); + llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV"); +#if 0 + if (!is32bits) + index = ZExtInst(index, LLVMTypes::Int64VectandType); +#endif + return index; +} +#endif /* ISPC_NVPTX_ENABLED */ + llvm::Value * FunctionEmitContext::GetStringPtr(const std::string &str) { @@ -3555,31 +3645,117 @@ llvm::Value * FunctionEmitContext::LaunchInst(llvm::Value *callee, std::vector &argVals, llvm::Value *launchCount[3]){ - if (callee == NULL) { +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); return NULL; + } + launchedTasks = true; + + AssertPos(currentPos, llvm::isa(callee)); + std::vector argTypes; + + llvm::Function *F = llvm::dyn_cast(callee); + const unsigned int nArgs = F->arg_size(); + llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + for (; I != E; ++I) + argTypes.push_back(I->getType()); + llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes); + llvm::StructType *argStructType = static_cast(st); + llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); + if (structSize->getType() != LLVMTypes::Int64Type) + structSize = ZExtInst(structSize, LLVMTypes::Int64Type, + "struct_size_to_64"); + + const int align = 8; + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); + AssertPos(currentPos, falloc != NULL); + std::vector allocArgs; + allocArgs.push_back(launchGroupHandlePtr); + allocArgs.push_back(structSize); + allocArgs.push_back(LLVMInt32(align)); + llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); + llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64"); + llvm::BasicBlock* if_true = CreateBasicBlock("if_true"); + llvm::BasicBlock* if_false = CreateBasicBlock("if_false"); + + /* check if the pointer returned by ISPCAlloc is not NULL + * -------------- + * this is a workaround for not checking the value of programIndex + * because ISPCAlloc will return NULL pointer for all programIndex > 0 + * of course, if ISPAlloc fails to get parameter buffer, the pointer 
for programIndex = 0 + * will also be NULL + * This check must be added, and also rewrite the code to make it less opaque + */ + llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1"); + BranchInst(if_true, if_false, cmp1); + + /**********************/ + bblock = if_true; + + // label_if_then block: + llvm::Type *pt = llvm::PointerType::getUnqual(st); + llvm::Value *argmem = BitCastInst(voidmem, pt); + for (unsigned int i = 0; i < argVals.size(); ++i) + { + llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); + // don't need to do masked store here, I think + StoreInst(argVals[i], ptr); + } + if (nArgs == argVals.size() + 1) { + // copy in the mask + llvm::Value *mask = GetFullMask(); + llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL, + "funarg_mask"); + StoreInst(mask, ptr); + } + BranchInst(if_false); + + /**********************/ + bblock = if_false; + + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + AssertPos(currentPos, flaunch != NULL); + std::vector args; + args.push_back(launchGroupHandlePtr); + args.push_back(fptr); + args.push_back(voidmem); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); + llvm::Value *ret = CallInst(flaunch, NULL, args, ""); + return ret; + } +#endif /* ISPC_NVPTX_ENABLED */ + + if (callee == NULL) { + AssertPos(currentPos, m->errorCount > 0); + return NULL; } launchedTasks = true; AssertPos(currentPos, llvm::isa(callee)); llvm::Type *argType = - (llvm::dyn_cast(callee))->arg_begin()->getType(); + (llvm::dyn_cast(callee))->arg_begin()->getType(); AssertPos(currentPos, llvm::PointerType::classof(argType)); llvm::PointerType *pt = - llvm::dyn_cast(argType); + llvm::dyn_cast(argType); AssertPos(currentPos, llvm::StructType::classof(pt->getElementType())); llvm::StructType *argStructType = - static_cast(pt->getElementType()); + static_cast(pt->getElementType()); llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); AssertPos(currentPos, falloc != NULL); llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); if (structSize->getType() != LLVMTypes::Int64Type) - // ISPCAlloc expects the size as an uint64_t, but on 32-bit - // targets, SizeOf returns a 32-bit value - structSize = ZExtInst(structSize, LLVMTypes::Int64Type, - "struct_size_to_64"); + // ISPCAlloc expects the size as an uint64_t, but on 32-bit + // targets, SizeOf returns a 32-bit value + structSize = ZExtInst(structSize, LLVMTypes::Int64Type, + "struct_size_to_64"); int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth()); std::vector allocArgs; @@ -3592,17 +3768,17 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, // Copy the values of the parameters into the appropriate place in // the argument block for (unsigned int i = 0; i < argVals.size(); ++i) { - llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); - // don't need to do masked store here, I think - StoreInst(argVals[i], ptr); + llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); + // don't need to do masked store here, I think + StoreInst(argVals[i], ptr); } if (argStructType->getNumElements() == argVals.size() + 1) { - // copy in the mask - llvm::Value *mask = GetFullMask(); - llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL, - "funarg_mask"); - StoreInst(mask, ptr); + // copy in the mask + llvm::Value *mask = GetFullMask(); + llvm::Value *ptr = 
AddElementOffset(argmem, argVals.size(), NULL, + "funarg_mask"); + StoreInst(mask, ptr); } // And emit the call to the user-supplied task launch function, passing @@ -3624,6 +3800,21 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, void FunctionEmitContext::SyncInst() { +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); + llvm::Value *nullPtrValue = + llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); + llvm::Function *fsync = m->module->getFunction("ISPCSync"); + if (fsync == NULL) + FATAL("Couldn't find ISPCSync declaration?!"); + CallInst(fsync, NULL, launchGroupHandle, ""); + StoreInst(nullPtrValue, launchGroupHandlePtr); + return; + } +#endif /* ISPC_NVPTX_ENABLED */ + llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); diff --git a/ctx.h b/ctx.h index 0cfe4549..cd4db7e8 100644 --- a/ctx.h +++ b/ctx.h @@ -302,9 +302,17 @@ public: that indicates whether the two masks are equal. */ llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2); - /** Generate ConstantVector, which contains ProgramIndex, i.e. + /** generate constantvector, which contains programindex, i.e. < i32 0, i32 1, i32 2, i32 3> */ llvm::Value *ProgramIndexVector(bool is32bits = true); +#ifdef ISPC_NVPTX_ENABLED + llvm::Value *ProgramIndexVectorPTX(bool is32bits = true); + + /** Issues a call to __insert_int8/int16/int32/int64/float/double */ + llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar); + /** Issues a call to __extract_int8/int16/int32/int64/float/double */ + llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane); +#endif /** Given a string, create an anonymous global variable to hold its value and return the pointer to the string. */ diff --git a/decl.cpp b/decl.cpp index 2bdb6c10..c915d6b8 100644 --- a/decl.cpp +++ b/decl.cpp @@ -168,6 +168,15 @@ DeclSpecs::GetBaseType(SourcePos pos) const { retType = lApplyTypeQualifiers(typeQualifiers, retType, pos); if (soaWidth > 0) { +#ifdef ISPC_NVPTX_ENABLED +#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */ + if (g->target->getISA() == Target::NVPTX) + { + Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target."); + return NULL; + } +#endif +#endif /* ISPC_NVPTX_ENABLED */ const StructType *st = CastType(retType); if (st == NULL) { @@ -402,6 +411,15 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { return; } +#ifdef ISPC_NVPTX_ENABLED +#if 0 /* NVPTX */ + if (baseType->IsUniformType()) + { + fprintf(stderr, " detected uniform array of size= %d array= %s\n" ,arraySize, + baseType->IsArrayType() ? " true " : " false "); + } +#endif +#endif /* ISPC_NVPTX_ENABLED */ const Type *arrayType = new ArrayType(baseType, arraySize); if (child != NULL) { child->InitFromType(arrayType, ds); diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt index ef8cf6f8..722175ec 100644 --- a/docs/ReleaseNotes.txt +++ b/docs/ReleaseNotes.txt @@ -1,3 +1,38 @@ +=== v1.8.0 === (16 October 2014) + +A major new version of ISPC, which introduces experimental support for NVPTX +target, brings numerous improvements to our KNC (Xeon Phi) support, introduces +debugging support on Windows and fixes several bugs. We also ship experimental +build for Sony PlayStation4 target in this release. Binaries for all platforms +are based on LLVM 3.5. 
+ +Note that MacOS binaries are built for MacOS 10.9 Mavericks. Linux binaries are +compatible with kernel 2.6.32 (ok for RHEL6) and later. + +More details: + +* Experimental NVPTX support is available for users of our binary distribution + on Linux only at the moment. MacOS and Windows users willing to experiment + with this target are welcome to build it from source. Note that the GPU imposes + some limitations on the ISPC language, which are discussed in the corresponding section + of the ISPC User's Guide. Implementation of NVPTX support was done by our + contributor Evghenii Gaburov. + +* KNC support was greatly extended in the knc.h header file. Beyond new features + there are stability fixes and changes for icc 15.0 compatibility. Stdlib + prefetch functions were improved to map to KNC vector prefetches. + +* The PS4 experimental build is a Windows-to-PS4 cross compiler, which disables arch + and cpu selection (these are preset to the PS4 hardware). + +* Debug info support on Windows (compatible with VS2010, VS2012 and VS2013). + +* Critical bug fix for an issue that, under some conditions, caused code generation + for an incorrect target despite explicit target switches. + +* Stability fix for a bug that caused the print() function to execute under + an all-off mask under some conditions. + === v1.7.0 === (18 April 2014) A major new version of ISPC with several language and library extensions and diff --git a/docs/ispc.rst b/docs/ispc.rst index a2cf2a95..d854b309 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -178,6 +178,13 @@ Contents: + `Data Alignment and Aliasing`_ + `Restructuring Existing Programs to Use ISPC`_ +* `Experimental support for PTX`_ + + + `Overview`_ + + `Compiling For The NVIDIA Kepler GPU`_ + + `Hints`_ + + `Limitations & known issues`_ + * `Disclaimer and Legal Information`_ * `Optimization Notice`_ @@ -4936,6 +4943,129 @@ program instances improves performance. .. _ispc Performance Tuning Guide: http://ispc.github.com/perfguide.html +Experimental support for PTX +============================ +``ispc`` provides experimental support for PTX code generation, which currently +targets NVIDIA GPUs with compute capability >=3.5 [Kepler GPUs with support for +dynamic parallelism]. Due to its nature, the PTX backend currently imposes +several restrictions on the ``ispc`` program, which are described below. + +Overview +-------- +SPMD programming in ``ispc`` is similar to warp-synchronous CUDA programming. +Namely, program instances in a gang are the equivalent of CUDA threads in a single +warp. Hence, to run efficiently on a GPU, an ``ispc`` program must use the tasking +functionality via the ``launch`` keyword to ensure that multiple warps are +executed concurrently on the GPU. + +``export`` functions are equipped with a CUDA C wrapper which schedules a +single warp--a thread-block with a total of 32 threads. In contrast to CPU +programming, this exported function, either directly or indirectly, should +use the ``launch`` keyword to schedule work on the GPU. + +At the PTX level, the ``launch`` keyword is mapped to CUDA Dynamic Parallelism and +it schedules a grid of thread-blocks, each 4 warps wide (128 threads). As a +result, ``ispc`` has a tasking granularity of 4 tasks with the PTX target; this +restriction will be eliminated in the future. + +When passing pointers to an ``export`` function, it is important that they +remain legal when they are accessed from the GPU. Prior to CUDA 6.0, such a pointer had to +hold an address that is only accessible from the GPU.
With the release of + CUDA 6.0, it is possible to pass a pointer to unified memory allocated with +``cudaMallocManaged``. The examples provide rudimentary wrapper functions that +call the CUDA API for managed memory allocations, allowing programmers to avoid +explicit memory copies. A minimal host-side sketch of this allocation-and-call +pattern accompanies the aobench example changes below. + + + +Compiling For The NVIDIA Kepler GPU +----------------------------------- +Compilation for an NVIDIA Kepler GPU is a multi-step procedure. + +First, we need to generate LLVM assembly from the ``ispc`` source file (``ispc`` +generates LLVM assembly instead of bitcode when the ``nvptx`` target is chosen): + +:: + + $ISPC_HOME/ispc foo.ispc --emit-llvm --target=nvptx -o foo.ll + + +This LLVM assembly can immediately be compiled into PTX with the help of the +``ptxgen`` tool; this tool uses ``libNVVM``, which is a part of the CUDA Toolkit. + +:: + + $ISPC_HOME/ptxtools/ptxgen --use_fast_math foo.ll -o foo.ptx + +.. If ``ispc`` is compiled with LLVM >3.2, the resulting bitcode must first be +.. decompiled with the ``llvm-dis`` from LLVM 3.2 distribution; this "trick" is +.. required to generate an IR compatible with libNVVM: + +.. :: +.. +.. $LLVM32/bin/llvm-dis foo.bc -o foo.ll +.. $ISPC_HOME/ptxtools/ptxgen --use_fast_math foo.ll -o foo.ptx + +This PTX is ready for execution on a GPU, for example via the CUDA +Driver API. Alternatively, we also provide a simple ``ptxcc`` tool, which +compiles the resulting PTX code into an object file: + +:: + + $ISPC_HOME/ptxtools/ptxcc foo.ptx -o foo_cu.o -Xnvcc="--maxrregcount=64 + -Xptxas=-v" + +This object file can be linked with the main program via ``nvcc``: + +:: + + nvcc foo_cu.o foo_main.o -o foo + + +Hints +----- +- ``uniform`` arrays in a function scope are statically allocated in + ``__shared__`` memory, with all ensuing consequences. For example, if more + than the available shared memory per SMX is allocated, a link-time or runtime error will occur +- If ``uniform`` arrays of large size are desired, we recommend using + ``uniform new uniform T[size]`` for their allocation, ideally outside the + tasking function (see ``deferred/kernels.ispc`` in the deferred shading example) + +The examples that produce executables for CPU, Xeon Phi and Kepler GPU demonstrate +several tuning approaches that can benefit GPU performance. +``ispc`` may also generate performance warnings that, if followed, may improve +GPU application performance. + +Limitations & known issues +-------------------------- +Due to its experimental form, PTX code generation is known to impose several +limitations on the ``ispc`` program, which are documented in the following list: + +- Must use ``ispc`` tasking functionality to run efficiently on the GPU +- Must use ``new/delete`` and/or ``ispc_malloc``/``ispc_free``/``ispc_memset``/``ispc_memcpy`` to allocate/free/set/copy memory that is visible to the GPU +- ``export`` functions must have ``void`` return type.
+- ``task``/``export`` functions do not accept varying data-types +- ``new``/``delete`` currently only works with ``uniform`` data-types +- ``aossoa``/``soaaos`` is not yet supported +- ``sizeof(varying)`` is not yet supported +- Function pointers do not work yet (may or may not generate a compilation failure) +- ``memset``/``memcpy``/``memmove`` is not yet supported +- ``uniform`` arrays in global scope are mapped to global memory +- ``varying`` arrays in global scope are not yet supported +- ``uniform`` arrays in local scope are mapped to shared memory +- ``varying`` arrays in local scope are mapped to local memory +- ``const uniform/varying`` arrays are mapped to local memory +- ``const static uniform`` arrays are mapped to constant memory +- ``const static varying`` arrays are mapped to global memory +- ``static`` data types in local scope are not allowed; compilation will fail +- Best performance is obtained with libNVVM (the LLVM PTX backend can also be used, but it requires libdevice.compute_35.10.bc that comes with libNVVM) + + +There are likely more, which, together with some of the above-mentioned +issues, will be fixed in due time. + + + Disclaimer and Legal Information ================================ diff --git a/docs/news.rst b/docs/news.rst index 80c3e8b3..cc29b39b 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -2,10 +2,19 @@ ispc News ========= +ispc 1.8.0 is Released +---------------------- + +A major new version of ``ispc``, which introduces experimental support for NVPTX +target, brings numerous improvements to our KNC (Xeon Phi) support, introduces +debugging support on Windows and fixes several bugs. We also ship experimental +build for Sony PlayStation4 target in this release. Binaries for all platforms +are based on LLVM 3.5. + ispc 1.7.0 is Released ---------------------- -A major new version of ISPC with several language and library extensions and +A major new version of ``ispc`` with several language and library extensions and fixes in debug info support. Binaries for all platforms are based on patched version on LLVM 3.4. There also performance improvements beyond switchover to LLVM 3.4. diff --git a/doxygen.cfg b/doxygen.cfg index 39e41fb3..c09a11f1 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.7.1dev +PROJECT_NUMBER = 1.8.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put.
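The aobench files added below illustrate the host-side pattern that the PTX documentation above describes: memory visible to the GPU is allocated up front, and an exported ``ispc`` function (which internally uses ``launch``) is then called from ordinary host code. The following sketch shows that flow using CUDA managed memory directly; it is only an illustration, and the exported function ``ispc::scale_ispc_tasks``, its header ``scale_ispc.h``, and its signature are hypothetical rather than part of this patch::

    #include <cuda_runtime.h>
    #include "scale_ispc.h"   // hypothetical header generated by ispc for scale.ispc

    int main() {
        const int n = 1 << 20;
        float *data = NULL;
        // Unified (managed) memory is visible to both the CPU and the GPU
        // (CUDA >= 6.0), so no explicit cudaMemcpy calls are needed.
        cudaMallocManaged(&data, n * sizeof(float));
        for (int i = 0; i < n; i++)
            data[i] = (float)i;

        // Exported ispc function; internally it should use "launch" so that
        // enough warps run concurrently on the GPU.
        ispc::scale_ispc_tasks(data, n, 2.0f);

        // Make sure all GPU work has completed before the host reads the results.
        cudaDeviceSynchronize();

        // data[i] now holds the scaled values (per the hypothetical kernel).
        cudaFree(data);
        return 0;
    }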
diff --git a/examples/portable/aobench/.gitignore b/examples/portable/aobench/.gitignore new file mode 100644 index 00000000..ad080f43 --- /dev/null +++ b/examples/portable/aobench/.gitignore @@ -0,0 +1,2 @@ +ao +*.ppm diff --git a/examples/portable/aobench/Makefile_cpu b/examples/portable/aobench/Makefile_cpu new file mode 100644 index 00000000..6dd16131 --- /dev/null +++ b/examples/portable/aobench/Makefile_cpu @@ -0,0 +1,8 @@ + +EXAMPLE=ao +CPP_SRC=ao.cpp +ISPC_SRC=ao.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon + +include ../common_cpu.mk diff --git a/examples/portable/aobench/Makefile_knc b/examples/portable/aobench/Makefile_knc new file mode 100644 index 00000000..fe889a26 --- /dev/null +++ b/examples/portable/aobench/Makefile_knc @@ -0,0 +1,7 @@ +EXAMPLE=ao +CXX_SRC=ao.cpp +ISPC_SRC=ao.ispc +ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h +ISPC_TARGET=generic-16 + +include ../common_knc.mk diff --git a/examples/portable/aobench/Makefile_ptx b/examples/portable/aobench/Makefile_ptx new file mode 100644 index 00000000..fb390eb1 --- /dev/null +++ b/examples/portable/aobench/Makefile_ptx @@ -0,0 +1,14 @@ +PROG=ao +ISPC_SRC=ao.ispc +CU_SRC=ao.cu +CXX_SRC=ao.cpp +PTXCC_REGMAX=64 +#ISPC_FLAGS= --opt=disable-uniform-control-flow + +#LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_ptx.mk + + + diff --git a/examples/portable/aobench/ao.cpp b/examples/portable/aobench/ao.cpp new file mode 100644 index 00000000..b8dfcac7 --- /dev/null +++ b/examples/portable/aobench/ao.cpp @@ -0,0 +1,152 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#ifdef __linux__ +#include +#endif +#include +#include +#include +#include +#include + +#include "ao_ispc.h" + +#include "timing.h" +#include "ispc_malloc.h" + +#define NSUBSAMPLES 2 + +static unsigned int test_iterations[] = {3, 7, 1}; +static unsigned int width, height; +static unsigned char *img; +static float *fimg; + + +static unsigned char +clamp(float f) +{ + int i = (int)(f * 255.5); + + if (i < 0) i = 0; + if (i > 255) i = 255; + + return (unsigned char)i; +} + + +static void +savePPM(const char *fname, int w, int h) +{ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + img[3 * (y * w + x) + 0] = clamp(fimg[3 *(y * w + x) + 0]); + img[3 * (y * w + x) + 1] = clamp(fimg[3 *(y * w + x) + 1]); + img[3 * (y * w + x) + 2] = clamp(fimg[3 *(y * w + x) + 2]); + } + } + + FILE *fp = fopen(fname, "wb"); + if (!fp) { + perror(fname); + exit(1); + } + + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", w, h); + fprintf(fp, "255\n"); + fwrite(img, w * h * 3, 1, fp); + fclose(fp); + printf("Wrote image file %s\n", fname); +} + + +int main(int argc, char **argv) +{ + if (argc < 3) { + printf ("%s\n", argv[0]); + printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n"); + getchar(); + exit(-1); + } + else { + if (argc == 6) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[3 + i]); + } + } + width = atoi (argv[1]); + height = atoi (argv[2]); + } + + // Allocate space for output images + img = new unsigned char[width * height * 3]; + fimg = new float[width * height * 3]; + + // + // Run the ispc + tasks path, test_iterations times, and report the + // minimum time for any of them. + // + double minTimeISPCTasks = 1e30; + for (unsigned int i = 0; i < test_iterations[1]; i++) { + ispc_memset(fimg, 0, sizeof(float) * width * height * 3); + assert(NSUBSAMPLES == 2); + + reset_and_start_timer(); + ispc::ao_ispc_tasks(width, height, NSUBSAMPLES, fimg); + double t = get_elapsed_msec(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", t); + minTimeISPCTasks = std::min(minTimeISPCTasks, t); + } + + // Report results and save image + printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n", + minTimeISPCTasks, width, height); + savePPM("ao-ispc-tasks.ppm", width, height); + + delete img; + delete fimg; + + return 0; +} diff --git a/examples/portable/aobench/ao.cu b/examples/portable/aobench/ao.cu new file mode 100644 index 00000000..aaca3e88 --- /dev/null +++ b/examples/portable/aobench/ao.cu @@ -0,0 +1,447 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench + */ + +#include "cuda_helpers.cuh" + +#define NAO_SAMPLES 8 +//#define M_PI 3.1415926535f + +#define vec Float3 +struct Float3 +{ + float x,y,z; + + __device__ friend Float3 operator+(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x+b.x; + c.y = a.y+b.y; + c.z = a.z+b.z; + return c; + } + __device__ friend Float3 operator-(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x-b.x; + c.y = a.y-b.y; + c.z = a.z-b.z; + return c; + } + __device__ friend Float3 operator/(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x/b.x; + c.y = a.y/b.y; + c.z = a.z/b.z; + return c; + } + __device__ friend Float3 operator/(const float a, const Float3 b) + { + Float3 c; + c.x = a/b.x; + c.y = a/b.y; + c.z = a/b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x*b.x; + c.y = a.y*b.y; + c.z = a.z*b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const float b) + { + Float3 c; + c.x = a.x*b; + c.y = a.y*b; + c.z = a.z*b; + return c; + } +}; + +/////////////////////////////////////////////////////////////////////////// +// RNG stuff + +struct RNGState { + unsigned int z1, z2, z3, z4; +}; + +__device__ +static inline unsigned int random(RNGState * state) +{ + unsigned int b; + + b = ((state->z1 << 6) ^ state->z1) >> 13; + state->z1 = ((state->z1 & 4294967294U) << 18) ^ b; + b = ((state->z2 << 2) ^ state->z2) >> 27; + state->z2 = ((state->z2 & 4294967288U) << 2) ^ b; + b = ((state->z3 << 13) ^ state->z3) >> 21; + state->z3 = ((state->z3 & 4294967280U) << 7) ^ b; + b = ((state->z4 << 3) ^ state->z4) >> 12; + state->z4 = ((state->z4 & 4294967168U) << 13) ^ b; + return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4); +} + + +__device__ +static inline float frandom(RNGState * state) +{ + unsigned int irand = random(state); + irand &= (1ul<<23)-1; + return __int_as_float(0x3F800000 | irand)-1.0f; +} + +__device__ +static inline void seed_rng(RNGState * state, + unsigned int seed) { + state->z1 = seed; + state->z2 = seed ^ 0xbeeff00d; + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); +} + + + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec dir; +}; + +__device__ +static inline float dot(vec a, vec b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +__device__ +static inline vec vcross(vec v0, vec v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z 
= v0.x * v1.y - v0.y * v1.x; + return ret; +} + +__device__ +static inline void vnormalize(vec &v) { + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v = v*invlen; +} + + +__device__ +static inline void +ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + +#if 0 + if (abs(v) < 1.0f-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +#else + if (abs(v) <= 1.0e-17) + return; + float t = -(dot(ray.org, plane.n) + d) / v; + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } +#endif +} + + +__device__ +static inline void +ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + +#if 0 + if (D > 0.) { + float t = -B - sqrt(D); + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +#else + if (D <= 0.0f) + return; + + float t = -B - sqrt(D); + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } +#endif + +} + + +__device__ +static inline void +orthoBasis(vec basis[3], vec n) { + basis[2] = n; + basis[1].x = 0.0f; basis[1].y = 0.0f; basis[1].z = 0.0f; + + if ((n.x < 0.6f) && (n.x > -0.6f)) { + basis[1].x = 1.0f; + } else if ((n.y < 0.6f) && (n.y > -0.6f)) { + basis[1].y = 1.0f; + } else if ((n.z < 0.6f) && (n.z > -0.6f)) { + basis[1].z = 1.0f; + } else { + basis[1].x = 1.0f; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +__device__ +static inline float +ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3], + RNGState &rngstate) { + float eps = 0.0001f; + vec p; //, n; + vec basis[3]; + float occlusion = 0.0f; + + p = isect.p + isect.n * eps; + + orthoBasis(basis, isect.n); + + const int ntheta = NAO_SAMPLES; + const int nphi = NAO_SAMPLES; + for ( int j = 0; j < ntheta; j++) { + for ( int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrt(frandom(&rngstate)); + float phi = 2.0f * M_PI * frandom(&rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrtf(1.0f - theta * theta); + + // local . global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0f+17; + occIsect.hit = 0; + + for ( int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0f; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. 
+ */ +__device__ +static inline void ao_tiles( + int x0, int x1, + int y0, int y1, + int w, int h, + int nsubsamples, + float image[]) +{ + const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + const Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; + + seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15))); + float invSamples = 1.f / nsubsamples; + for ( int y = y0; y < y1; y++) + for ( int x = programIndex+x0; x < x1; x += programCount) + { + const int offset = 3 * (y * w + x); + float res = 0.0f; + + for ( int u = 0; u < nsubsamples; u++) + for ( int v = 0; v < nsubsamples; v++) + { + float du = (float)u * invSamples, dv = (float)v * invSamples; + + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org.x = 0.0f; + ray.org.y = 0.0f; + ray.org.z = 0.0f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for ( int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + if (any(isect.hit)) { + ret = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate); + ret *= invSamples * invSamples; + res += ret; + } + } + + if (x < x1) + { + image[offset ] = res; + image[offset+1] = res; + image[offset+2] = res; + } + } +} + + + +#define TILEX 64 +#define TILEY 4 + +extern "C" +__global__ +void ao_task( int width, int height, + int nsubsamples, float image[]) +{ + if (taskIndex0 >= taskCount0) return; + if (taskIndex1 >= taskCount1) return; + + const int x0 = taskIndex0 * TILEX; + const int x1 = min(x0 + TILEX, width); + + const int y0 = taskIndex1 * TILEY; + const int y1 = min(y0 + TILEY, height); + ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image); +} + +extern "C" +__global__ +void ao_ispc_tasks___export( + int w, int h, int nsubsamples, + float image[]) +{ + const int ntilex = (w+TILEX-1)/TILEX; + const int ntiley = (h+TILEY-1)/TILEY; + launch(ntilex,ntiley,1,ao_task)(w,h,nsubsamples,image); + cudaDeviceSynchronize(); +} + +extern "C" +__host__ void ao_ispc_tasks( + int w, int h, int nsubsamples, + float image[]) +{ + ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image); + cudaDeviceSynchronize(); +} diff --git a/examples/portable/aobench/ao.ispc b/examples/portable/aobench/ao.ispc new file mode 100644 index 00000000..6efa9d55 --- /dev/null +++ b/examples/portable/aobench/ao.ispc @@ -0,0 +1,340 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench +*/ + +#define NAO_SAMPLES 8 +#define M_PI 3.1415926535f + +typedef float<3> vec; + +#if 1 +#define __inline inline +#else +#define __inline +#endif + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec dir; +}; + +static inline float dot(vec a, vec b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +static inline vec vcross(vec v0, vec v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +static inline void vnormalize(vec &v) { + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v *= invlen; +} + + +__inline +static void +ray_plane_intersect(Isect &isect, Ray &ray, const Plane &plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + +#if 0 + cif (abs(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +#else + cif (abs(v) <= 1.0e-17) + return; + float t = -(dot(ray.org, plane.n) + d) / v; + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } +#endif +} + + +static inline void +ray_sphere_intersect(Isect &isect, Ray &ray, const Sphere &sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + +#if 0 + cif (D > 0.) 
{ + float t = -B - sqrt(D); + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +#else + cif (D <=0.0f) + return; + + float t = -B - sqrt(D); + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } +#endif +} + + +__inline +static void +orthoBasis(vec basis[3], vec n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +__inline +static float +ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3], + RNGState &rngstate) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + eps * isect.n; + + orthoBasis(basis, isect.n); + + static const uniform int ntheta = NAO_SAMPLES; + static const uniform int nphi = NAO_SAMPLES; + for (uniform int j = 0; j < ntheta; j++) { + for (uniform int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrt(frandom(&rngstate)); + float phi = 2.0f * M_PI * frandom(&rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrt(1.0 - theta * theta); + + // local . global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17; + occIsect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + +static inline void ao_tiles( + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int w, uniform int h, + uniform int nsubsamples, + uniform float image[]) +{ + const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + const Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; + + seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15))); + float invSamples = 1.f / nsubsamples; + foreach_tiled (y = y0 ... y1, x = x0 ... 
x1) + { + const int offset = 3 * (y * w + x); + float res = 0.0f; + + for (uniform int u = 0; u < nsubsamples; u++) + for (uniform int v = 0; v < nsubsamples; v++) + { + float du = (float)u * invSamples, dv = (float)v * invSamples; + + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = 0.f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene +#if 0 + cif (isect.hit) { + ret = ambient_occlusion(isect, plane, spheres, rngstate); + ret *= invSamples * invSamples; + res += ret; + } +#else + if(any(isect.hit)) + { + ret = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate); + ret *= invSamples * invSamples; + res += ret; + } +#endif + } + + image[offset ] = res; + image[offset+1] = res; + image[offset+2] = res; + } +} + +#define TILEX max(64,programCount*2) +#define TILEY 4 + +export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) { + const uniform int x0 = 0; + const uniform int x1 = w; + const uniform int y0 = 0; + const uniform int y1 = h; + ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image); +} + +void task ao_task(uniform int width, uniform int height, + uniform int nsubsamples, uniform float image[]) +{ + if (taskIndex0 >= taskCount0) return; + if (taskIndex1 >= taskCount1) return; + + const uniform int x0 = taskIndex0 * TILEX; + const uniform int x1 = min(x0 + TILEX, width); + + const uniform int y0 = taskIndex1 * TILEY; + const uniform int y1 = min(y0 + TILEY, height); + ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image); +} + + +export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) +{ + const uniform int ntilex = (w+TILEX-1)/TILEX; + const uniform int ntiley = (h+TILEY-1)/TILEY; + launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image); + sync; +} diff --git a/examples/portable/common_cpu.mk b/examples/portable/common_cpu.mk new file mode 100644 index 00000000..76927848 --- /dev/null +++ b/examples/portable/common_cpu.mk @@ -0,0 +1,122 @@ + +TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp +TASK_LIB=-lpthread +TASK_OBJ=objs/omp_tasksys.o objs/ispc_malloc.o + +CXX=clang++ +CXX=icc -openmp +CXXFLAGS+=-Iobjs/ -O2 -I../../ -I../../util +CXXFLAGS+=-DISPC_USE_OMP +CC=clang +CC=icc -openmp +CCFLAGS+=-Iobjs/ -O2 -I../../ -I../../util +CCFLAGS+=-DISPC_USE_OMP + +LIBS=-lm $(TASK_LIB) -lstdc++ +ISPC=ispc +ISPC_FLAGS+=-O2 +ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) + +ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) + +ifeq ($(ARCH),x86) + ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o) + COMMA=, + ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS))) + #$(info multi-target detected: $(ISPC_IA_TARGETS)) + ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o) + endif + ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o) + endif + ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, 
$(ISPC_SRC:.ispc=)_ispc_avx.o) + endif + ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o) + endif + ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o) + endif + endif + ISPC_TARGETS=$(ISPC_IA_TARGETS) + ARCH_BIT:=$(shell getconf LONG_BIT) + ifeq ($(ARCH_BIT),32) + ISPC_FLAGS += --arch=x86 + CXXFLAGS += -m32 + CCFLAGS += -m32 + else + ISPC_FLAGS += --arch=x86-64 + CXXFLAGS += -m64 + CCFLAGS += -m64 + endif +else ifeq ($(ARCH),arm) + ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) + ISPC_TARGETS=$(ISPC_ARM_TARGETS) +else + $(error Unknown architecture $(ARCH) from uname -m) +endif + +CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o)) +CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o)) +OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS) + +default: $(EXAMPLE) + +all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +objs/%.cpp objs/%.o objs/%.h: dirs + +clean: + /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test + +$(EXAMPLE): $(OBJS) + $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) + +objs/%.o: %.cpp dirs $(ISPC_HEADER) + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/%.o: %.c dirs $(ISPC_HEADER) + $(CC) $< $(CCFLAGS) -c -o $@ + +objs/%.o: ../%.cpp dirs + $(CXX) $< $(CXXFLAGS) -c -o $@ +objs/%.o: ../../%.cpp dirs + $(CXX) $< $(CXXFLAGS) -c -o $@ +objs/%.o: ../../util/%.cpp dirs + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs + +objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc dirs + $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h + +objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h + +objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp + $(CXX) -I../../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@ + +$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o + $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) + +objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h + +objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp + $(CXX) -I../../intrinsics $< $(CXXFLAGS) -c -o $@ + +$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o + $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) + +objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1 + +$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o + $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) diff --git a/examples/portable/common_knc.mk b/examples/portable/common_knc.mk new file mode 100644 index 00000000..5335dd7b --- /dev/null +++ b/examples/portable/common_knc.mk @@ -0,0 +1,52 @@ +TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp +TASK_OBJ=objs_knc/omp_tasksys.o objs_knc/ispc_malloc.o +TASK_LIB=-openmp + +CXX=icc -openmp -mmic +CXXFLAGS+=-Iobjs_knc/ -O2 -I../../ -I../../util -I./ +CXXFLAGS+= -DISPC_USE_OMP +CC=icc -openmp -mmic +CCFLAGS+= -Iobjs_knc/ -O2 -I../../ -I../../util -I./ +CCFLAGS+=-DISPC_USE_OMP + +LD=icc -mmic -openmp + +LIBS=-lm $(TASK_LIB) -lstdc++ +ISPC=ispc +ISPC_FLAGS+=-O2 +ISPC_FLAGS+= --target=$(ISPC_TARGET) --c++-include-file=$(ISPC_INTRINSICS) + +ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_knc/%_ispc.h) 
+ISPC_OBJ=$(ISPC_SRC:%.ispc=objs_knc/%_ispc.o) +CXX_OBJ=$(CXX_SRC:%.cpp=objs_knc/%.o) +CXX_OBJ+=$(TASK_OBJ) + +PROG=$(EXAMPLE)_knc + +all: dirs $(PROG) + +dirs: + /bin/mkdir -p objs_knc/ + +objs_knc/%.cpp objs_knc/%.o objs_knc/%.h: dirs + +clean: + /bin/rm -rf $(PROG) objs_knc + +$(PROG): $(ISPC_OBJ) $(CXX_OBJ) + $(LD) -o $@ $^ $(LDFLAGS) + +objs_knc/%.o: %.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + +objs_knc/%.o: ../%.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< +objs_knc/%.o: ../../%.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< +objs_knc/%.o: ../../util/%.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + +objs_knc/%_ispc.o: %.ispc + $(ISPC) $(ISPC_FLAGS) --emit-c++ -o objs_knc/$*_ispc_zmm.cpp -h objs_knc/$*_ispc.h $< + $(CXX) $(CXXFLAGS) -o $@ objs_knc/$*_ispc_zmm.cpp -c + diff --git a/examples/portable/common_ptx.mk b/examples/portable/common_ptx.mk new file mode 100644 index 00000000..cfaa0b02 --- /dev/null +++ b/examples/portable/common_ptx.mk @@ -0,0 +1,136 @@ +NVCC_SRC=../../util/nvcc_helpers.cu +NVCC_OBJS=objs_ptx/nvcc_helpers_nvcc.o +# +CXX=g++ -ffast-math +CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_ptx/ -D_CUDA_ -I../../util -I../../ +# +NVCC=nvcc +NVCC_FLAGS+=-O3 -arch=sm_35 -D_CUDA_ -I../../util -Xptxas=-v -Iobjs_ptx/ +ifdef PTXCC_REGMAX + NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX) +endif +NVCC_FLAGS+=--use_fast_math +# +LD=nvcc +LDFLAGS=-lcudart -lcudadevrt -arch=sm_35 +# +PTXCC=$(ISPC_HOME)/ptxtools/ptxcc +PTXCC_FLAGS+= -Xptxas=-v +ifdef PTXCC_REGMAX + PTXCC_FLAGS += -maxrregcount=$(PTXCC_REGMAX) +endif + +# +ISPC=$(ISPC_HOME)/ispc +ISPC_FLAGS+=-O3 --math-lib=fast --target=nvptx --opt=fast-math +# +# +# +ISPC_LLVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.o) +ISPC_NVVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.o) +#ISPC_BCS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.bc) +ISPC_LLS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.ll) +ISPC_LLVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.ptx) +ISPC_NVVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.ptx) +ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.h) +CXX_OBJS=$(CXX_SRC:%.cpp=objs_ptx/%_gcc.o) +CU_OBJS=$(CU_SRC:%.cu=objs_ptx/%_cu.o) +#NVCC_OBJS=$(NVCC_SRC:%.cu=objs_ptx/%_nvcc.o) + +CXX_SRC+=ispc_malloc.cpp +CXX_OBJS+=objs_ptx/ispc_malloc_gcc.o + +PTXGEN = $(ISPC_HOME)/ptxtools/ptxgen +PTXGEN += --use_fast_math + +#LLVM32=$(HOME)/usr/local/llvm/bin-3.2 +#LLVM32DIS=$(LLVM32)/bin/llvm-dis + +LLC=$(LLVM_ROOT)/bin/llc +LLC_FLAGS=-march=nvptx64 -mcpu=sm_35 + +# .SUFFIXES: .bc .o .cu .ll + +ifdef LLVM_GPU + OBJSptx_llvm=$(ISPC_LLVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS) + PROGptx_llvm=$(PROG)_llvm_ptx +else + ISPC_LLVM_PTX= +endif + + +ifdef NVVM_GPU + OBJSptx_nvvm=$(ISPC_NVVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS) $(ISPC_LVVM_PTX) + PROGptx_nvvm=$(PROG)_nvvm_ptx +else + ISPC_NVVM_PTX= +endif + +ifdef CU_SRC + OBJScu=$(CU_OBJS) $(CXX_OBJS) $(NVCC_OBJS) + PROGcu=$(PROG)_cu +endif + + +all: dirs \ + $(PROGptx_nvvm) \ + $(PROGptx_llvm) \ + $(PROGcu) $(ISPC_BCS) $(ISPC_LLS) $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX) + +dirs: + /bin/mkdir -p objs_ptx/ + +objs_ptx/%.cpp objs_ptx/%.o objs_ptx/%.h: dirs + +clean: + /bin/rm -rf $(PROGptx_nvvm) $(PROGptx_llvm) $(PROGcu) objs_ptx + +# generate binaries +$(PROGptx_llvm): $(OBJSptx_llvm) + $(LD) -o $@ $^ $(LDFLAGS) +$(PROGptx_nvvm): $(OBJSptx_nvvm) + $(LD) -o $@ $^ $(LDFLAGS) +$(PROGcu): $(OBJScu) + $(LD) -o $@ $^ $(LDFLAGS) + +# compile C++ code +objs_ptx/%_gcc.o: %.cpp $(ISPC_HEADERS) + $(CXX) $(CXXFLAGS) -o $@ -c $< +objs_ptx/%_gcc.o: ../../util/%.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + +# CUDA helpers +objs_ptx/%_cu.o: %.cu $(ISPC_HEADERS) + $(NVCC) 
$(NVCC_FLAGS) -o $@ -dc $< + +# compile CUDA code +objs_ptx/%_nvcc.o: ../../util/%.cu + $(NVCC) $(NVCC_FLAGS) -o $@ -c $< +objs_ptx/%_nvcc.o: %.cu + $(NVCC) $(NVCC_FLAGS) -o $@ -c $< + +# compile ISPC to LLVM BC +#objs_ptx/%_ispc.h objs_ptx/%_ispc.bc: %.ispc +# $(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.bc $< +objs_ptx/%_ispc.h objs_ptx/%_ispc.ll: %.ispc + $(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.ll $< + +# generate PTX from LLVM BC +#objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.bc +# $(LLC) $(LLC_FLAGS) -o $@ $< +objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.ll + $(LLC) $(LLC_FLAGS) -o $@ $< +#objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.bc +# $(LLVM32DIS) $< -o objs_ptx/$*_ispc-ll32.ll +# $(PTXGEN) objs_ptx/$*_ispc-ll32.ll -o $@ +objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.ll + $(PTXGEN) $< -o $@ + +# generate an object file from PTX +objs_ptx/%_ispc.o: objs_ptx/%_ispc.ptx + $(PTXCC) $< -Xnvcc="$(PTXCC_FLAGS)" -o $@ + + + + + diff --git a/examples/portable/deferred/Makefile_cpu b/examples/portable/deferred/Makefile_cpu new file mode 100644 index 00000000..b9900224 --- /dev/null +++ b/examples/portable/deferred/Makefile_cpu @@ -0,0 +1,10 @@ + +EXAMPLE=deferred_shading +CPP_SRC=common.cpp main.cpp dynamic_c.cpp +# CPP_SRC+=dynamic_cilk.cpp +ISPC_SRC=kernels.ispc +ISPC_IA_TARGETS=avx1-i32x16 +ISPC_ARM_TARGETS=neon +ISPC_FLAGS=--opt=fast-math + +include ../common_cpu.mk diff --git a/examples/portable/deferred/Makefile_knc b/examples/portable/deferred/Makefile_knc new file mode 100644 index 00000000..ed7ce137 --- /dev/null +++ b/examples/portable/deferred/Makefile_knc @@ -0,0 +1,8 @@ +EXAMPLE=deferred_shading +CXX_SRC=common.cpp main.cpp dynamic_c.cpp +ISPC_SRC=kernels.ispc +ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h +ISPC_TARGET=generic-16 +ISPC_FLAGS=--opt=fast-math + +include ../common_knc.mk diff --git a/examples/portable/deferred/Makefile_ptx b/examples/portable/deferred/Makefile_ptx new file mode 100644 index 00000000..58385e59 --- /dev/null +++ b/examples/portable/deferred/Makefile_ptx @@ -0,0 +1,13 @@ +PROG=deferred_shading +ISPC_SRC=kernels.ispc +CU_SRC=kernels.cu +CXX_SRC=common.cpp main.cpp +PTXCC_REGMAX=64 + +NVVM_GPU=1 +#LLVM_GPU=1 + +include ../common_ptx.mk + + + diff --git a/examples/portable/deferred/common.cpp b/examples/portable/deferred/common.cpp new file mode 100644 index 00000000..9a2a5c77 --- /dev/null +++ b/examples/portable/deferred/common.cpp @@ -0,0 +1,222 @@ +/* + Copyright (c) 2011-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ISPC_IS_WINDOWS + #define WIN32_LEAN_AND_MEAN + #include +#endif +#ifdef ISPC_IS_LINUX + #include +#endif +#include "deferred.h" +#include "timing.h" +#include "ispc_malloc.h" + +/////////////////////////////////////////////////////////////////////////// + +static void * +lAlignedMalloc(size_t size, int32_t alignment) { +#ifndef _CUDA_ +#ifdef ISPC_IS_WINDOWS + return _aligned_malloc(size, alignment); +#endif +#ifdef ISPC_IS_LINUX + return memalign(alignment, size); +#endif +#ifdef ISPC_IS_APPLE + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +#endif +#else + void *ptr; + ispc_malloc(&ptr, size); + return ptr; +#endif + +} + + +static void +lAlignedFree(void *ptr) { +#ifndef _CUDA_ +#ifdef ISPC_IS_WINDOWS + _aligned_free(ptr); +#endif +#ifdef ISPC_IS_LINUX + free(ptr); +#endif +#ifdef ISPC_IS_APPLE + free(((void**)ptr)[-1]); +#endif +#else + ispc_free(ptr); +#endif +} + + +Framebuffer::Framebuffer(int width, int height) { + nPixels = width*height; + r = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES); + g = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES); + b = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES); +} + + +Framebuffer::~Framebuffer() { + lAlignedFree(r); + lAlignedFree(g); + lAlignedFree(b); +} + + +void +Framebuffer::clear() { + memset(r, 0, nPixels); + memset(g, 0, nPixels); + memset(b, 0, nPixels); +} + + +InputData * +CreateInputDataFromFile(const char *path) { + FILE *in = fopen(path, "rb"); + if (!in) return 0; + + InputData *input = new InputData; + + // Load header + if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) { + fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path); + return NULL; + } + + // Load data chunk and update pointers + input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize, + ALIGNMENT_BYTES); + if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) { + fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path); + return NULL; + } + + input->arrays.zBuffer = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]]; + input->arrays.normalEncoded_x = + (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]]; + input->arrays.normalEncoded_y = + (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]]; + input->arrays.specularAmount = + (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]]; + input->arrays.specularPower = + (uint16_t 
*)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]]; + input->arrays.albedo_x = + (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]]; + input->arrays.albedo_y = + (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]]; + input->arrays.albedo_z = + (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]]; + input->arrays.lightPositionView_x = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]]; + input->arrays.lightPositionView_y = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]]; + input->arrays.lightPositionView_z = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]]; + input->arrays.lightAttenuationBegin = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]]; + input->arrays.lightColor_x = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]]; + input->arrays.lightColor_y = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]]; + input->arrays.lightColor_z = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]]; + input->arrays.lightAttenuationEnd = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]]; + + fclose(in); + return input; +} + + +void DeleteInputData(InputData *input) { + lAlignedFree(input->chunk); +} + + +void WriteFrame(const char *filename, const InputData *input, + const Framebuffer &framebuffer) { + // Deswizzle and copy to RGBA output + // Doesn't need to be fast... only happens once + size_t imageBytes = 3 * input->header.framebufferWidth * + input->header.framebufferHeight; + uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES); + memset(framebufferAOS, 0, imageBytes); + + for (int i = 0; i < input->header.framebufferWidth * + input->header.framebufferHeight; ++i) { + framebufferAOS[3 * i + 0] = framebuffer.r[i]; + framebufferAOS[3 * i + 1] = framebuffer.g[i]; + framebufferAOS[3 * i + 2] = framebuffer.b[i]; + } + + // Write out simple PPM file + FILE *out = fopen(filename, "wb"); + fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth, + input->header.framebufferHeight); + fwrite(framebufferAOS, imageBytes, 1, out); + fclose(out); + + lAlignedFree(framebufferAOS); +} diff --git a/examples/portable/deferred/data b/examples/portable/deferred/data new file mode 120000 index 00000000..a64aa51d --- /dev/null +++ b/examples/portable/deferred/data @@ -0,0 +1 @@ +../../deferred/data \ No newline at end of file diff --git a/examples/portable/deferred/deferred.h b/examples/portable/deferred/deferred.h new file mode 100644 index 00000000..da4adffd --- /dev/null +++ b/examples/portable/deferred/deferred.h @@ -0,0 +1,108 @@ +/* + Copyright (c) 2011-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef DEFERRED_H +#define DEFERRED_H + +// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)! +#define MIN_TILE_WIDTH 64 +#define MIN_TILE_HEIGHT 16 +#define MAX_LIGHTS 1024 + +enum InputDataArraysEnum { + idaZBuffer = 0, + idaNormalEncoded_x, + idaNormalEncoded_y, + idaSpecularAmount, + idaSpecularPower, + idaAlbedo_x, + idaAlbedo_y, + idaAlbedo_z, + idaLightPositionView_x, + idaLightPositionView_y, + idaLightPositionView_z, + idaLightAttenuationBegin, + idaLightColor_x, + idaLightColor_y, + idaLightColor_z, + idaLightAttenuationEnd, + + idaNum +}; + +#ifndef ISPC + +#include +#include "kernels_ispc.h" + +#define ALIGNMENT_BYTES 64 + +#define MAX_LIGHTS 1024 + +#define VISUALIZE_LIGHT_COUNT 0 + +struct InputData +{ + ispc::InputHeader header; + ispc::InputDataArrays arrays; + uint8_t *chunk; +}; + + +struct Framebuffer { + Framebuffer(int width, int height); + ~Framebuffer(); + + void clear(); + + uint8_t *r, *g, *b; + +private: + int nPixels; + Framebuffer(const Framebuffer &); + Framebuffer &operator=(const Framebuffer *); +}; + + +InputData *CreateInputDataFromFile(const char *path); +void DeleteInputData(InputData *input); +void WriteFrame(const char *filename, const InputData *input, + const Framebuffer &framebuffer); +void InitDynamicC(InputData *input); +void InitDynamicCilk(InputData *input); +void DispatchDynamicC(InputData *input, Framebuffer *framebuffer); +void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer); + +#endif // !ISPC + +#endif // DEFERRED_H diff --git a/examples/portable/deferred/dynamic_c.cpp b/examples/portable/deferred/dynamic_c.cpp new file mode 100644 index 00000000..5a99e6e5 --- /dev/null +++ b/examples/portable/deferred/dynamic_c.cpp @@ -0,0 +1,874 @@ +/* + Copyright (c) 2011-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "deferred.h" +#include "kernels_ispc.h" +#include +#include +#include +#include + +#ifdef _MSC_VER +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_LINUX +#include +#endif // ISPC_IS_LINUX + +// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)! +#ifndef MIN_TILE_WIDTH +#define MIN_TILE_WIDTH 16 +#endif +#ifndef MIN_TILE_HEIGHT +#define MIN_TILE_HEIGHT 16 +#endif + + +#define DYNAMIC_TREE_LEVELS 5 +// If this is set to 1 then the result will be identical to the static version +#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1 + +static void * +lAlignedMalloc(size_t size, int32_t alignment) { +#ifdef ISPC_IS_WINDOWS + return _aligned_malloc(size, alignment); +#endif +#ifdef ISPC_IS_LINUX + return memalign(alignment, size); +#endif +#ifdef ISPC_IS_APPLE + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +#endif +} + + +static void +lAlignedFree(void *ptr) { +#ifdef ISPC_IS_WINDOWS + _aligned_free(ptr); +#endif +#ifdef ISPC_IS_LINUX + free(ptr); +#endif +#ifdef ISPC_IS_APPLE + free(((void**)ptr)[-1]); +#endif +} + + +static void +ComputeZBounds(int tileStartX, int tileEndX, + int tileStartY, int tileEndY, + // G-buffer data + float zBuffer[], + int gBufferWidth, + // Camera data + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar, + // Output + float *minZ, float *maxZ) +{ + // Find Z bounds + float laneMinZ = cameraFar; + float laneMaxZ = cameraNear; + for (int y = tileStartY; y < tileEndY; ++y) { + for (int x = tileStartX; x < tileEndX; ++x) { + // Unproject depth buffer Z value into view space + float z = zBuffer[(y * gBufferWidth + x)]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = std::min(laneMinZ, viewSpaceZ); + laneMaxZ = std::max(laneMaxZ, viewSpaceZ); + } + } + } + *minZ = laneMinZ; + *maxZ = laneMaxZ; +} + + +static void +ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight, + int numTilesX, int numTilesY, + // G-buffer data + float zBuffer[], + int gBufferWidth, + // Camera data + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar, + // Output + float minZArray[], + float maxZArray[]) +{ + for (int tileX = 0; tileX < numTilesX; ++tileX) { + float minZ, maxZ; + ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth, + tileY * tileHeight, 
tileY * tileHeight + tileHeight, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, + cameraNear, cameraFar, &minZ, &maxZ); + minZArray[tileX] = minZ; + maxZArray[tileX] = maxZ; + } +} + + +class MinMaxZTree +{ +public: + // Currently (min) tile dimensions must divide gBuffer dimensions evenly + // Levels must be small enough that neither dimension goes below one tile + MinMaxZTree( + int tileWidth, int tileHeight, int levels, + int gBufferWidth, int gBufferHeight) + : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels) + { + mNumTilesX = gBufferWidth / mTileWidth; + mNumTilesY = gBufferHeight / mTileHeight; + + // Allocate arrays + mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); + mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); + for (int i = 0; i < mLevels; ++i) { + int x = NumTilesX(i); + int y = NumTilesY(i); + assert(x > 0); + assert(y > 0); + // NOTE: If the following two asserts fire it probably means that + // the base tile dimensions do not evenly divide the G-buffer dimensions + assert(x * (mTileWidth << i) >= gBufferWidth); + assert(y * (mTileHeight << i) >= gBufferHeight); + mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16); + mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16); + } + } + + void Update(float *zBuffer, int gBufferPitchInElements, + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar) + { + for (int tileY = 0; tileY < mNumTilesY; ++tileY) { + ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY, + zBuffer, gBufferPitchInElements, + cameraProj_33, cameraProj_43, cameraNear, cameraFar, + mMinZArrays[0] + (tileY * mNumTilesX), + mMaxZArrays[0] + (tileY * mNumTilesX)); + } + + // Generate other levels + for (int level = 1; level < mLevels; ++level) { + int destTilesX = NumTilesX(level); + int destTilesY = NumTilesY(level); + int srcLevel = level - 1; + int srcTilesX = NumTilesX(srcLevel); + int srcTilesY = NumTilesY(srcLevel); + for (int y = 0; y < destTilesY; ++y) { + for (int x = 0; x < destTilesX; ++x) { + int srcX = x << 1; + int srcY = y << 1; + // NOTE: Ugly branches to deal with non-multiple dimensions at some levels + // TODO: SSE branchless min/max is probably better... + float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; + float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; + if (srcX + 1 < srcTilesX) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + + (srcX + 1)]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX + + (srcX + 1)]); + if (srcY + 1 < srcTilesY) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX + 1)]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX + 1)]); + } + } + if (srcY + 1 < srcTilesY) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX )]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX )]); + } + mMinZArrays[level][y * destTilesX + x] = minZ; + mMaxZArrays[level][y * destTilesX + x] = maxZ; + } + } + } + } + + ~MinMaxZTree() { + for (int i = 0; i < mLevels; ++i) { + lAlignedFree(mMinZArrays[i]); + lAlignedFree(mMaxZArrays[i]); + } + lAlignedFree(mMinZArrays); + lAlignedFree(mMaxZArrays); + } + + int Levels() const { return mLevels; } + + // These round UP, so beware that the last tile for a given level may not be completely full + // TODO: Verify this... 
+ int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; } + int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; } + int TileWidth(int level = 0) const { return (mTileWidth << level); } + int TileHeight(int level = 0) const { return (mTileHeight << level); } + + float MinZ(int level, int tileX, int tileY) const { + return mMinZArrays[level][tileY * NumTilesX(level) + tileX]; + } + float MaxZ(int level, int tileX, int tileY) const { + return mMaxZArrays[level][tileY * NumTilesX(level) + tileX]; + } + +private: + int mTileWidth; + int mTileHeight; + int mLevels; + int mNumTilesX; + int mNumTilesY; + + // One array for each "level" in the tree + float **mMinZArrays; + float **mMaxZArrays; +}; + +static MinMaxZTree *gMinMaxZTree = 0; + +void InitDynamicC(InputData *input) { + gMinMaxZTree = + new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS, + input->header.framebufferWidth, + input->header.framebufferHeight); +} + + +/* We're going to split a tile into 4 sub-tiles. This function + reclassifies the tile's lights with respect to the sub-tiles. */ +static void +SplitTileMinMax( + int tileMidX, int tileMidY, + // Subtile data (00, 10, 01, 11) + float subtileMinZ[], + float subtileMaxZ[], + // G-buffer data + int gBufferWidth, int gBufferHeight, + // Camera data + float cameraProj_11, float cameraProj_22, + // Light Data + int lightIndices[], + int numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Outputs + int subtileIndices[], + int subtileIndicesPitch, + int subtileNumLights[] + ) +{ + float gBufferScale_x = 0.5f * (float)gBufferWidth; + float gBufferScale_y = 0.5f * (float)gBufferHeight; + + float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y) }; + float frustumPlanes_z[2] = { tileMidX - gBufferScale_x, + tileMidY - gBufferScale_y }; + + for (int i = 0; i < 2; ++i) { + // Normalize + float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; + } + + // Initialize + int subtileLightOffset[4]; + subtileLightOffset[0] = 0 * subtileIndicesPitch; + subtileLightOffset[1] = 1 * subtileIndicesPitch; + subtileLightOffset[2] = 2 * subtileIndicesPitch; + subtileLightOffset[3] = 3 * subtileIndicesPitch; + + for (int i = 0; i < numLights; ++i) { + int lightIndex = lightIndices[i]; + + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + // Test lights again against subtile z bounds + bool inFrustum[4]; + inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) && + (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && + (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && + (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && + 
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg); + + float dx = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + float dy = light_positionView_z * frustumPlanes_z[1] + + light_positionView_y * frustumPlanes_xy[1]; + + if (fabsf(dx) > light_attenuationEnd) { + bool positiveX = dx > 0.0f; + inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile + inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile + inFrustum[2] = inFrustum[2] && positiveX; // 01 subtile + inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile + } + if (fabsf(dy) > light_attenuationEnd) { + bool positiveY = dy > 0.0f; + inFrustum[0] = inFrustum[0] && positiveY; // 00 subtile + inFrustum[1] = inFrustum[1] && positiveY; // 10 subtile + inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile + inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile + } + + if (inFrustum[0]) + subtileIndices[subtileLightOffset[0]++] = lightIndex; + if (inFrustum[1]) + subtileIndices[subtileLightOffset[1]++] = lightIndex; + if (inFrustum[2]) + subtileIndices[subtileLightOffset[2]++] = lightIndex; + if (inFrustum[3]) + subtileIndices[subtileLightOffset[3]++] = lightIndex; + } + + subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch; + subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch; + subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch; + subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch; +} + + +static inline float +dot3(float x, float y, float z, float a, float b, float c) { + return (x*a + y*b + z*c); +} + + +static inline void +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { + float n = 1.f / sqrtf(x*x + y*y + z*z); + ox = x * n; + oy = y * n; + oz = z * n; +} + + +static inline float +Unorm8ToFloat32(uint8_t u) { + return (float)u * (1.0f / 255.0f); +} + + +static inline uint8_t +Float32ToUnorm8(float f) { + return (uint8_t)(f * 255.0f); +} + + +static inline float +half_to_float_fast(uint16_t h) { + uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit + uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits + uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits + + // sign + uint32_t xs = ((uint32_t) hs) << 16; + // Exponent: unbias the halfp, then bias the single + int32_t xes = ((int32_t) (he >> 10)) - 15 + 127; + // Exponent + uint32_t xe = (uint32_t) (xes << 23); + // Mantissa + uint32_t xm = ((uint32_t) hm) << 13; + + uint32_t bits = (xs | xe | xm); + float *fp = reinterpret_cast(&bits); + return *fp; +} + + +static void +ShadeTileC( + int32_t tileStartX, int32_t tileEndX, + int32_t tileStartY, int32_t tileEndY, + int32_t gBufferWidth, int32_t gBufferHeight, + const ispc::InputDataArrays &inputData, + // Camera data + float cameraProj_11, float cameraProj_22, + float cameraProj_33, float cameraProj_43, + // Light list + int32_t tileLightIndices[], + int32_t tileNumLights, + // UI + bool visualizeLightCount, + // Output + uint8_t framebuffer_r[], + uint8_t framebuffer_g[], + uint8_t framebuffer_b[] + ) +{ + if (tileNumLights == 0 || visualizeLightCount) { + uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255)); + for (int32_t y = tileStartY; y < tileEndY; ++y) { + for (int32_t x = tileStartX; x < tileEndX; ++x) { + int32_t framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } + } + } else { + float twoOverGBufferWidth = 2.0f / 
gBufferWidth; + float twoOverGBufferHeight = 2.0f / gBufferHeight; + + for (int32_t y = tileStartY; y < tileEndY; ++y) { + float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + + for (int32_t x = tileStartX; x < tileEndX; ++x) { + int32_t gBufferOffset = y * gBufferWidth + x; + + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; + + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrtf(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + half_to_float_fast(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + half_to_float_fast(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + int32_t lightIndex = tileLightIndices[tileLightIndex]; + + // Gather light data relevant to initial culling + float light_positionView_x = + inputData.lightPositionView_x[lightIndex]; + float light_positionView_y = + inputData.lightPositionView_y[lightIndex]; + float light_positionView_z = + inputData.lightPositionView_z[lightIndex]; + float light_attenuationEnd = + inputData.lightAttenuationEnd[lightIndex]; + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + if (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrtf(distanceToLight2); + + float distanceToLightRcp = 1.f / distanceToLight; + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + if 
(NdotL > 0.0f) { + float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = std::min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = std::max(NdotH, 0.0f); + + float specular = powf(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + float light_color_x = inputData.lightColor_x[lightIndex]; + float light_color_y = inputData.lightColor_y[lightIndex]; + float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + float gamma = 1.0 / 2.2f; + lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma); + lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma); + lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } + } + } +} + + +void +ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, + int *lightIndices, int numLights, + Framebuffer *framebuffer) { + const MinMaxZTree *minMaxZTree = gMinMaxZTree; + + // If we few enough lights or this is the base case (last level), shade + // this full tile directly + if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) { + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + int startX = tileX * width; + int startY = tileY * height; + int endX = std::min(input->header.framebufferWidth, startX + width); + int endY = std::min(input->header.framebufferHeight, startY + height); + + // Skip entirely offscreen tiles + if (endX > startX && endY > startY) { + ShadeTileC(startX, endX, startY, endY, + input->header.framebufferWidth, input->header.framebufferHeight, + input->arrays, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + input->header.cameraProj[2][2], input->header.cameraProj[3][2], + lightIndices, numLights, VISUALIZE_LIGHT_COUNT, + framebuffer->r, framebuffer->g, framebuffer->b); + } + } + else { + // Otherwise, subdivide and 4-way recurse using X and Y splitting planes + // Move down a level in the tree + --level; + tileX <<= 1; + tileY <<= 1; + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + + // Work out splitting coords + int midX = (tileX + 1) * width; + int midY = (tileY + 1) * height; + + // Read subtile min/max data + // NOTE: We must be sure to handle out-of-bounds access here since + // sometimes we'll only have 1 or 2 subtiles for non-pow-2 + // framebuffer sizes. 
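+        // Illustrative example (editorial note, hypothetical numbers): if the
+        // child level is 135 tiles wide, the parent level is (135 + 1) >> 1 = 68
+        // tiles wide, so the right-most parent (tileX = 67) only has left-hand
+        // children: 2*67 + 1 = 135 falls outside the 0..134 child grid, and the
+        // "10"/"11" subtiles below must fall back to the cull-everything defaults.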
+ bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level)); + bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level)); + + // NOTE: Order is 00, 10, 01, 11 + // Set defaults up to cull all lights if the tile doesn't exist (offscreen) + float minZ[4] = {input->header.cameraFar, input->header.cameraFar, + input->header.cameraFar, input->header.cameraFar}; + float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, + input->header.cameraNear, input->header.cameraNear}; + + minZ[0] = minMaxZTree->MinZ(level, tileX, tileY); + maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY); + if (rightTileExists) { + minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY); + maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY); + if (bottomTileExists) { + minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1); + maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1); + } + } + if (bottomTileExists) { + minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1); + maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1); + } + + // Cull lights into subtile lists +#ifdef ISPC_IS_WINDOWS + __declspec(align(ALIGNMENT_BYTES)) +#endif + int subtileLightIndices[4][MAX_LIGHTS] +#ifndef ISPC_IS_WINDOWS + __attribute__ ((aligned(ALIGNMENT_BYTES))) +#endif +; + int subtileNumLights[4]; + SplitTileMinMax(midX, midY, minZ, maxZ, + input->header.framebufferWidth, input->header.framebufferHeight, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + lightIndices, numLights, input->arrays.lightPositionView_x, + input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, + input->arrays.lightAttenuationEnd, + subtileLightIndices[0], MAX_LIGHTS, subtileNumLights); + + // Recurse into subtiles + ShadeDynamicTileRecurse(input, level, tileX , tileY, + subtileLightIndices[0], subtileNumLights[0], + framebuffer); + ShadeDynamicTileRecurse(input, level, tileX + 1, tileY, + subtileLightIndices[1], subtileNumLights[1], + framebuffer); + ShadeDynamicTileRecurse(input, level, tileX , tileY + 1, + subtileLightIndices[2], subtileNumLights[2], + framebuffer); + ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1, + subtileLightIndices[3], subtileNumLights[3], + framebuffer); + } +} + + +static int +IntersectLightsWithTileMinMax( + int tileStartX, int tileEndX, + int tileStartY, int tileEndY, + // Tile data + float minZ, + float maxZ, + // G-buffer data + int gBufferWidth, int gBufferHeight, + // Camera data + float cameraProj_11, float cameraProj_22, + // Light Data + int numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Output + int tileLightIndices[] + ) +{ + float gBufferScale_x = 0.5f * (float)gBufferWidth; + float gBufferScale_y = 0.5f * (float)gBufferHeight; + + float frustumPlanes_xy[4]; + float frustumPlanes_z[4]; + + // This one is totally constant over the whole screen... worth pulling it up at all? 
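+    // Derivation sketch (editorial note): each tile side plane passes through the
+    // eye, so it only needs a 2D normal in (x,z) or (y,z).  Requiring the
+    // projected pixel x of a view-space point to satisfy
+    //     (cameraProj_11 * x / z) * gBufferScale_x + gBufferScale_x <= tileEndX      (z > 0)
+    // rearranges to
+    //     -(cameraProj_11 * gBufferScale_x) * x + (tileEndX - gBufferScale_x) * z >= 0,
+    // which is exactly plane 0 below; the other three planes follow the same way
+    // (the y planes pick up a sign flip from the viewport's inverted y axis).
+    // After normalization, a light is kept only if its signed distance to every
+    // plane is >= -light_attenuationEnd.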
+ float frustumPlanes_xy_v[4] = { -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + + float frustumPlanes_z_v[4] = { tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; + + for (int i = 0; i < 4; ++i) { + float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] + + frustumPlanes_z_v[i] * frustumPlanes_z_v[i]); + frustumPlanes_xy_v[i] *= norm; + frustumPlanes_z_v[i] *= norm; + + frustumPlanes_xy[i] = frustumPlanes_xy_v[i]; + frustumPlanes_z[i] = frustumPlanes_z_v[i]; + } + + int tileNumLights = 0; + + for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) { + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); + + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + if (!inFrustum) + continue; + + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // Pack and store intersecting lights + if (inFrustum) + tileLightIndices[tileNumLights++] = lightIndex; + } + + return tileNumLights; +} + + +void +ShadeDynamicTile(InputData *input, int level, int tileX, int tileY, + Framebuffer *framebuffer) { + const MinMaxZTree *minMaxZTree = gMinMaxZTree; + + // Get Z min/max for this tile + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + float minZ = minMaxZTree->MinZ(level, tileX, tileY); + float maxZ = minMaxZTree->MaxZ(level, tileX, tileY); + + int startX = tileX * width; + int startY = tileY * height; + int endX = std::min(input->header.framebufferWidth, startX + width); + int endY = std::min(input->header.framebufferHeight, startY + height); + + // This is a root tile, so first do a full 6-plane cull +#ifdef ISPC_IS_WINDOWS + __declspec(align(ALIGNMENT_BYTES)) +#endif + int lightIndices[MAX_LIGHTS] +#ifndef ISPC_IS_WINDOWS + __attribute__ ((aligned(ALIGNMENT_BYTES))) +#endif +; + int numLights = IntersectLightsWithTileMinMax( + startX, endX, startY, endY, minZ, maxZ, + input->header.framebufferWidth, input->header.framebufferHeight, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + MAX_LIGHTS, input->arrays.lightPositionView_x, + input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, + input->arrays.lightAttenuationEnd, lightIndices); + + // Now kick off the recursive process for this tile + ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, + numLights, framebuffer); +} + + +void +DispatchDynamicC(InputData *input, Framebuffer *framebuffer) 
+{ + MinMaxZTree *minMaxZTree = gMinMaxZTree; + + // Update min/max Z tree + minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth, + input->header.cameraProj[2][2], input->header.cameraProj[3][2], + input->header.cameraNear, input->header.cameraFar); + + int rootLevel = minMaxZTree->Levels() - 1; + int rootTilesX = minMaxZTree->NumTilesX(rootLevel); + int rootTilesY = minMaxZTree->NumTilesY(rootLevel); + int rootTiles = rootTilesX * rootTilesY; + for (int g = 0; g < rootTiles; ++g) { + uint32_t tileY = g / rootTilesX; + uint32_t tileX = g % rootTilesX; + ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer); + } +} diff --git a/examples/portable/deferred/dynamic_cilk.cpp b/examples/portable/deferred/dynamic_cilk.cpp new file mode 100644 index 00000000..c0562291 --- /dev/null +++ b/examples/portable/deferred/dynamic_cilk.cpp @@ -0,0 +1,398 @@ +/* + Copyright (c) 2011-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef __cilk + +#include "deferred.h" +#include "kernels_ispc.h" +#include +#include + +#ifdef _MSC_VER +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_LINUX +#include +#endif // ISPC_IS_LINUX + +// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)! 
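+// (Editorial note: a compile-time guard along the lines of
+//      static_assert(MIN_TILE_WIDTH % 8 == 0, "tile width must be a multiple of the SIMD width");
+//  placed after the defines below would document this requirement in code; it
+//  assumes C++11 and takes the width 8 from the sse4x2 example mentioned above.)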
+#define MIN_TILE_WIDTH 16 +#define MIN_TILE_HEIGHT 16 + + +#define DYNAMIC_TREE_LEVELS 5 +// If this is set to 1 then the result will be identical to the static version +#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1 + +static void * +lAlignedMalloc(size_t size, int32_t alignment) { +#ifdef ISPC_IS_WINDOWS + return _aligned_malloc(size, alignment); +#endif +#ifdef ISPC_IS_LINUX + return memalign(alignment, size); +#endif +#ifdef ISPC_IS_APPLE + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +#endif +} + + +static void +lAlignedFree(void *ptr) { +#ifdef ISPC_IS_WINDOWS + _aligned_free(ptr); +#endif +#ifdef ISPC_IS_LINUX + free(ptr); +#endif +#ifdef ISPC_IS_APPLE + free(((void**)ptr)[-1]); +#endif +} + + +class MinMaxZTreeCilk +{ +public: + // Currently (min) tile dimensions must divide gBuffer dimensions evenly + // Levels must be small enough that neither dimension goes below one tile + MinMaxZTreeCilk( + int tileWidth, int tileHeight, int levels, + int gBufferWidth, int gBufferHeight) + : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels) + { + mNumTilesX = gBufferWidth / mTileWidth; + mNumTilesY = gBufferHeight / mTileHeight; + + // Allocate arrays + mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); + mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); + for (int i = 0; i < mLevels; ++i) { + int x = NumTilesX(i); + int y = NumTilesY(i); + assert(x > 0); + assert(y > 0); + // NOTE: If the following two asserts fire it probably means that + // the base tile dimensions do not evenly divide the G-buffer dimensions + assert(x * (mTileWidth << i) >= gBufferWidth); + assert(y * (mTileHeight << i) >= gBufferHeight); + mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16); + mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16); + } + } + + void Update(float *zBuffer, int gBufferPitchInElements, + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar) + { + // Compute level 0 in parallel. Outer loops is here since we use Cilk + _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) { + ispc::ComputeZBoundsRow(tileY, + mTileWidth, mTileHeight, mNumTilesX, mNumTilesY, + zBuffer, gBufferPitchInElements, + cameraProj_33, cameraProj_43, cameraNear, cameraFar, + mMinZArrays[0] + (tileY * mNumTilesX), + mMaxZArrays[0] + (tileY * mNumTilesX)); + } + + // Generate other levels + // NOTE: We currently don't use ispc here since it's sort of an + // awkward gather-based reduction Using SSE odd pack/unpack + // instructions might actually work here when we need to optimize + for (int level = 1; level < mLevels; ++level) { + int destTilesX = NumTilesX(level); + int destTilesY = NumTilesY(level); + int srcLevel = level - 1; + int srcTilesX = NumTilesX(srcLevel); + int srcTilesY = NumTilesY(srcLevel); + _Cilk_for (int y = 0; y < destTilesY; ++y) { + for (int x = 0; x < destTilesX; ++x) { + int srcX = x << 1; + int srcY = y << 1; + // NOTE: Ugly branches to deal with non-multiple dimensions at some levels + // TODO: SSE branchless min/max is probably better... 
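+                    // A branch-free alternative (editorial sketch, not in the
+                    // original code): clamp the source indices instead of branching,
+                    //     int x1 = std::min(srcX + 1, srcTilesX - 1);
+                    //     int y1 = std::min(srcY + 1, srcTilesY - 1);
+                    // and reduce min/max over the four (possibly duplicated) samples;
+                    // duplicated edge samples are harmless for a min/max reduction.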
+ float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; + float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; + if (srcX + 1 < srcTilesX) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + + (srcX + 1)]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX + + (srcX + 1)]); + if (srcY + 1 < srcTilesY) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX + 1)]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX + 1)]); + } + } + if (srcY + 1 < srcTilesY) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX )]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX )]); + } + mMinZArrays[level][y * destTilesX + x] = minZ; + mMaxZArrays[level][y * destTilesX + x] = maxZ; + } + } + } + } + + ~MinMaxZTreeCilk() { + for (int i = 0; i < mLevels; ++i) { + lAlignedFree(mMinZArrays[i]); + lAlignedFree(mMaxZArrays[i]); + } + lAlignedFree(mMinZArrays); + lAlignedFree(mMaxZArrays); + } + + int Levels() const { return mLevels; } + + // These round UP, so beware that the last tile for a given level may not be completely full + // TODO: Verify this... + int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; } + int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; } + int TileWidth(int level = 0) const { return (mTileWidth << level); } + int TileHeight(int level = 0) const { return (mTileHeight << level); } + + float MinZ(int level, int tileX, int tileY) const { + return mMinZArrays[level][tileY * NumTilesX(level) + tileX]; + } + float MaxZ(int level, int tileX, int tileY) const { + return mMaxZArrays[level][tileY * NumTilesX(level) + tileX]; + } + +private: + int mTileWidth; + int mTileHeight; + int mLevels; + int mNumTilesX; + int mNumTilesY; + + // One array for each "level" in the tree + float **mMinZArrays; + float **mMaxZArrays; +}; + +static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0; + +void InitDynamicCilk(InputData *input) { + gMinMaxZTreeCilk = + new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS, + input->header.framebufferWidth, + input->header.framebufferHeight); +} + + +static void +ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, + int *lightIndices, int numLights, + Framebuffer *framebuffer) { + const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk; + + // If we few enough lights or this is the base case (last level), shade + // this full tile directly + if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) { + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + int startX = tileX * width; + int startY = tileY * height; + int endX = std::min(input->header.framebufferWidth, startX + width); + int endY = std::min(input->header.framebufferHeight, startY + height); + + // Skip entirely offscreen tiles + if (endX > startX && endY > startY) { + ispc::ShadeTile( + startX, endX, startY, endY, + input->header.framebufferWidth, input->header.framebufferHeight, + &input->arrays, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + input->header.cameraProj[2][2], input->header.cameraProj[3][2], + lightIndices, numLights, VISUALIZE_LIGHT_COUNT, + framebuffer->r, framebuffer->g, framebuffer->b); + } + } + else { + // Otherwise, subdivide and 4-way recurse using X and Y splitting planes + // Move down a level in the tree + --level; + tileX <<= 1; + tileY <<= 1; + int width = 
minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + + // Work out splitting coords + int midX = (tileX + 1) * width; + int midY = (tileY + 1) * height; + + // Read subtile min/max data + // NOTE: We must be sure to handle out-of-bounds access here since + // sometimes we'll only have 1 or 2 subtiles for non-pow-2 + // framebuffer sizes. + bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level)); + bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level)); + + // NOTE: Order is 00, 10, 01, 11 + // Set defaults up to cull all lights if the tile doesn't exist (offscreen) + float minZ[4] = {input->header.cameraFar, input->header.cameraFar, + input->header.cameraFar, input->header.cameraFar}; + float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, + input->header.cameraNear, input->header.cameraNear}; + + minZ[0] = minMaxZTree->MinZ(level, tileX, tileY); + maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY); + if (rightTileExists) { + minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY); + maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY); + if (bottomTileExists) { + minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1); + maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1); + } + } + if (bottomTileExists) { + minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1); + maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1); + } + + // Cull lights into subtile lists +#ifdef ISPC_IS_WINDOWS + __declspec(align(ALIGNMENT_BYTES)) +#endif + int subtileLightIndices[4][MAX_LIGHTS] +#ifndef ISPC_IS_WINDOWS + __attribute__ ((aligned(ALIGNMENT_BYTES))) +#endif +; + int subtileNumLights[4]; + ispc::SplitTileMinMax(midX, midY, minZ, maxZ, + input->header.framebufferWidth, input->header.framebufferHeight, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + lightIndices, numLights, input->arrays.lightPositionView_x, + input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, + input->arrays.lightAttenuationEnd, + subtileLightIndices[0], MAX_LIGHTS, subtileNumLights); + + // Recurse into subtiles + _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY, + subtileLightIndices[0], subtileNumLights[0], + framebuffer); + _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY, + subtileLightIndices[1], subtileNumLights[1], + framebuffer); + _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY + 1, + subtileLightIndices[2], subtileNumLights[2], + framebuffer); + ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1, + subtileLightIndices[3], subtileNumLights[3], + framebuffer); + } +} + + +static void +ShadeDynamicTile(InputData *input, int level, int tileX, int tileY, + Framebuffer *framebuffer) { + const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk; + + // Get Z min/max for this tile + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + float minZ = minMaxZTree->MinZ(level, tileX, tileY); + float maxZ = minMaxZTree->MaxZ(level, tileX, tileY); + + int startX = tileX * width; + int startY = tileY * height; + int endX = std::min(input->header.framebufferWidth, startX + width); + int endY = std::min(input->header.framebufferHeight, startY + height); + + // This is a root tile, so first do a full 6-plane cull +#ifdef ISPC_IS_WINDOWS + __declspec(align(ALIGNMENT_BYTES)) +#endif + int lightIndices[MAX_LIGHTS] +#ifndef ISPC_IS_WINDOWS + __attribute__ ((aligned(ALIGNMENT_BYTES))) +#endif +; + int numLights = ispc::IntersectLightsWithTileMinMax( + 
startX, endX, startY, endY, minZ, maxZ, + input->header.framebufferWidth, input->header.framebufferHeight, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + MAX_LIGHTS, input->arrays.lightPositionView_x, + input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, + input->arrays.lightAttenuationEnd, lightIndices); + + // Now kick off the recursive process for this tile + ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, + numLights, framebuffer); +} + + +void +DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer) +{ + MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk; + + // Update min/max Z tree + minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth, + input->header.cameraProj[2][2], input->header.cameraProj[3][2], + input->header.cameraNear, input->header.cameraFar); + + // Launch the "root" tiles. Ideally these should at least fill the + // machine... at the moment we have a static number of "levels" to the + // mip tree but it might make sense to compute it based on the width of + // the machine. + int rootLevel = minMaxZTree->Levels() - 1; + int rootTilesX = minMaxZTree->NumTilesX(rootLevel); + int rootTilesY = minMaxZTree->NumTilesY(rootLevel); + int rootTiles = rootTilesX * rootTilesY; + _Cilk_for (int g = 0; g < rootTiles; ++g) { + uint32_t tileY = g / rootTilesX; + uint32_t tileX = g % rootTilesX; + ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer); + } +} + +#endif // __cilk diff --git a/examples/portable/deferred/kernels.cu b/examples/portable/deferred/kernels.cu new file mode 100644 index 00000000..aa694efe --- /dev/null +++ b/examples/portable/deferred/kernels.cu @@ -0,0 +1,778 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + + +#include "deferred.h" +#include +#include + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount (gridDim.x*4) +#define warpIdx (threadIdx.x >> 5) + +#define int32 int +#define int16 short +#define int8 char + +__device__ static inline float clamp(float v, float low, float high) +{ + return min(max(v, low), high); +} + +struct InputDataArrays +{ + float *zBuffer; + unsigned int16 *normalEncoded_x; // half float + unsigned int16 *normalEncoded_y; // half float + unsigned int16 *specularAmount; // half float + unsigned int16 *specularPower; // half float + unsigned int8 *albedo_x; // unorm8 + unsigned int8 *albedo_y; // unorm8 + unsigned int8 *albedo_z; // unorm8 + float *lightPositionView_x; + float *lightPositionView_y; + float *lightPositionView_z; + float *lightAttenuationBegin; + float *lightColor_x; + float *lightColor_y; + float *lightColor_z; + float *lightAttenuationEnd; +}; + +struct InputHeader +{ + float cameraProj[4][4]; + float cameraNear; + float cameraFar; + + int32 framebufferWidth; + int32 framebufferHeight; + int32 numLights; + int32 inputDataChunkSize; + int32 inputDataArrayOffsets[idaNum]; +}; + + +/////////////////////////////////////////////////////////////////////////// +// Common utility routines + +__device__ +static inline float +dot3(float x, float y, float z, float a, float b, float c) { + return (x*a + y*b + z*c); +} + + +#if 0 +static __shared__ int shdata_full[128]; +template +struct Uniform +{ + T data[(N+programCount-1)/programCount]; + volatile T *shdata; + + __device__ inline Uniform() + { + shdata = ((T*)shdata_full) + warpIdx*32; + } + + __device__ inline int2 get_chunk(const int i) const + { + const int elem = i & (programCount - 1); + const int chunk = i >> 5; + shdata[programIndex] = chunk; + shdata[ elem] = chunk; + return make_int2(shdata[programIndex], elem); + } + + __device__ inline const T get(const int i) const + { + const int2 idx = get_chunk(i); + return __shfl(data[idx.x], idx.y); + } + + __device__ inline void set(const bool active, const int i, T value) + { + const int2 idx = get_chunk(i); + const int chunkIdx = idx.x; + const int elemIdx = idx.y; + shdata[programIndex] = data[chunkIdx]; + if (active) shdata[elemIdx] = value; + data[chunkIdx] = shdata[programIndex]; + } +}; +#elif 1 +template +struct Uniform +{ + union + { + T *data; + int32_t ptr[2]; + }; + + __device__ inline Uniform() + { + if (programIndex == 0) + data = (T*)malloc(N*sizeof(T)); + ptr[0] = __shfl(ptr[0], 0); + ptr[1] = __shfl(ptr[1], 0); + } + __device__ inline ~Uniform() + { + if (programIndex == 0) + free(data); + } + + __device__ inline const T get(const int i) const + { + return data[i]; + } + + __device__ inline T* get_ptr(const int i) {return &data[i]; } + __device__ inline void set(const bool active, const int i, T value) + { + if (active) + data[i] = value; + } +}; + +#else +__shared__ int shdata_full[4*MAX_LIGHTS]; +template +struct Uniform +{ + /* volatile */ T *shdata; + + __device__ Uniform() + { + shdata = (T*)&shdata_full[warpIdx*MAX_LIGHTS]; + } + + __device__ inline const T get(const int i) const + { + return shdata[i]; + } + + __device__ inline void set(const bool active, const int i, T value) + { + if (active) + shdata[i] = value; + } +}; +#endif + + +__device__ +static inline void +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { + float n = rsqrt(x*x + y*y + z*z); + ox = x * n; + oy = y * n; + oz = z * n; +} + +__device__ inline 
+static float reduce_min(float value) +{ +#pragma unroll + for (int i = 4; i >=0; i--) + value = fminf(value, __shfl_xor(value, 1<=0; i--) + value = fmaxf(value, __shfl_xor(value, 1<=0; i--) + value += __shfl_xor(value, 1<= tileEndX) break; + // Unproject depth buffer Z value into view space + float z = zBuffer[y * gBufferWidth + x]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = min(laneMinZ, viewSpaceZ); + laneMaxZ = max(laneMaxZ, viewSpaceZ); + } + } + } + minZ = reduce_min(laneMinZ); + maxZ = reduce_max(laneMaxZ); +} + + +__device__ +static inline int32 +IntersectLightsWithTileMinMax( + int32 tileStartX, int32 tileEndX, + int32 tileStartY, int32 tileEndY, + // Tile data + float minZ, + float maxZ, + // G-buffer data + int32 gBufferWidth, int32 gBufferHeight, + // Camera data + float cameraProj_11, float cameraProj_22, + // Light Data + int32 numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Output + Uniform &tileLightIndices + ) +{ + float gBufferScale_x = 0.5f * (float)gBufferWidth; + float gBufferScale_y = 0.5f * (float)gBufferHeight; + + float frustumPlanes_xy[4] = { + -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + float frustumPlanes_z[4] = { + tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; + + for ( int i = 0; i < 4; ++i) { + float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; + } + + int32 tileNumLights = 0; + + for ( int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount) + { + const int lightIndex = lightIndexB + programIndex; + if (lightIndex >= numLights) break; + + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); + + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // This seems better than cif(!inFrustum) ccontinue; here since we + // don't actually need to mask the rest of this function - this is + // just a greedy early-out. 
Could also structure all of this as + // nested if() statements, but this a bit easier to read + if (__ballot(inFrustum) > 0) + { + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // Pack and store intersecting lights + const bool active = inFrustum && lightIndex < numLights; +#if 0 + if (__ballot(active) > 0) + tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex); +#else + if (__ballot(active) > 0) + { + const int2 res = warpBinExclusiveScan(active); + const int idx = tileNumLights + res.y; + const int nactive = res.x; + tileLightIndices.set(active, idx, lightIndex); + tileNumLights += nactive; + } +#endif + } + } + + return tileNumLights; +} + + +__device__ +static inline int32 +IntersectLightsWithTile( + int32 tileStartX, int32 tileEndX, + int32 tileStartY, int32 tileEndY, + int32 gBufferWidth, int32 gBufferHeight, + // G-buffer data + float zBuffer[], + // Camera data + float cameraProj_11, float cameraProj_22, + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar, + // Light Data + int32 numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Output + Uniform &tileLightIndices + ) +{ + float minZ, maxZ; + ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar, + minZ, maxZ); + + + int32 tileNumLights = IntersectLightsWithTileMinMax( + tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ, + gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22, + MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, + light_positionView_z_array, light_attenuationEnd_array, + tileLightIndices); + + return tileNumLights; +} + + +__device__ +static inline void +ShadeTile( + int32 tileStartX, int32 tileEndX, + int32 tileStartY, int32 tileEndY, + int32 gBufferWidth, int32 gBufferHeight, + const InputDataArrays &inputData, + // Camera data + float cameraProj_11, float cameraProj_22, + float cameraProj_33, float cameraProj_43, + // Light list + Uniform &tileLightIndices, + int32 tileNumLights, + // UI + bool visualizeLightCount, + // Output + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[] + ) +{ + if (tileNumLights == 0 || visualizeLightCount) { + unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); + for ( int32 y = tileStartY; y < tileEndY; ++y) { + for ( int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; + if (x >= tileEndX) continue; + int32 framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } + } + } 
else { + float twoOverGBufferWidth = 2.0f / gBufferWidth; + float twoOverGBufferHeight = 2.0f / gBufferHeight; + + for ( int32 y = tileStartY; y < tileEndY; ++y) { + float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + + for ( int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; +// if (x >= tileEndX) break; + int32 gBufferOffset = y * gBufferWidth + x; + + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; + + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + asm("// half2float //"); + float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]); + asm("// half2float //"); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrt(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + __half2float(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + __half2float(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + int32 lightIndex = tileLightIndices.get(tileLightIndex); + + // Gather light data relevant to initial culling + float light_positionView_x = + __ldg(&inputData.lightPositionView_x[lightIndex]); + float light_positionView_y = + __ldg(&inputData.lightPositionView_y[lightIndex]); + float light_positionView_z = + __ldg(&inputData.lightPositionView_z[lightIndex]); + float light_attenuationEnd = + __ldg(&inputData.lightAttenuationEnd[lightIndex]); + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + if (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrt(distanceToLight2); + + // HLSL "rcp" is allowed to be fairly inaccurate + float distanceToLightRcp = 1.0f/distanceToLight; + 
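+                        // (Editorial note: a cheaper approximate reciprocal is an
+                        // option here, e.g. __fdividef(1.0f, distanceToLight) or
+                        // building with --use_fast_math; plain 1.0f/x is the
+                        // conservative, full-precision choice.)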
L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + if (NdotL > 0.0f) { + float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = max(NdotH, 0.0f); + + float specular = pow(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + float light_color_x = inputData.lightColor_x[lightIndex]; + float light_color_y = inputData.lightColor_y[lightIndex]; + float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + // These pows are pretty slow right now, but we can do + // something faster if really necessary to squeeze every + // last bit of performance out of it + float gamma = 1.0 / 2.2f; + lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); + lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); + lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } + } + } +} + + +/////////////////////////////////////////////////////////////////////////// +// Static decomposition + +__global__ void +RenderTile( int num_groups_x, int num_groups_y, + const InputHeader *inputHeaderPtr, + const InputDataArrays *inputDataPtr, + int visualizeLightCount, + // Output + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[]) { + if (taskIndex >= taskCount) return; + + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; + int32 group_y = taskIndex / num_groups_x; + int32 group_x = taskIndex % num_groups_x; + + int32 tile_start_x = group_x * MIN_TILE_WIDTH; + int32 tile_start_y = group_y * MIN_TILE_HEIGHT; + int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH; + int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT; + + int framebufferWidth = inputHeader.framebufferWidth; + int framebufferHeight = inputHeader.framebufferHeight; + float cameraProj_00 = inputHeader.cameraProj[0][0]; + float cameraProj_11 = inputHeader.cameraProj[1][1]; + float cameraProj_22 = inputHeader.cameraProj[2][2]; + float cameraProj_32 = inputHeader.cameraProj[3][2]; + + // Light intersection: figure out which lights illuminate this tile. 
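+    // (Editorial note: in this CUDA port each 32-thread warp plays the role of one
+    // ispc task -- taskIndex picks the tile, with four warps per 128-thread block --
+    // and the Uniform helper above gives the warp a single shared light list: lane 0
+    // allocates it from the device heap and the pointer is broadcast to the other
+    // lanes with __shfl().)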
+ Uniform tileLightIndices; // Light list for the tile +#if 1 + int numTileLights = + IntersectLightsWithTile(tile_start_x, tile_end_x, + tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, + inputData.zBuffer, + cameraProj_00, cameraProj_11, + cameraProj_22, cameraProj_32, + inputHeader.cameraNear, inputHeader.cameraFar, + MAX_LIGHTS, + inputData.lightPositionView_x, + inputData.lightPositionView_y, + inputData.lightPositionView_z, + inputData.lightAttenuationEnd, + tileLightIndices); + + // And now shade the tile, using the lights in tileLightIndices + ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, inputData, + cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, + tileLightIndices, numTileLights, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +#endif +} + + +extern "C" __global__ void +RenderStatic___export( InputHeader inputHeaderPtr[], + InputDataArrays inputDataPtr[], + int visualizeLightCount, + // Output + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[]) { + + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; + + + int num_groups_x = (inputHeader.framebufferWidth + + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; + int num_groups_y = (inputHeader.framebufferHeight + + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; + int num_groups = num_groups_x * num_groups_y; + + // Launch a task to render each tile, each of which is MIN_TILE_WIDTH + // by MIN_TILE_HEIGHT pixels. + if (programIndex == 0) + RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y, + inputHeaderPtr, inputDataPtr, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); + cudaDeviceSynchronize(); +} +extern "C" __host__ void +RenderStatic( InputHeader inputHeaderPtr[], + InputDataArrays inputDataPtr[], + int visualizeLightCount, + // Output + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[]) { + RenderStatic___export<<<1,32>>>( inputHeaderPtr, + inputDataPtr, + visualizeLightCount, + // Output + framebuffer_r, + framebuffer_g, + framebuffer_b); + cudaDeviceSynchronize(); +} diff --git a/examples/portable/deferred/kernels.ispc b/examples/portable/deferred/kernels.ispc new file mode 100644 index 00000000..b45878da --- /dev/null +++ b/examples/portable/deferred/kernels.ispc @@ -0,0 +1,717 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "deferred.h" + +#ifdef __NVPTX__ +#define uniform_t varying +#else +#define uniform_t uniform +#endif + +struct InputDataArrays +{ + float *zBuffer; + unsigned int16 *normalEncoded_x; // half float + unsigned int16 *normalEncoded_y; // half float + unsigned int16 *specularAmount; // half float + unsigned int16 *specularPower; // half float + unsigned int8 *albedo_x; // unorm8 + unsigned int8 *albedo_y; // unorm8 + unsigned int8 *albedo_z; // unorm8 + float *lightPositionView_x; + float *lightPositionView_y; + float *lightPositionView_z; + float *lightAttenuationBegin; + float *lightColor_x; + float *lightColor_y; + float *lightColor_z; + float *lightAttenuationEnd; +}; + +struct InputHeader +{ + float cameraProj[4][4]; + float cameraNear; + float cameraFar; + + int32 framebufferWidth; + int32 framebufferHeight; + int32 numLights; + int32 inputDataChunkSize; + int32 inputDataArrayOffsets[idaNum]; +}; + + +/////////////////////////////////////////////////////////////////////////// +// Common utility routines + +static inline float +dot3(float x, float y, float z, float a, float b, float c) { + return (x*a + y*b + z*c); +} + + +static inline void +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { + float n = rsqrt(x*x + y*y + z*z); + ox = x * n; + oy = y * n; + oz = z * n; +} + + +static inline float +Unorm8ToFloat32(unsigned int8 u) { + return (float)u * (1.0f / 255.0f); +} + + +static inline unsigned int8 +Float32ToUnorm8(float f) { + return (unsigned int8)(f * 255.0f); +} + + +#if 1 +inline +#endif +static void +ComputeZBounds( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + // G-buffer data + uniform float zBuffer[], + uniform int32 gBufferWidth, + // Camera data + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Output + uniform float &minZ, + uniform float &maxZ + ) +{ + // Find Z bounds + float laneMinZ = cameraFar; + float laneMaxZ = cameraNear; + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { + foreach (x = tileStartX ... 
tileEndX) { + // Unproject depth buffer Z value into view space + float z = zBuffer[y * gBufferWidth + x]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = min(laneMinZ, viewSpaceZ); + laneMaxZ = max(laneMaxZ, viewSpaceZ); + } + } + } + minZ = reduce_min(laneMinZ); + maxZ = reduce_max(laneMaxZ); +} + +#if 1 +inline +#endif +#ifndef __NVPTX__ +export +#endif +uniform int32 +IntersectLightsWithTileMinMax( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + // Tile data + uniform float minZ, + uniform float maxZ, + // G-buffer data + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + // Light Data + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Output + uniform int32 tileLightIndices[] + ) +{ + uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; + uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; + + uniform_t float frustumPlanes_xy[4] = { + -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + uniform_t float frustumPlanes_z[4] = { + tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; + + for (uniform int i = 0; i < 4; ++i) { + uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; + } + + uniform int32 tileNumLights = 0; + + foreach (lightIndex = 0 ... numLights) { + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); + + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // This seems better than cif(!inFrustum) ccontinue; here since we + // don't actually need to mask the rest of this function - this is + // just a greedy early-out. 
Could also structure all of this as + // nested if() statements, but this a bit easier to read + if (any(inFrustum)) { + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + +#if 0 + // Pack and store intersecting lights + cif (inFrustum) { + tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], + lightIndex); + } +#else + const bool active = inFrustum && lightIndex < numLights; + if(any(active)) + tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex); +#endif + } + } + + return tileNumLights; +} + + +#if 1 +inline +#endif +static uniform int32 +IntersectLightsWithTile( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // G-buffer data + uniform float zBuffer[], + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Light Data + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Output + uniform int32 tileLightIndices[] + ) +{ + uniform float minZ, maxZ; + ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar, + minZ, maxZ); + + uniform int32 tileNumLights = IntersectLightsWithTileMinMax( + tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ, + gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22, + MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, + light_positionView_z_array, light_attenuationEnd_array, + tileLightIndices); + + return tileNumLights; +} + + +#if 1 +inline +#endif +#ifndef __NVPTX__ +export +#endif +void +ShadeTile( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + uniform InputDataArrays &inputData, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + uniform float cameraProj_33, uniform float cameraProj_43, + // Light list + uniform int32 tileLightIndices[], + uniform int32 tileNumLights, + // UI + uniform bool visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[] + ) +{ + if (tileNumLights == 0 || visualizeLightCount) { + uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { + foreach (x = tileStartX ... 
tileEndX) { + int32 framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } + } + } else { + uniform float twoOverGBufferWidth = 2.0f / gBufferWidth; + uniform float twoOverGBufferHeight = 2.0f / gBufferHeight; + + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { + uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + + foreach (x = tileStartX ... tileEndX) { + int32 gBufferOffset = y * gBufferWidth + x; + + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; + + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrt(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + half_to_float(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + half_to_float(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + uniform int32 lightIndex = tileLightIndices[tileLightIndex]; + + // Gather light data relevant to initial culling + uniform float light_positionView_x = + inputData.lightPositionView_x[lightIndex]; + uniform float light_positionView_y = + inputData.lightPositionView_y[lightIndex]; + uniform float light_positionView_z = + inputData.lightPositionView_z[lightIndex]; + uniform float light_attenuationEnd = + inputData.lightAttenuationEnd[lightIndex]; + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + cif (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrt(distanceToLight2); + + // HLSL 
"rcp" is allowed to be fairly inaccurate + float distanceToLightRcp = rcp(distanceToLight); + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + cif (NdotL > 0.0f) { + uniform float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = max(NdotH, 0.0f); + + float specular = pow(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + uniform float light_color_x = inputData.lightColor_x[lightIndex]; + uniform float light_color_y = inputData.lightColor_y[lightIndex]; + uniform float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + // These pows are pretty slow right now, but we can do + // something faster if really necessary to squeeze every + // last bit of performance out of it + float gamma = 1.0 / 2.2f; + lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); + lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); + lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } + } + } +} + + +/////////////////////////////////////////////////////////////////////////// +// Static decomposition + +task void +RenderTile(uniform int num_groups_x, uniform int num_groups_y, + uniform InputHeader inputHeaderPtr[], + uniform InputDataArrays inputDataPtr[], + uniform int visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { + + uniform InputHeader inputHeader = *inputHeaderPtr; + uniform InputDataArrays inputData = *inputDataPtr; + + uniform int32 group_y = taskIndex / num_groups_x; + uniform int32 group_x = taskIndex % num_groups_x; + uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH; + uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT; + uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH; + uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT; + + uniform int framebufferWidth = inputHeader.framebufferWidth; + uniform int framebufferHeight = inputHeader.framebufferHeight; + uniform float cameraProj_00 = inputHeader.cameraProj[0][0]; + uniform float cameraProj_11 = inputHeader.cameraProj[1][1]; + uniform float cameraProj_22 = inputHeader.cameraProj[2][2]; + uniform float cameraProj_32 = inputHeader.cameraProj[3][2]; + + // Light intersection: figure out which lights 
illuminate this tile. +#if 1 + uniform int * uniform tileLightIndices = uniform new uniform int [MAX_LIGHTS]; +#define MALLOC +#else /* shared memory doesn't full work... why? */ + uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile +#endif + uniform int numTileLights = + IntersectLightsWithTile(tile_start_x, tile_end_x, + tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, + inputData.zBuffer, + cameraProj_00, cameraProj_11, + cameraProj_22, cameraProj_32, + inputHeader.cameraNear, inputHeader.cameraFar, + MAX_LIGHTS, + inputData.lightPositionView_x, + inputData.lightPositionView_y, + inputData.lightPositionView_z, + inputData.lightAttenuationEnd, + tileLightIndices); + + // And now shade the tile, using the lights in tileLightIndices + ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, inputData, + cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, + tileLightIndices, numTileLights, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +#ifdef MALLOC + delete tileLightIndices; +#endif +} + + +export void +RenderStatic(uniform InputHeader inputHeaderPtr[], + uniform InputDataArrays inputDataPtr[], + uniform int visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { + + uniform InputHeader inputHeader = *inputHeaderPtr; + uniform InputDataArrays inputData = *inputDataPtr; + + uniform int num_groups_x = (inputHeader.framebufferWidth + + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; + uniform int num_groups_y = (inputHeader.framebufferHeight + + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; + uniform int num_groups = num_groups_x * num_groups_y; + + // Launch a task to render each tile, each of which is MIN_TILE_WIDTH + // by MIN_TILE_HEIGHT pixels. + launch[num_groups] RenderTile(num_groups_x, num_groups_y, + inputHeaderPtr, inputDataPtr, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +} + + +/////////////////////////////////////////////////////////////////////////// +// Routines for dynamic decomposition path + +// This computes the z min/max range for a whole row worth of tiles. +export void +ComputeZBoundsRow( + uniform int32 tileY, + uniform int32 tileWidth, uniform int32 tileHeight, + uniform int32 numTilesX, uniform int32 numTilesY, + // G-buffer data + uniform float zBuffer[], + uniform int32 gBufferWidth, + // Camera data + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Output + uniform float minZArray[], + uniform float maxZArray[] + ) +{ + for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) { + uniform float minZ, maxZ; + ComputeZBounds( + tileX * tileWidth, tileX * tileWidth + tileWidth, + tileY * tileHeight, tileY * tileHeight + tileHeight, + zBuffer, gBufferWidth, + cameraProj_33, cameraProj_43, cameraNear, cameraFar, + minZ, maxZ); + minZArray[tileX] = minZ; + maxZArray[tileX] = maxZ; + } +} + + +// Reclassifies the lights with respect to four sub-tiles when we refine a tile. +// numLights need not be a multiple of programCount here, but the input and output arrays +// should be able to handle programCount-sized load/stores. 
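+// The four sub-tiles are ordered (00, 10, 01, 11); each one is given its own
+// slice of the subtileIndices array, subtileIndicesPitch entries apart, and
+// subtileNumLights[k] reports how many lights were packed into slice k.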
+export void +SplitTileMinMax( + uniform int32 tileMidX, uniform int32 tileMidY, + // Subtile data (00, 10, 01, 11) + uniform float subtileMinZ[], + uniform float subtileMaxZ[], + // G-buffer data + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + // Light Data + uniform int32 lightIndices[], + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Outputs + uniform int32 subtileIndices[], + uniform int32 subtileIndicesPitch, + uniform int32 subtileNumLights[] + ) +{ + uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; + uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; + + uniform_t float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y) }; + uniform_t float frustumPlanes_z[2] = { tileMidX - gBufferScale_x, + tileMidY - gBufferScale_y }; + + // Normalize + uniform_t float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] + + frustumPlanes_z[0] * frustumPlanes_z[0]), + rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] + + frustumPlanes_z[1] * frustumPlanes_z[1]) }; + frustumPlanes_xy[0] *= norm[0]; + frustumPlanes_xy[1] *= norm[1]; + frustumPlanes_z[0] *= norm[0]; + frustumPlanes_z[1] *= norm[1]; + + // Initialize + uniform int32 subtileLightOffset[4]; + subtileLightOffset[0] = 0 * subtileIndicesPitch; + subtileLightOffset[1] = 1 * subtileIndicesPitch; + subtileLightOffset[2] = 2 * subtileIndicesPitch; + subtileLightOffset[3] = 3 * subtileIndicesPitch; + + foreach (i = 0 ... numLights) { + int32 lightIndex = lightIndices[i]; + + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + // Test lights again subtile z bounds + bool inFrustum[4]; + inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) && + (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && + (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && + (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && + (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg); + + float dx = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + float dy = light_positionView_z * frustumPlanes_z[1] + + light_positionView_y * frustumPlanes_xy[1]; + + cif (abs(dx) > light_attenuationEnd) { + bool positiveX = dx > 0.0f; + inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile + inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile + inFrustum[2] = inFrustum[2] && positiveX; // 01 subtile + inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile + } + cif (abs(dy) > light_attenuationEnd) { + bool positiveY = dy > 0.0f; + inFrustum[0] = inFrustum[0] && positiveY; // 00 subtile + inFrustum[1] = inFrustum[1] && positiveY; // 10 subtile + inFrustum[2] = inFrustum[2] && !positiveY; // 
01 subtile + inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile + } + + // Pack and store intersecting lights + // TODO: Experiment with a loop here instead + cif (inFrustum[0]) + subtileLightOffset[0] += + packed_store_active(&subtileIndices[subtileLightOffset[0]], + lightIndex); + cif (inFrustum[1]) + subtileLightOffset[1] += + packed_store_active(&subtileIndices[subtileLightOffset[1]], + lightIndex); + cif (inFrustum[2]) + subtileLightOffset[2] += + packed_store_active(&subtileIndices[subtileLightOffset[2]], + lightIndex); + cif (inFrustum[3]) + subtileLightOffset[3] += + packed_store_active(&subtileIndices[subtileLightOffset[3]], + lightIndex); + } + + subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch; + subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch; + subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch; + subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch; +} diff --git a/examples/portable/deferred/main.cpp b/examples/portable/deferred/main.cpp new file mode 100644 index 00000000..6ca24796 --- /dev/null +++ b/examples/portable/deferred/main.cpp @@ -0,0 +1,107 @@ +/* + Copyright (c) 2011-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define ISPC_IS_WINDOWS +#define NOMINMAX +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ISPC_IS_WINDOWS + #define WIN32_LEAN_AND_MEAN + #include +#endif +#include "deferred.h" +#include "kernels_ispc.h" +#include "timing.h" +#include "ispc_malloc.h" + +/////////////////////////////////////////////////////////////////////////// + +int main(int argc, char** argv) { + if (argc < 2) { + printf("usage: deferred_shading [tasks iterations] [serial iterations]\n"); + return 1; + } + static unsigned int test_iterations[] = {5, 3, 500}; //last value is for nframes, it is scale. 
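+    // test_iterations[0] = number of timed repetitions of the ispc + tasks run,
+    // test_iterations[1] = serial-run repetitions (not read in this version),
+    // test_iterations[2] = nframes, i.e. frames rendered per timed repetition.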
+ if (argc == 5) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[2 + i]); + } + } + + InputData *input = CreateInputDataFromFile(argv[1]); + if (!input) { + printf("Failed to load input file \"%s\"!\n", argv[1]); + return 1; + } + + Framebuffer framebuffer(input->header.framebufferWidth, + input->header.framebufferHeight); + + int nframes = test_iterations[2]; + double ispcCycles = 1e30; + for (int i = 0; i < test_iterations[0]; ++i) { + framebuffer.clear(); + reset_and_start_timer(); + for (int j = 0; j < nframes; ++j) + ispc::RenderStatic(&input->header, &input->arrays, + VISUALIZE_LIGHT_COUNT, + framebuffer.r, framebuffer.g, framebuffer.b); + double msec = get_elapsed_msec() / nframes; + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec [%.3f fps]\n", msec, 1.0e3/msec); + ispcCycles = std::min(ispcCycles, msec); + } + printf("[ispc static + tasks]:\t\t[%.3f] msec to render " + "%d x %d image\n", ispcCycles, + input->header.framebufferWidth, input->header.framebufferHeight); + WriteFrame("deferred-ispc-static.ppm", input, framebuffer); + + DeleteInputData(input); + + return 0; +} diff --git a/examples/portable/mergeSort/Makefile_cpu b/examples/portable/mergeSort/Makefile_cpu new file mode 100644 index 00000000..05e9906f --- /dev/null +++ b/examples/portable/mergeSort/Makefile_cpu @@ -0,0 +1,12 @@ + +EXAMPLE=mergeSort +CPP_SRC=mergeSort.cpp +ISPC_SRC=mergeSort.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon +#ISPC_FLAGS=-DDEBUG -g +CXXFLAGS=-g +CCFLAGS=-g +#NVCC_FLAGS=-Xptxas=-O0 + +include ../common_cpu.mk diff --git a/examples/portable/mergeSort/Makefile_knc b/examples/portable/mergeSort/Makefile_knc new file mode 100644 index 00000000..3d7a2765 --- /dev/null +++ b/examples/portable/mergeSort/Makefile_knc @@ -0,0 +1,7 @@ +EXAMPLE=mergeSort +CXX_SRC=mergeSort.cpp +ISPC_SRC=mergeSort.ispc +ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h +ISPC_TARGET=generic-16 + +include ../common_knc.mk diff --git a/examples/portable/mergeSort/Makefile_ptx b/examples/portable/mergeSort/Makefile_ptx new file mode 100644 index 00000000..f64581e4 --- /dev/null +++ b/examples/portable/mergeSort/Makefile_ptx @@ -0,0 +1,15 @@ +PROG=mergeSort +ISPC_SRC=mergeSort.ispc +CU_SRC=mergeSort.cu +CXX_SRC=mergeSort.cpp mergeSort.cpp +PTXCC_REGMAX=64 +#PTXCC_FLAGS= -Xptxas=-O3 +#NVCC_FLAGS=-Xptxas=-O0 + +LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_ptx.mk + + + diff --git a/examples/portable/mergeSort/keyType.h b/examples/portable/mergeSort/keyType.h new file mode 100644 index 00000000..82de34af --- /dev/null +++ b/examples/portable/mergeSort/keyType.h @@ -0,0 +1,3 @@ +#pragma once +typedef float Key_t; +typedef int Val_t; diff --git a/examples/portable/mergeSort/mergeSort.cpp b/examples/portable/mergeSort/mergeSort.cpp new file mode 100644 index 00000000..d3188544 --- /dev/null +++ b/examples/portable/mergeSort/mergeSort.cpp @@ -0,0 +1,171 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#include <iostream> +#include <string> +#include <algorithm> +#include <cstdio> +#include <cstdlib> +#include <cassert> +#include "timing.h" +#include "ispc_malloc.h" +#include "mergeSort_ispc.h" + +static void progressBar(const int x, const int n, const int width = 50) +{ + assert(n > 1); + assert(x >= 0 && x < n); + assert(width > 10); + const float f = static_cast<float>(x)/(n-1); + const int w = static_cast<int>(f * width); + + // print bar + std::string bstr("["); + for (int i = 0; i < width; i++) + bstr += i < w ? '=' : ' '; + bstr += "]"; + + // print percentage + char pstr0[32]; + sprintf(pstr0, " %2d %c ", static_cast<int>(f*100.0),'%'); + const std::string pstr(pstr0); + std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2)); + + std::cout << bstr; + std::cout << (x == n-1 ? "\n" : "\r") << std::flush; +} + +#include "keyType.h" +struct Key +{ + Key_t key; + Val_t val; +}; + + +int main (int argc, char *argv[]) +{ + int i, j, n = argc == 1 ? 1024*1024: atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ?
n : RAND_MAX; + double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0; + + Key *keys = new Key[n]; + srand48(rtc()*65536); +#pragma omp parallel for + for (int i = 0; i < n; i++) + { + keys[i].key = i; //((int)(drand48() * (1<<30))); + keys[i].val = i; + } + std::random_shuffle(keys, keys + n); + + Key_t *keysSrc = new Key_t[n]; + Val_t *valsSrc = new Val_t[n]; + Key_t *keysBuf = new Key_t[n]; + Val_t *valsBuf = new Val_t[n]; + Key_t *keysDst = new Key_t[n]; + Val_t *valsDst = new Val_t[n]; + Key_t *keysGld = new Key_t[n]; + Val_t *valsGld = new Val_t[n]; +#pragma omp parallel for + for (int i = 0; i < n; i++) + { + keysSrc[i] = keys[i].key; + valsSrc[i] = keys[i].val; + + keysGld[i] = keysSrc[i]; + valsGld[i] = valsSrc[i]; + } + delete[] keys; + + ispcSetMallocHeapLimit(1024*1024*1024); + + ispc::openMergeSort(); + + tISPC2 = 1e30; + for (i = 0; i < m; i ++) + { + ispcMemcpy(keysSrc, keysGld, n*sizeof(Key_t)); + ispcMemcpy(valsSrc, valsGld, n*sizeof(Val_t)); + + reset_and_start_timer(); + ispc::mergeSort(keysDst, valsDst, keysBuf, valsBuf, keysSrc, valsSrc, n); + tISPC2 = std::min(tISPC2, get_elapsed_msec()); + + if (argc != 3) + progressBar (i, m); + } + + ispc::closeMergeSort(); + + printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2); + +#if 0 + printf("\n---\n"); + for (int i = 0; i < 128; i++) + { + if ((i%32) == 0) printf("\n"); + printf("%d ", (int)keysSrc[i]); + } + printf("\n---\n"); + for (int i = 0; i < 128; i++) + { + if ((i%32) == 0) printf("\n"); + printf("%d ", (int)keysBuf[i]); + } + printf("\n---\n"); + for (int i = 0; i < 128; i++) + { + if ((i%32) == 0) printf("\n"); + printf("%d ", (int)keysDst[i]); + } + printf("\n---\n"); +#endif + + + + std::sort(keysGld, keysGld + n); + for (int i = 0; i < n; i++) + assert(keysDst[i] == keysGld[i]); + + delete[] keysSrc; + delete[] valsSrc; + delete[] keysDst; + delete[] valsDst; + delete[] keysBuf; + delete[] valsBuf; + delete[] keysGld; + delete[] valsGld; + + return 0; +} diff --git a/examples/portable/mergeSort/mergeSort.cu b/examples/portable/mergeSort/mergeSort.cu new file mode 100644 index 00000000..2f8eb19d --- /dev/null +++ b/examples/portable/mergeSort/mergeSort.cu @@ -0,0 +1,694 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on mergeSort from CUDA SDK + */ + +#include "keyType.h" +#include "cuda_helpers.cuh" +#include + +#define uniform + +#define SAMPLE_STRIDE programCount + +#define iDivUp(a,b) (((a) + (b) - 1)/(b)) +#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE))) + +#define W (/*sizeof(int)=*/4 * 8) + +__device__ static inline +int nextPowerOfTwo(int x) +{ +#if 0 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; +#else + return 1U << (W - __clz(x - 1)); +#endif +} + + +__device__ static inline +int binarySearchInclusiveRanks( + const int val, + uniform int *data, + const int L, + int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] <= val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchExclusiveRanks( + const int val, + uniform int *data, + const int L, + int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] < val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchInclusive( + const Key_t val, + uniform Key_t *data, + const int L, + int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] <= val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchExclusive( + const Key_t val, + uniform Key_t *data, + const int L, + int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] < val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchInclusive1( + const Key_t val, + Key_t data, + const uniform int L, + uniform int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (shuffle(data,newPos - 1) <= val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchExclusive1( + const Key_t val, + Key_t data, + const uniform int L, + uniform int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (shuffle(data,newPos - 1) < val) + pos = newPos; + } + + return pos; +} + +//////////////////////////////////////////////////////////////////////////////// +// Bottom-level merge sort (binary search-based) +//////////////////////////////////////////////////////////////////////////////// +__global__ +void mergeSortGangKernel( + uniform int batchSize, + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[]) +{ + const uniform int blkIdx = taskIndex; + const uniform int blkDim = (batchSize + taskCount - 1)/taskCount; + const uniform int blkBeg = blkIdx 
* blkDim; + const uniform int blkEnd = min(blkBeg + blkDim, batchSize); + + __shared__ Key_t s_key_tmp[2*programCount*4]; + __shared__ Val_t s_val_tmp[2*programCount*4]; + Key_t *s_key = s_key_tmp + warpIdx*(2*programCount); + Val_t *s_val = s_val_tmp + warpIdx*(2*programCount); + + for (uniform int blk = blkBeg; blk < blkEnd; blk++) + { + const uniform int base = blk * (programCount*2); + s_key[programIndex + 0] = srcKey[base + programIndex + 0]; + s_val[programIndex + 0] = srcVal[base + programIndex + 0]; + s_key[programIndex + programCount] = srcKey[base + programIndex + programCount]; + s_val[programIndex + programCount] = srcVal[base + programIndex + programCount]; + + for (uniform int stride = 1; stride < 2*programCount; stride <<= 1) + { + const int lPos = programIndex & (stride - 1); + uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos); + uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos); + + Key_t keyA = baseKey[lPos + 0]; + Val_t valA = baseVal[lPos + 0]; + Key_t keyB = baseKey[lPos + stride]; + Val_t valB = baseVal[lPos + stride]; + int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos; + int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos; + + baseKey[posA] = keyA; + baseVal[posA] = valA; + baseKey[posB] = keyB; + baseVal[posB] = valB; + } + + dstKey[base + programIndex + 0] = s_key[programIndex + 0]; + dstVal[base + programIndex + 0] = s_val[programIndex + 0]; + dstKey[base + programIndex + programCount] = s_key[programIndex + programCount]; + dstVal[base + programIndex + programCount] = s_val[programIndex + programCount]; + } +} + +__device__ static inline +void mergeSortGang( + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int batchSize) +{ + uniform int nTasks = batchSize; + launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal); + sync; +} + +//////////////////////////////////////////////////////////////////////////////// +// Merge step 1: generate sample ranks +//////////////////////////////////////////////////////////////////////////////// +__global__ +void generateSampleRanksKernel( + uniform int nBlocks, + uniform int in_ranksA[], + uniform int in_ranksB[], + uniform Key_t in_srcKey[], + uniform int stride, + uniform int N, + uniform int totalProgramCount) +{ + const uniform int blkIdx = taskIndex; + const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount; + const uniform int blkBeg = blkIdx * blkDim; + const uniform int blkEnd = min(blkBeg + blkDim, nBlocks); + + for (uniform int blk = blkBeg; blk < blkEnd; blk++) + { + const int pos = blk * programCount + programIndex; + cif (pos >= totalProgramCount) + return; + + const int i = pos & ((stride / SAMPLE_STRIDE) - 1); + const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + + uniform Key_t * srcKey = in_srcKey + segmentBase; + uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE; + uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE; + + const int segmentElementsA = stride; + const int segmentElementsB = min(stride, N - segmentBase - stride); + const int segmentSamplesA = getSampleCount(segmentElementsA); + const int segmentSamplesB = getSampleCount(segmentElementsB); + + if (i < segmentSamplesA) + { + ranksA[i] = i * SAMPLE_STRIDE; + ranksB[i] = binarySearchExclusive( + srcKey[i * SAMPLE_STRIDE], srcKey + stride, + segmentElementsB, nextPowerOfTwo(segmentElementsB)); + } + + if (i < segmentSamplesB) + { + ranksB[(stride / 
SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE; + ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive( + srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0, + segmentElementsA, nextPowerOfTwo(segmentElementsA)); + } + } +} + +__device__ static inline +void generateSampleRanks( + uniform int ranksA[], + uniform int ranksB[], + uniform Key_t srcKey[], + uniform int stride, + uniform int N) +{ + uniform int lastSegmentElements = N % (2 * stride); + uniform int threadCount = (lastSegmentElements > stride) ? + (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : + (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + + uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); + uniform int nTasks = nBlocks; + + launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount); + sync; +} +//////////////////////////////////////////////////////////////////////////////// +// Merge step 2: generate sample ranks and indices +//////////////////////////////////////////////////////////////////////////////// +__global__ +void mergeRanksAndIndicesKernel( + uniform int nBlocks, + uniform int in_Limits[], + uniform int in_Ranks[], + uniform int stride, + uniform int N, + uniform int totalProgramCount) +{ + const uniform int blkIdx = taskIndex; + const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount; + const uniform int blkBeg = blkIdx * blkDim; + const uniform int blkEnd = min(blkBeg + blkDim, nBlocks); + + for (uniform int blk = blkBeg; blk < blkEnd; blk++) + { + int pos = blk * programCount + programIndex; + cif (pos >= totalProgramCount) + return; + + const int i = pos & ((stride / SAMPLE_STRIDE) - 1); + const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + uniform int * ranks = in_Ranks + (pos - i) * 2; + uniform int * limits = in_Limits + (pos - i) * 2; + + const int segmentElementsA = stride; + const int segmentElementsB = min(stride, N - segmentBase - stride); + const int segmentSamplesA = getSampleCount(segmentElementsA); + const int segmentSamplesB = getSampleCount(segmentElementsB); + + if (i < segmentSamplesA) + { + int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i; + limits[dstPos] = ranks[i]; + } + + if (i < segmentSamplesB) + { + int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i; + limits[dstPos] = ranks[segmentSamplesA + i]; + } + } +} +__device__ static inline +void mergeRanksAndIndices( + uniform int limitsA[], + uniform int limitsB[], + uniform int ranksA[], + uniform int ranksB[], + uniform int stride, + uniform int N) +{ + const uniform int lastSegmentElements = N % (2 * stride); + const uniform int threadCount = (lastSegmentElements > stride) ? 
+ (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : + (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + + const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); + uniform int nTasks = nBlocks; + + launch (nTasks,1,1,mergeRanksAndIndicesKernel)( + nBlocks, + limitsA, + ranksA, + stride, + N, + threadCount); + launch (nTasks,1,1, mergeRanksAndIndicesKernel)( + nBlocks, + limitsB, + ranksB, + stride, + N, + threadCount); + sync; +} + + +__global__ +void mergeElementaryIntervalsKernel( + uniform int mergePairs, + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int limitsA[], + uniform int limitsB[], + uniform int stride, + uniform int N) +{ + const uniform int blkIdx = taskIndex; + const uniform int blkDim = (mergePairs + taskCount - 1)/taskCount; + const uniform int blkBeg = blkIdx * blkDim; + const uniform int blkEnd = min(blkBeg + blkDim, mergePairs); + + for (uniform int blk = blkBeg; blk < blkEnd; blk++) + { + const int uniform intervalI = blk & ((2 * stride) / SAMPLE_STRIDE - 1); + const int uniform segmentBase = (blk - intervalI) * SAMPLE_STRIDE; + + //Set up threadblk-wide parameters + + const uniform int segmentElementsA = stride; + const uniform int segmentElementsB = min(stride, N - segmentBase - stride); + const uniform int segmentSamplesA = getSampleCount(segmentElementsA); + const uniform int segmentSamplesB = getSampleCount(segmentElementsB); + const uniform int segmentSamples = segmentSamplesA + segmentSamplesB; + + const uniform int startSrcA = limitsA[blk]; + const uniform int startSrcB = limitsB[blk]; + const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[blk + 1] : segmentElementsA; + const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? 
limitsB[blk + 1] : segmentElementsB; + const uniform int lenSrcA = endSrcA - startSrcA; + const uniform int lenSrcB = endSrcB - startSrcB; + const uniform int startDstA = startSrcA + startSrcB; + const uniform int startDstB = startDstA + lenSrcA; + + //Load main input data + + Key_t keyA, keyB; + Val_t valA, valB; + if (programIndex < lenSrcA) + { + keyA = srcKey[segmentBase + startSrcA + programIndex]; + valA = srcVal[segmentBase + startSrcA + programIndex]; + } + + if (programIndex < lenSrcB) + { + keyB = srcKey[segmentBase + stride + startSrcB + programIndex]; + valB = srcVal[segmentBase + stride + startSrcB + programIndex]; + } + + // Compute destination addresses for merge data + int dstPosA, dstPosB, dstA = -1, dstB = -1; + if (any(programIndex < lenSrcA)) + dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex; + if (any(programIndex < lenSrcB)) + dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex; + + if (programIndex < lenSrcA && dstPosA < lenSrcA) + dstA = segmentBase + startDstA + dstPosA; + dstPosA -= lenSrcA; + if (programIndex < lenSrcA && dstPosA < lenSrcB) + dstA = segmentBase + startDstB + dstPosA; + + if (programIndex < lenSrcB && dstPosB < lenSrcA) + dstB = segmentBase + startDstA + dstPosB; + dstPosB -= lenSrcA; + if (programIndex < lenSrcB && dstPosB < lenSrcB) + dstB = segmentBase + startDstB + dstPosB; + + // store merge data + if (dstA >= 0) + { + // int dstA = segmentBase + startSrcA + programIndex; + dstKey[dstA] = keyA; + dstVal[dstA] = valA; + } + if (dstB >= 0) + { +// int dstB = segmentBase + stride + startSrcB + programIndex; + dstKey[dstB] = keyB; + dstVal[dstB] = valB; + } + } + +} + + +__device__ static inline +void mergeElementaryIntervals( + uniform int nTasks, + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int limitsA[], + uniform int limitsB[], + uniform int stride, + uniform int N) +{ + const uniform int lastSegmentElements = N % (2 * stride); + const uniform int mergePairs = (lastSegmentElements > stride) ? 
getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; + + + nTasks = mergePairs/(programCount); + + launch (nTasks,1,1, mergeElementaryIntervalsKernel)( + mergePairs, + dstKey, + dstVal, + srcKey, + srcVal, + limitsA, + limitsB, + stride, + N); + sync; +} + +__device__ static uniform int * uniform memPool = NULL; +__device__ static uniform int * uniform ranksA; +__device__ static uniform int * uniform ranksB; +__device__ static uniform int * uniform limitsA; +__device__ static uniform int * uniform limitsB; +__device__ static uniform int nTasks; +__device__ static uniform int MAX_SAMPLE_COUNT = 0; + +__global__ +void openMergeSort___export() +{ + nTasks = 13*32*13; + MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount; + assert(memPool == NULL); + const uniform int nalloc = MAX_SAMPLE_COUNT * 4; + memPool = uniform new uniform int[nalloc]; + ranksA = memPool; + ranksB = ranksA + MAX_SAMPLE_COUNT; + limitsA = ranksB + MAX_SAMPLE_COUNT; + limitsB = limitsA + MAX_SAMPLE_COUNT; +} +extern "C" +void openMergeSort() +{ + openMergeSort___export<<<1,1>>>(); + sync; +} + +__global__ +void closeMergeSort___export() +{ + assert(memPool != NULL); + delete memPool; + memPool = NULL; +} +extern "C" +void closeMergeSort() +{ + closeMergeSort___export<<<1,1>>>(); + sync; +} + +__global__ +void mergeSort___export( + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t bufKey[], + uniform Val_t bufVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int N) +{ + uniform int stageCount = 0; + for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++); + + uniform Key_t * uniform iKey, * uniform oKey; + uniform Val_t * uniform iVal, * uniform oVal; + + if (stageCount & 1) + { + iKey = bufKey; + iVal = bufVal; + oKey = dstKey; + oVal = dstVal; + } + else + { + iKey = dstKey; + iVal = dstVal; + oKey = bufKey; + oVal = bufVal; + } + + + + assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT); + assert(N % (programCount*2) == 0); + + // k20m: 140 M/s + { + // k20m: 2367 M/s + mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount)); + +#if 1 + for (uniform int stride = 2*programCount; stride < N; stride <<= 1) + { + const uniform int lastSegmentElements = N % (2 * stride); + + // k20m: 271 M/s + { +#if 1 + // k20m: 944 M/s + { + // k20m: 1396 M/s + //Find sample ranks and prepare for limiters merge + generateSampleRanks(ranksA, ranksB, iKey, stride, N); + + // k20m: 2379 M/s + //Merge ranks and indices + mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N); + } +#endif + + // k20m: 371 M/s + //Merge elementary intervals + mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); + } + + if (lastSegmentElements <= stride) + for (int i = programIndex; i < lastSegmentElements; i += programCount) + if (i < lastSegmentElements) + { + oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i]; + oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i]; + } + + + { + uniform Key_t * uniform tmpKey = iKey; + iKey = oKey; + oKey = tmpKey; + } + { + uniform Val_t * uniform tmpVal = iVal; + iVal = oVal; + oVal = tmpVal; + } + } +#endif + } +} +extern "C" +void mergeSort( + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t bufKey[], + uniform Val_t bufVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int N) +{ + mergeSort___export<<<1,32>>>( + dstKey, + dstVal, + bufKey, + bufVal, + srcKey, + srcVal, + N); + sync; +} diff --git a/examples/portable/mergeSort/mergeSort.ispc 
b/examples/portable/mergeSort/mergeSort.ispc new file mode 100644 index 00000000..127425e4 --- /dev/null +++ b/examples/portable/mergeSort/mergeSort.ispc @@ -0,0 +1,658 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on mergeSort from CUDA SDK + */ + +#include "keyType.h" + +#define SAMPLE_STRIDE programCount + +#define iDivUp(a,b) (((a) + (b) - 1)/(b)) +#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE))) + +#define W (/*sizeof(int)=*/4 * 8) + +static inline +int nextPowerOfTwo(int x) +{ +#if 0 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; +#else + return 1U << (W - count_leading_zeros(x - 1)); +#endif +} + +static inline +int binarySearchInclusiveRanks( + const int val, + uniform int *data, + const int L, + int stride) +{ + cif (L == 0) + return 0; + + int pos = 0; + cfor (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + cif (data[newPos - 1] <= val) + pos = newPos; + } + + return pos; +} + +static inline +int binarySearchExclusiveRanks( + const int val, + uniform int *data, + const int L, + int stride) +{ + cif (L == 0) + return 0; + + int pos = 0; + cfor (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] < val) + pos = newPos; + } + + return pos; +} + +static inline +int binarySearchInclusive( + const Key_t val, + uniform Key_t *data, + const int L, + int stride) +{ + cif (L == 0) + return 0; + + int pos = 0; + cfor (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] <= val) + pos = newPos; + } + + return pos; +} + +static inline +int binarySearchExclusive( + const Key_t val, + uniform Key_t *data, + const int L, + int stride) +{ + cif (L == 0) + return 0; + + int pos = 0; + cfor (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] < val) + pos = newPos; + } + + return pos; +} + +static inline +int binarySearchInclusive1( + const Key_t val, + Key_t data, + const uniform int L, + uniform 
int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (shuffle(data,newPos - 1) <= val) + pos = newPos; + } + + return pos; +} + +static inline +int binarySearchExclusive1( + const Key_t val, + Key_t data, + const uniform int L, + uniform int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (shuffle(data,newPos - 1) < val) + pos = newPos; + } + + return pos; +} + +//////////////////////////////////////////////////////////////////////////////// +// Bottom-level merge sort (binary search-based) +//////////////////////////////////////////////////////////////////////////////// +task +void mergeSortGangKernel( + uniform int batchSize, + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int arrayLength) +{ + const uniform int blockIdx = taskIndex; + const uniform int blockDim = (batchSize + taskCount - 1)/taskCount; + const uniform int blockBeg = blockIdx * blockDim; + const uniform int blockEnd = min(blockBeg + blockDim, batchSize); + + uniform Key_t s_key[2*programCount]; + uniform Val_t s_val[2*programCount]; + + for (uniform int block = blockBeg; block < blockEnd; block++) + { + const uniform int base = block * (programCount*2); + s_key[programIndex + 0] = srcKey[base + programIndex + 0]; + s_val[programIndex + 0] = srcVal[base + programIndex + 0]; + s_key[programIndex + programCount] = srcKey[base + programIndex + programCount]; + s_val[programIndex + programCount] = srcVal[base + programIndex + programCount]; + + for (uniform int stride = 1; stride < arrayLength; stride <<= 1) + { + const int lPos = programIndex & (stride - 1); + const int offset = 2 * (programIndex - lPos); + uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos); + uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos); + + Key_t keyA = baseKey[lPos + 0]; + Val_t valA = baseVal[lPos + 0]; + Key_t keyB = baseKey[lPos + stride]; + Val_t valB = baseVal[lPos + stride]; + + int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos; + int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos; + + baseKey[posA] = keyA; + baseVal[posA] = valA; + baseKey[posB] = keyB; + baseVal[posB] = valB; + } + + dstKey[base + programIndex + 0] = s_key[programIndex + 0]; + dstVal[base + programIndex + 0] = s_val[programIndex + 0]; + dstKey[base + programIndex + programCount] = s_key[programIndex + programCount]; + dstVal[base + programIndex + programCount] = s_val[programIndex + programCount]; + } +} + +static inline +void mergeSortGang( + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int batchSize) +{ + uniform int nTasks = num_cores()*4; +#ifdef __NVPTX__ + nTasks = iDivUp(batchSize,1); +#endif + launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount); + sync; +} + +//////////////////////////////////////////////////////////////////////////////// +// Merge step 1: generate sample ranks +//////////////////////////////////////////////////////////////////////////////// +task +void generateSampleRanksKernel( + uniform int nBlocks, + uniform int in_ranksA[], + uniform int in_ranksB[], + uniform Key_t in_srcKey[], + uniform int stride, + uniform int N, + uniform int totalProgramCount) +{ + const uniform int blockIdx = taskIndex; + const uniform int blockDim = 
(nBlocks + taskCount - 1)/taskCount; + const uniform int blockBeg = blockIdx * blockDim; + const uniform int blockEnd = min(blockBeg + blockDim, nBlocks); + + for (uniform int block = blockBeg; block < blockEnd; block++) + { + const int pos = block * programCount + programIndex; + cif (pos >= totalProgramCount) + return; + + const int i = pos & ((stride / SAMPLE_STRIDE) - 1); + const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + + uniform Key_t * srcKey = in_srcKey + segmentBase; + uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE; + uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE; + + const int segmentElementsA = stride; + const int segmentElementsB = min(stride, N - segmentBase - stride); + const int segmentSamplesA = getSampleCount(segmentElementsA); + const int segmentSamplesB = getSampleCount(segmentElementsB); + + if (i < segmentSamplesA) + { + ranksA[i] = i * SAMPLE_STRIDE; + ranksB[i] = binarySearchExclusive( + srcKey[i * SAMPLE_STRIDE], srcKey + stride, + segmentElementsB, nextPowerOfTwo(segmentElementsB)); + } + + if (i < segmentSamplesB) + { + ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE; + ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive( + srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0, + segmentElementsA, nextPowerOfTwo(segmentElementsA)); + } + } +} + +static inline +void generateSampleRanks( + uniform int ranksA[], + uniform int ranksB[], + uniform Key_t srcKey[], + uniform int stride, + uniform int N) +{ + uniform int lastSegmentElements = N % (2 * stride); + uniform int threadCount = (lastSegmentElements > stride) ? + (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : + (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + + uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); + uniform int nTasks = num_cores()*4; +#ifdef __NVPTX__ + nTasks = iDivUp(nBlocks,1); +#endif + + launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount); + sync; +} +//////////////////////////////////////////////////////////////////////////////// +// Merge step 2: generate sample ranks and indices +//////////////////////////////////////////////////////////////////////////////// +task +void mergeRanksAndIndicesKernel( + uniform int nBlocks, + uniform int in_Limits[], + uniform int in_Ranks[], + uniform int stride, + uniform int N, + uniform int totalProgramCount) +{ + const uniform int blockIdx = taskIndex; + const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount; + const uniform int blockBeg = blockIdx * blockDim; + const uniform int blockEnd = min(blockBeg + blockDim, nBlocks); + + for (uniform int block = blockBeg; block < blockEnd; block++) + { + int pos = block * programCount + programIndex; + cif (pos >= totalProgramCount) + return; + + const int i = pos & ((stride / SAMPLE_STRIDE) - 1); + const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + uniform int * ranks = in_Ranks + (pos - i) * 2; + uniform int * limits = in_Limits + (pos - i) * 2; + + const int segmentElementsA = stride; + const int segmentElementsB = min(stride, N - segmentBase - stride); + const int segmentSamplesA = getSampleCount(segmentElementsA); + const int segmentSamplesB = getSampleCount(segmentElementsB); + + if (i < segmentSamplesA) + { + int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i; + limits[dstPos] = ranks[i]; + } + + if (i < segmentSamplesB) + { + int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA 
+ i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i; + limits[dstPos] = ranks[segmentSamplesA + i]; + } + } +} +static inline +void mergeRanksAndIndices( + uniform int limitsA[], + uniform int limitsB[], + uniform int ranksA[], + uniform int ranksB[], + uniform int stride, + uniform int N) +{ + const uniform int lastSegmentElements = N % (2 * stride); + const uniform int threadCount = (lastSegmentElements > stride) ? + (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : + (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + + const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); + uniform int nTasks = num_cores()*4; + +#ifdef __NVPTX__ + nTasks = iDivUp(nBlocks,1); +#endif + + launch [nTasks] mergeRanksAndIndicesKernel( + nBlocks, + limitsA, + ranksA, + stride, + N, + threadCount); + launch [nTasks] mergeRanksAndIndicesKernel( + nBlocks, + limitsB, + ranksB, + stride, + N, + threadCount); + sync; +} + + +task +void mergeElementaryIntervalsKernel( + uniform int mergePairs, + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int limitsA[], + uniform int limitsB[], + uniform int stride, + uniform int N) +{ + const uniform int blockIdx = taskIndex; + const uniform int blockDim = (mergePairs + taskCount - 1)/taskCount; + const uniform int blockBeg = blockIdx * blockDim; + const uniform int blockEnd = min(blockBeg + blockDim, mergePairs); + + for (uniform int block = blockBeg; block < blockEnd; block++) + { + const int uniform intervalI = block & ((2 * stride) / SAMPLE_STRIDE - 1); + const int uniform segmentBase = (block - intervalI) * SAMPLE_STRIDE; + + //Set up threadblock-wide parameters + + const uniform int segmentElementsA = stride; + const uniform int segmentElementsB = min(stride, N - segmentBase - stride); + const uniform int segmentSamplesA = getSampleCount(segmentElementsA); + const uniform int segmentSamplesB = getSampleCount(segmentElementsB); + const uniform int segmentSamples = segmentSamplesA + segmentSamplesB; + + const uniform int startSrcA = limitsA[block]; + const uniform int startSrcB = limitsB[block]; + const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[block + 1] : segmentElementsA; + const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? 
limitsB[block + 1] : segmentElementsB; + const uniform int lenSrcA = endSrcA - startSrcA; + const uniform int lenSrcB = endSrcB - startSrcB; + const uniform int startDstA = startSrcA + startSrcB; + const uniform int startDstB = startDstA + lenSrcA; + + //Load main input data + + Key_t keyA, keyB; + Val_t valA, valB; + if (programIndex < lenSrcA) + { + keyA = srcKey[segmentBase + startSrcA + programIndex]; + valA = srcVal[segmentBase + startSrcA + programIndex]; + } + + if (programIndex < lenSrcB) + { + keyB = srcKey[segmentBase + stride + startSrcB + programIndex]; + valB = srcVal[segmentBase + stride + startSrcB + programIndex]; + } + + // Compute destination addresses for merge data + int dstPosA, dstPosB, dstA = -1, dstB = -1; + if (programIndex < lenSrcA) + dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex; + if (programIndex < lenSrcB) + dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex; + + if (programIndex < lenSrcA && dstPosA < lenSrcA) + dstA = segmentBase + startDstA + dstPosA; + dstPosA -= lenSrcA; + if (programIndex < lenSrcA && dstPosA < lenSrcB) + dstA = segmentBase + startDstB + dstPosA; + + if (programIndex < lenSrcB && dstPosB < lenSrcA) + dstB = segmentBase + startDstA + dstPosB; + dstPosB -= lenSrcA; + if (programIndex < lenSrcB && dstPosB < lenSrcB) + dstB = segmentBase + startDstB + dstPosB; + + if (dstA >= 0) + { + dstKey[dstA] = keyA; + dstVal[dstA] = valA; + } + if (dstB >= 0) + { + dstKey[dstB] = keyB; + dstVal[dstB] = valB; + } + } +} + +static inline +void mergeElementaryIntervals( + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int limitsA[], + uniform int limitsB[], + uniform int stride, + uniform int N) +{ + const uniform int lastSegmentElements = N % (2 * stride); + const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; + + + uniform int nTasks = num_cores()*4; +#ifdef __NVPTX__ + nTasks = iDivUp(mergePairs,1*programCount); +#endif + + launch [nTasks] mergeElementaryIntervalsKernel( + mergePairs, + dstKey, + dstVal, + srcKey, + srcVal, + limitsA, + limitsB, + stride, + N); + if (lastSegmentElements <= stride) + foreach (i = 0 ... 
lastSegmentElements) + { + dstKey[N-lastSegmentElements+i] = srcKey[N-lastSegmentElements+i]; + dstVal[N-lastSegmentElements+i] = srcVal[N-lastSegmentElements+i]; + } + sync; +} + +static uniform int * uniform memPool = NULL; +static uniform int * uniform ranksA; +static uniform int * uniform ranksB; +static uniform int * uniform limitsA; +static uniform int * uniform limitsB; +static uniform int MAX_SAMPLE_COUNT = 0; + +export +void openMergeSort() +{ + MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount; + assert(memPool == NULL); + const uniform int nalloc = MAX_SAMPLE_COUNT * 4; + memPool = uniform new uniform int[nalloc]; + ranksA = memPool; + ranksB = ranksA + MAX_SAMPLE_COUNT; + limitsA = ranksB + MAX_SAMPLE_COUNT; + limitsB = limitsA + MAX_SAMPLE_COUNT; +} + +export +void closeMergeSort() +{ + assert(memPool != NULL); + delete memPool; + memPool = NULL; +} + +export +void mergeSort( + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t bufKey[], + uniform Val_t bufVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int N) +{ + uniform int stageCount = 0; + for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++); + + uniform Key_t * uniform iKey, * uniform oKey; + uniform Val_t * uniform iVal, * uniform oVal; + + if (stageCount & 1) + { + iKey = bufKey; + iVal = bufVal; + oKey = dstKey; + oVal = dstVal; + } + else + { + iKey = dstKey; + iVal = dstVal; + oKey = bufKey; + oVal = bufVal; + } + + + + assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT); + assert(N % (programCount*2) == 0); + + // cpu: 28 gpu: 74 M/s + { + // cpu: 356 gpu: 534 M/s + mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount)); + +#if 1 + for (uniform int stride = 2*programCount; stride < N; stride <<= 1) + { + // cpu: 30 gpu: 112 M/s + { +#if 1 + // cpu: 121 gpu: 460 M/s + { + // cpu: 190 gpu: 600 M/s + //Find sample ranks and prepare for limiters merge + generateSampleRanks(ranksA, ranksB, iKey, stride, N); + + // cpu: 120 gpu: 457 M/s + //Merge ranks and indices + mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N); + } +#endif + + // cpu: 287 gpu: 194 M/s + //Merge elementary intervals + mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); + } + + { + uniform Key_t * uniform tmpKey = iKey; + iKey = oKey; + oKey = tmpKey; + } + { + uniform Val_t * uniform tmpVal = iVal; + iVal = oVal; + oVal = tmpVal; + } + } +#endif + } +} diff --git a/examples/portable/nbody_hermite4/Makefile_cpu b/examples/portable/nbody_hermite4/Makefile_cpu new file mode 100644 index 00000000..b3751669 --- /dev/null +++ b/examples/portable/nbody_hermite4/Makefile_cpu @@ -0,0 +1,8 @@ + +EXAMPLE=hermite4 +CPP_SRC=hermite4.cpp +ISPC_SRC=hermite4.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon + +include ../common_cpu.mk diff --git a/examples/portable/nbody_hermite4/Makefile_knc b/examples/portable/nbody_hermite4/Makefile_knc new file mode 100644 index 00000000..3fbdda9f --- /dev/null +++ b/examples/portable/nbody_hermite4/Makefile_knc @@ -0,0 +1,7 @@ +EXAMPLE=hermite4 +CXX_SRC=hermite4.cpp +ISPC_SRC=hermite4.ispc +ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h +ISPC_TARGET=generic-16 + +include ../common_knc.mk diff --git a/examples/portable/nbody_hermite4/Makefile_ptx b/examples/portable/nbody_hermite4/Makefile_ptx new file mode 100644 index 00000000..da8b268b --- /dev/null +++ b/examples/portable/nbody_hermite4/Makefile_ptx @@ -0,0 +1,14 @@ +PROG=hermite4 +ISPC_SRC=hermite4.ispc +#CU_SRC=hermite4.cu +CXX_SRC=hermite4.cpp +PTXCC_REGMAX=64 
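+# PTXCC_REGMAX is assumed to cap the per-thread register count when the
+# generated PTX is assembled (roughly, a ptxas --maxrregcount setting).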
+#ISPC_FLAGS= --opt=disable-uniform-control-flow + +#LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_ptx.mk + + + diff --git a/examples/portable/nbody_hermite4/hermite4.cpp b/examples/portable/nbody_hermite4/hermite4.cpp new file mode 100644 index 00000000..8e283e2d --- /dev/null +++ b/examples/portable/nbody_hermite4/hermite4.cpp @@ -0,0 +1,361 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* Hermite4 N-body integrator */ +/* Makino and Aarseth, 1992 */ +/* http://adsabs.harvard.edu/abs/1992PASJ...44..141M and references there in*/ + +#include +#include +#include +#include +#include +#include + +#include "timing.h" +#include "ispc_malloc.h" + +#include "typeReal.h" +#include "hermite4_ispc.h" + +struct Hermite4 +{ + enum {PP_FLOP=44}; + const int n; + const real eta; + real eps2; + real *g_mass, *g_gpot; + real *g_posx, *g_posy, *g_posz; + real *g_velx, *g_vely, *g_velz; + real *g_accx, *g_accy, *g_accz; + real *g_jrkx, *g_jrky, *g_jrkz; + + std::vector accx0, accy0, accz0; + std::vector jrkx0, jrky0, jrkz0; + + Hermite4(const int _n = 8192, const real _eta = 0.1) : n(_n), eta(_eta) + { + eps2 = 4.0/n; /* eps = 4/n to give Ebin = 1 KT */ + eps2 *= eps2; + g_mass = new real[n]; + g_gpot = new real[n]; + g_posx = new real[n]; + g_posy = new real[n]; + g_posz = new real[n]; + g_velx = new real[n]; + g_vely = new real[n]; + g_velz = new real[n]; + g_accx = new real[n]; + g_accy = new real[n]; + g_accz = new real[n]; + g_jrkx = new real[n]; + g_jrky = new real[n]; + g_jrkz = new real[n]; + + accx0.resize(n); + accy0.resize(n); + accz0.resize(n); + jrkx0.resize(n); + jrky0.resize(n); + jrkz0.resize(n); + + printf("---Intializing nbody--- \n"); + + const real R0 = 1; + const real mp = 1.0/n; +#pragma omp parallel for schedule(runtime) + for (int i = 0; i < n; i++) + { + real xp, yp, zp, s2 = 2*R0; + real vx, vy, vz; + while (s2 > R0*R0) { + xp = (1.0 - 2.0*drand48())*R0; + yp = (1.0 - 2.0*drand48())*R0; + zp = (1.0 - 2.0*drand48())*R0; + s2 = xp*xp + yp*yp + zp*zp; + vx = drand48() * 0.1; + vy = drand48() * 0.1; + vz = drand48() * 0.1; + } + g_posx[i] = xp; + g_posy[i] = yp; + g_posz[i] = zp; + g_velx[i] = vx; + g_vely[i] = vy; + g_velz[i] = vz; + g_mass[i] = mp; + } + } + + ~Hermite4() + { + delete g_mass; + delete g_gpot; + delete g_posx; + delete g_posy; + delete g_posz; + delete g_velx; + delete g_vely; + delete g_velz; + delete g_accx; + delete g_accy; + delete g_accz; + delete g_jrkx; + delete g_jrky; + delete g_jrkz; + } + + void forces(); + + real step(const real dt) + { + const real dt2 = dt*real(1.0/2.0); + const real dt3 = dt*real(1.0/3.0); + + real dt_min = HUGE; + +#pragma omp parallel for schedule(runtime) + for (int i = 0; i < n; i++) + { + accx0[i] = g_accx[i]; + accy0[i] = g_accy[i]; + accz0[i] = g_accz[i]; + jrkx0[i] = g_jrkx[i]; + jrky0[i] = g_jrky[i]; + jrkz0[i] = g_jrkz[i]; + + g_posx[i] += dt*(g_velx[i] + dt2*(g_accx[i] + dt3*g_jrkx[i])); + g_posy[i] += dt*(g_vely[i] + dt2*(g_accy[i] + dt3*g_jrky[i])); + g_posz[i] += dt*(g_velz[i] + dt2*(g_accz[i] + dt3*g_jrkz[i])); + + g_velx[i] += dt*(g_accx[i] + dt2*g_jrkx[i]); + g_vely[i] += dt*(g_accy[i] + dt2*g_jrky[i]); + g_velz[i] += dt*(g_accz[i] + dt2*g_jrkz[i]); + } + + forces(); + + if (dt > 0.0) + { + const real h = dt*real(0.5); + const real hinv = real(1.0)/h; + const real f1 = real(0.5)*hinv*hinv; + const real f2 = real(3.0)*hinv*f1; + + const real dt2 = dt *dt * real(1.0/2.0); + const real dt3 = dt2*dt * real(1.0/3.0); + const real dt4 = dt3*dt * real(1.0/4.0); + const real dt5 = dt4*dt * real(1.0/5.0); + +#pragma omp parallel for schedule(runtime) reduction(min:dt_min) + for (int i = 0; i < n; i++) + { + /* compute snp & crk */ + + const real Amx = g_accx[i] - accx0[i]; + const real Amy = g_accy[i] - accy0[i]; + const real Amz = g_accz[i] - accz0[i]; + + const real Jmx = h*(g_jrkx[i] - jrkx0[i]); + const real Jmy = h*(g_jrky[i] - jrky0[i]); + const real Jmz = h*(g_jrkz[i] - jrkz0[i]); + + const real 
Jpx = h*(g_jrkx[i] + jrkx0[i]); + const real Jpy = h*(g_jrky[i] + jrky0[i]); + const real Jpz = h*(g_jrkz[i] + jrkz0[i]); + + + real snpx = f1*Jmx; + real snpy = f1*Jmy; + real snpz = f1*Jmz; + + real crkx = f2*(Jpx - Amx); + real crky = f2*(Jpy - Amy); + real crkz = f2*(Jpz - Amz); + + snpx -= h*crkx; + snpy -= h*crky; + snpz -= h*crkz; + + /* correct */ + + g_posx[i] += dt4*snpx + dt5*crkx; + g_posy[i] += dt4*snpy + dt5*crky; + g_posz[i] += dt4*snpz + dt5*crkz; + + g_velx[i] += dt3*snpx + dt4*crkx; + g_vely[i] += dt3*snpy + dt4*crky; + g_velz[i] += dt3*snpz + dt4*crkz; + + /* compute new timestep */ + + const real s0 = g_accx[i]*g_accx[i] + g_accy[i]*g_accy[i] + g_accz[i]*g_accz[i]; + const real s1 = g_jrkx[i]*g_jrkx[i] + g_jrky[i]*g_jrky[i] + g_jrkz[i]*g_jrkz[i]; + const real s2 = snpx*snpx + snpy*snpy + snpz*snpz; + const real s3 = crkx*crkx + crky*crky + crkz*crkz; + + const double u = std::sqrt(s0*s2) + s1; + const double l = std::sqrt(s1*s3) + s2; + assert(l > 0.0f); + const real dt_loc = eta *std::sqrt(u/l); + dt_min = std::min(dt_min, dt_loc); + } + } + + if (dt_min == HUGE) + return dt; + else + return dt_min; + } + + void energy(real &Ekin, real &Epot) + { + real ekin = 0, epot = 0; + +#pragma omp parallel for reduction(+:ekin,epot) + for (int i = 0; i < n; i++) + { + ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f); + epot += real(0.5f)*g_mass[i] * g_gpot[i]; + } + Ekin = ekin; + Epot = epot; + } + + void integrate(const int niter, const real t_end = HUGE) + { + const double tin = rtc(); + forces(); + const double fn = n; + printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - tin, + fn*fn*PP_FLOP/(rtc() - tin)/1e9); + + real Epot0, Ekin0; + energy(Ekin0, Epot0); + const real Etot0 = Epot0 + Ekin0; + printf(" E: %g %g %g \n", Epot0, Ekin0, Etot0); + + ///////// + + real t_global = 0; + double t0 = 0; + int iter = 0; + int ntime = 10; + real dt = 1.0/131072; + real Epot, Ekin, Etot = Etot0; + while (t_global < t_end) { + if (iter % ntime == 0) + t0 = rtc(); + + if (iter >= niter) return; + + dt = step(dt); + iter++; + t_global += dt; + + const real Etot_pre = Etot; + energy(Ekin, Epot); + Etot = Ekin + Epot; + + if (iter % 1 == 0) { + const real Etot = Ekin + Epot; + printf("iter= %d: t= %g dt= %g Ekin= %g Epot= %g Etot= %g , dE = %g d(dE)= %g \n", + iter, t_global, dt, Ekin, Epot, Etot, (Etot - Etot0)/std::abs(Etot0), + (Etot - Etot_pre)/std::abs(Etot_pre) ); + } + + if (iter % ntime == 0) { + printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - t0, + fn*fn*PP_FLOP/(rtc() - t0)/1e9*ntime); + } + + fflush(stdout); + + } + } + +}; + + + +void Hermite4::forces() +{ + ispc::compute_forces( + n, + g_mass, + g_posx, + g_posy, + g_posz, + g_velx, + g_vely, + g_velz, + g_accx, + g_accy, + g_accz, + g_jrkx, + g_jrky, + g_jrkz, + g_gpot, + eps2); +} + +void run(const int nbodies, const real eta, const int nstep) +{ + Hermite4 h4(nbodies, eta); + h4.integrate(nstep); +} + +int main(int argc, char *argv[]) +{ + printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]); + + int nbodies = 8192; + if (argc > 1) nbodies = atoi(argv[1]); + + int nstep = 40; + if (argc > 2) nstep = atoi(argv[2]); + + float eta = 0.1; + if (argc > 3) eta = atof(argv[3]); + + + + printf("nbodies= %d\n", nbodies); + printf("nstep= %d\n", nstep); + printf(" eta= %g \n", eta); + + run(nbodies, eta, nstep); + + return 0; +} + diff --git a/examples/portable/nbody_hermite4/hermite4.ispc b/examples/portable/nbody_hermite4/hermite4.ispc new file mode 
100644 index 00000000..406b1ee8 --- /dev/null +++ b/examples/portable/nbody_hermite4/hermite4.ispc @@ -0,0 +1,197 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "typeReal.h" + +typedef real<3> vec3; +struct Force +{ + vec3 acc, jrk; + real pot, null; +}; + +struct Predictor +{ + vec3 pos, vel; +}; + +static inline +void body_body_force( + Force &fi, + const Predictor &pi, + const Predictor &pj, + const real mj, + const real eps2) +{ + const real dx = pj.pos.x - pi.pos.x; + const real dy = pj.pos.y - pi.pos.y; + const real dz = pj.pos.z - pi.pos.z; + + const real ds2 = dx*dx + dy*dy + dz*dz + eps2; + +#if 1 + const real inv_ds = rsqrt((float)ds2); +#else + const real inv_ds = rsqrt(ds2); +#endif + const real inv_ds2 = inv_ds*inv_ds; + const real minv_ds = inv_ds * mj; + const real minv_ds3 = inv_ds2 * minv_ds; + + + fi.acc.x += minv_ds3 * dx; + fi.acc.y += minv_ds3 * dy; + fi.acc.z += minv_ds3 * dz; + fi.pot -= minv_ds; + + const real dvx = pj.vel.x - pi.vel.x; + const real dvy = pj.vel.y - pi.vel.y; + const real dvz = pj.vel.z - pi.vel.z; + const real rv = dx*dvx + dy*dvy + dz*dvz; + + const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3); + + fi.jrk.x += minv_ds3*dvx + Jij*dx; + fi.jrk.y += minv_ds3*dvy + Jij*dy; + fi.jrk.z += minv_ds3*dvz + Jij*dz; +} + +task void compute_forces_task( + uniform const int n, + uniform const int nPerTask, + uniform const real mass[], + uniform const real posx[], + uniform const real posy[], + uniform const real posz[], + uniform const real velx[], + uniform const real vely[], + uniform const real velz[], + uniform real accx[], + uniform real accy[], + uniform real accz[], + uniform real jrkx[], + uniform real jrky[], + uniform real jrkz[], + uniform real gpot[], + const uniform real eps2) +{ + const uniform int nibeg = taskIndex * nPerTask; + const uniform int niend = min(n, nibeg + nPerTask); + + if (nibeg >= n) + return; + + uniform real shdata[7][programCount]; + + assert((n%programCount) == 0); + + foreach (i = nibeg ... 
niend) + { + Force fi; + fi.acc = (real)0.0; + fi.jrk = (real)0.0; + fi.pot = (real)0.0; + + Predictor pi; + pi.pos.x = posx[i]; + pi.pos.y = posy[i]; + pi.pos.z = posz[i]; + pi.vel.x = velx[i]; + pi.vel.y = vely[i]; + pi.vel.z = velz[i]; + + for (uniform int jb = 0; jb < n; jb += programCount) + { + const int jp = jb + programIndex; + shdata[0][programIndex] = posx[jp]; + shdata[1][programIndex] = posy[jp]; + shdata[2][programIndex] = posz[jp]; + shdata[3][programIndex] = mass[jp]; + shdata[4][programIndex] = velx[jp]; + shdata[5][programIndex] = vely[jp]; + shdata[6][programIndex] = velz[jp]; + + for (uniform int j = 0; j < programCount; j++) + { + Predictor pj; + pj.pos.x = shdata[0][j]; + pj.pos.y = shdata[1][j]; + pj.pos.z = shdata[2][j]; + pj.vel.x = shdata[4][j]; + pj.vel.y = shdata[5][j]; + pj.vel.z = shdata[6][j]; + const real jmass = shdata[3][j]; + body_body_force(fi,pi,pj,jmass,eps2); + } + } + + accx[i] = fi.acc.x; + accy[i] = fi.acc.y; + accz[i] = fi.acc.z; + jrkx[i] = fi.jrk.x; + jrky[i] = fi.jrk.y; + jrkz[i] = fi.jrk.z; + gpot[i] = fi.pot; + } +} + +export void compute_forces( + uniform const int n, + uniform const real mass[], + uniform const real posx[], + uniform const real posy[], + uniform const real posz[], + uniform const real velx[], + uniform const real vely[], + uniform const real velz[], + uniform real accx[], + uniform real accy[], + uniform real accz[], + uniform real jrkx[], + uniform real jrky[], + uniform real jrkz[], + uniform real gpot[], + const uniform real eps2) +{ + const uniform int nPerTask = min(128,programCount*8); + const uniform int nTask = (n+nPerTask-1)/nPerTask; + + launch [nTask] compute_forces_task( + n, nPerTask, + mass, + posx,posy,posz, + velx,vely,velz, + accx,accy,accz, + jrkx,jrky,jrkz, + gpot,eps2); +} diff --git a/examples/portable/nbody_hermite4/typeReal.h b/examples/portable/nbody_hermite4/typeReal.h new file mode 100644 index 00000000..064a6867 --- /dev/null +++ b/examples/portable/nbody_hermite4/typeReal.h @@ -0,0 +1,2 @@ +#pragma once +typedef double real; diff --git a/examples/portable/omp_tasksys.cpp b/examples/portable/omp_tasksys.cpp new file mode 100644 index 00000000..37281961 --- /dev/null +++ b/examples/portable/omp_tasksys.cpp @@ -0,0 +1,409 @@ +/* + Copyright (c) 2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#define DBG(x) +#include +#include + +#include +#include +#include +#include +#include +#include + +// Signature of ispc-generated 'task' functions +typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, + int taskIndex, int taskCount, + int taskIndex0, int taskIndex1, int taskIndex2, + int taskCount0, int taskCount1, int taskCount2); + +// Small structure used to hold the data for each task +#ifdef _MSC_VER +__declspec(align(16)) +#endif +struct TaskInfo { + TaskFuncType func; + void *data; + int taskIndex; + int taskCount3d[3]; +#if defined(ISPC_IS_WINDOWS) + event taskEvent; +#endif + int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; } + int taskIndex0() const + { + return taskIndex % taskCount3d[0]; + } + int taskIndex1() const + { + return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; + } + int taskIndex2() const + { + return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); + } + int taskCount0() const { return taskCount3d[0]; } + int taskCount1() const { return taskCount3d[1]; } + int taskCount2() const { return taskCount3d[2]; } + TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } +} +#ifndef _MSC_VER +__attribute__((aligned(32))); +#endif +; + +// ispc expects these functions to have C linkage / not be mangled +extern "C" { + void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz); + void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); + void ISPCSync(void *handle); +} + +/////////////////////////////////////////////////////////////////////////// +// TaskGroupBase + +#define LOG_TASK_QUEUE_CHUNK_SIZE 14 +#define MAX_TASK_QUEUE_CHUNKS 8 +#define TASK_QUEUE_CHUNK_SIZE (1<> LOG_TASK_QUEUE_CHUNK_SIZE); + int offset = index & (TASK_QUEUE_CHUNK_SIZE-1); + + if (chunk == MAX_TASK_QUEUE_CHUNKS) { + fprintf(stderr, "A total of %d tasks have been launched from the " + "current function--the simple built-in task system can handle " + "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE " + "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. " + "Sorry! 
Exiting.\n", index); + exit(1); + } + + if (taskInfo[chunk] == NULL) + taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE]; + return &taskInfo[chunk][offset]; +} + + +inline void * +TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) { + char *basePtr = memBuffers[curMemBuffer]; + intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset); + iptr = (iptr + (alignment-1)) & ~(alignment-1); + + int newOffset = int(iptr - (intptr_t)basePtr + size); + if (newOffset < memBufferSize[curMemBuffer]) { + curMemBufferOffset = newOffset; + return (char *)iptr; + } + + ++curMemBuffer; + curMemBufferOffset = 0; + assert(curMemBuffer < NUM_MEM_BUFFERS); + + int allocSize = 1 << (12 + curMemBuffer); + allocSize = std::max(int(size+alignment), allocSize); + char *newBuf = new char[allocSize]; + memBufferSize[curMemBuffer] = allocSize; + memBuffers[curMemBuffer] = newBuf; + return AllocMemory(size, alignment); +} + + +/////////////////////////////////////////////////////////////////////////// +// Atomics and the like + +static inline void +lMemFence() { + // Windows atomic functions already contain the fence + // KNC doesn't need the memory barrier +#if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS + __sync_synchronize(); +#endif +} + +static void * +lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) { +#ifdef ISPC_IS_WINDOWS + return InterlockedCompareExchangePointer(v, newValue, oldValue); +#else + void *result = __sync_val_compare_and_swap(v, oldValue, newValue); + lMemFence(); + return result; +#endif // ISPC_IS_WINDOWS +} + +static int32_t +lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) { +#ifdef ISPC_IS_WINDOWS + return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue); +#else + int32_t result = __sync_val_compare_and_swap(v, oldValue, newValue); + lMemFence(); + return result; +#endif // ISPC_IS_WINDOWS +} + +static inline int32_t +lAtomicAdd(volatile int32_t *v, int32_t delta) { +#ifdef ISPC_IS_WINDOWS + return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; +#else + return __sync_fetch_and_add(v, delta); +#endif +} + +/////////////////////////////////////////////////////////////////////////// + +class TaskGroup : public TaskGroupBase { +public: + void Launch(int baseIndex, int count); + void Sync(); + +}; + + +/////////////////////////////////////////////////////////////////////////// +// OpenMP + +static void +InitTaskSystem() { + // No initialization needed +} + +inline void +TaskGroup::Launch(int baseIndex, int count) { +#pragma omp parallel + { + const int threadIndex = omp_get_thread_num(); + const int threadCount = omp_get_num_threads(); + + TaskInfo ti = *GetTaskInfo(baseIndex); +#pragma omp for schedule(runtime) + for(int i = 0; i < count; i++) + { + ti.taskIndex = i; + + // Actually run the task. 
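+            // The ispc task ABI (TaskFuncType above) takes the OpenMP thread
+            // index/count plus the flat taskIndex/taskCount and their 3D
+            // decomposition, which TaskInfo reconstructs from taskCount3d[].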
+ ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(), + ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(), + ti.taskCount0(), ti.taskCount1(), ti.taskCount2()); + } + } +} + +inline void +TaskGroup::Sync() { +} + +/////////////////////////////////////////////////////////////////////////// + +#define MAX_FREE_TASK_GROUPS 64 +static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS]; + + static inline TaskGroup * +AllocTaskGroup() +{ + for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { + TaskGroup *tg = freeTaskGroups[i]; + if (tg != NULL) { + void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg); + if (ptr != NULL) { + return (TaskGroup *)ptr; + } + } + } + + return new TaskGroup; +} + + + static inline void +FreeTaskGroup(TaskGroup *tg) +{ + tg->Reset(); + + for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { + if (freeTaskGroups[i] == NULL) { + void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL); + if (ptr == NULL) + return; + } + } + + delete tg; +} + + void +ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) +{ + const int count = count0*count1*count2; + TaskGroup *taskGroup; + if (*taskGroupPtr == NULL) { + InitTaskSystem(); + taskGroup = AllocTaskGroup(); + *taskGroupPtr = taskGroup; + } + else + taskGroup = (TaskGroup *)(*taskGroupPtr); + + int baseIndex = taskGroup->AllocTaskInfo(count); + for (int i = 0; i < 1; ++i) { + TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i); + ti->func = (TaskFuncType)func; + ti->data = data; + ti->taskIndex = i; + ti->taskCount3d[0] = count0; + ti->taskCount3d[1] = count1; + ti->taskCount3d[2] = count2; + } + taskGroup->Launch(baseIndex, count); +} + + + void +ISPCSync(void *h) +{ + TaskGroup *taskGroup = (TaskGroup *)h; + if (taskGroup != NULL) { + taskGroup->Sync(); + FreeTaskGroup(taskGroup); + } +} + + + void * +ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) +{ + TaskGroup *taskGroup; + if (*taskGroupPtr == NULL) { + InitTaskSystem(); + taskGroup = AllocTaskGroup(); + *taskGroupPtr = taskGroup; + } + else + taskGroup = (TaskGroup *)(*taskGroupPtr); + + return taskGroup->AllocMemory(size, alignment); +} + diff --git a/examples/portable/options/.gitignore b/examples/portable/options/.gitignore new file mode 100644 index 00000000..55bdd069 --- /dev/null +++ b/examples/portable/options/.gitignore @@ -0,0 +1 @@ +options diff --git a/examples/portable/options/Makefile_cpu b/examples/portable/options/Makefile_cpu new file mode 100644 index 00000000..dc4f8108 --- /dev/null +++ b/examples/portable/options/Makefile_cpu @@ -0,0 +1,8 @@ + +EXAMPLE=options +CPP_SRC=options.cpp +ISPC_SRC=options.ispc +ISPC_IA_TARGETS=avx1-i32x16 +ISPC_ARM_TARGETS=neon + +include ../common_cpu.mk diff --git a/examples/portable/options/Makefile_knc b/examples/portable/options/Makefile_knc new file mode 100644 index 00000000..5c96ab94 --- /dev/null +++ b/examples/portable/options/Makefile_knc @@ -0,0 +1,7 @@ +EXAMPLE=options +CXX_SRC=options.cpp +ISPC_SRC=options.ispc +ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h +ISPC_TARGET=generic-16 + +include ../common_knc.mk diff --git a/examples/portable/options/Makefile_ptx b/examples/portable/options/Makefile_ptx new file mode 100644 index 00000000..1065eb92 --- /dev/null +++ b/examples/portable/options/Makefile_ptx @@ -0,0 +1,14 @@ +PROG=options +ISPC_SRC=options.ispc +CU_SRC=options.cu +CXX_SRC=options.cpp +PTXCC_REGMAX=128 + + +#LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_ptx.mk + + + diff --git 
a/examples/portable/options/options.cpp b/examples/portable/options/options.cpp new file mode 100644 index 00000000..037e30be --- /dev/null +++ b/examples/portable/options/options.cpp @@ -0,0 +1,120 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define NOMINMAX + +#include +#include +#include +#include +#include +#include +using std::max; + +#include "options_defs.h" +#include "timing.h" +#include "ispc_malloc.h" + +#include "options_ispc.h" +using namespace ispc; + +static void usage() { + printf("usage: options [--count=]\n"); +} + + +int main(int argc, char *argv[]) { + int nOptions = 128*1024; + + for (int i = 1; i < argc; ++i) { + if (strncmp(argv[i], "--count=", 8) == 0) { + nOptions = atoi(argv[i] + 8); + if (nOptions <= 0) { + usage(); + exit(1); + } + } + } + + float *S = new float[nOptions]; + float *X = new float[nOptions]; + float *T = new float[nOptions]; + float *r = new float[nOptions]; + float *v = new float[nOptions]; + float *result = new float[nOptions]; + + for (int i = 0; i < nOptions; ++i) { + S[i] = 100; // stock price + X[i] = 98; // option strike price + T[i] = 2; // time (years) + r[i] = .02; // risk-free interest rate + v[i] = 5; // volatility + } + + double sum; + + // + // Binomial options pricing model, ispc implementation, tasks + // + double binomial_tasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + binomial_put_ispc_tasks(S, X, T, r, v, result, nOptions); + double dt = get_elapsed_msec(); + binomial_tasks = std::min(binomial_tasks, dt); + } + sum = 0.; + for (int i = 0; i < nOptions; ++i) + sum += result[i]; + printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n", + binomial_tasks, sum / nOptions); + + // + // Black-Scholes options pricing model, ispc implementation, tasks + // + double bs_ispc_tasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + black_scholes_ispc_tasks(S, X, T, r, v, result, nOptions); + double dt = get_elapsed_msec(); + sum = 0.; + for (int i = 0; i < nOptions; ++i) + sum += result[i]; + bs_ispc_tasks = 
std::min(bs_ispc_tasks, dt); + } + printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n", + bs_ispc_tasks, sum / nOptions); + + + return 0; +} diff --git a/examples/portable/options/options.cu b/examples/portable/options/options.cu new file mode 100644 index 00000000..2465c53d --- /dev/null +++ b/examples/portable/options/options.cu @@ -0,0 +1,334 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "options_defs.h" +#include "cuda_helpers.cuh" + +__device__ static inline void __range_reduce_log(float input, float * reduced, + int * exponent) { + int int_version = __float_as_int(input); //intbits(input); + // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM + // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 + // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 + // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 + // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF + + //const int exponent_mask(0x7F800000) + const int nonexponent_mask = 0x807FFFFF; + + // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126 + const int exponent_neg1 = (126l << 23); + // NOTE(boulos): We don't need to mask anything out since we know + // the sign bit has to be 0. If it's 1, we need to return infinity/nan + // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). 
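+    // Net effect: the input is rewritten as x = reduced * 2^exponent with
+    // reduced in [0.5, 1), so log(x) = log(reduced) + exponent * ln(2) and the
+    // caller only needs a polynomial that is accurate on [0.5, 1).
+    // Worked example: x = 6.0f has biased exponent 129, so *exponent becomes 3
+    // and *reduced becomes 0.75f (6 = 0.75 * 2^3).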
+ int biased_exponent = int_version >> 23; // This number is [0, 255] but it means [-127, 128] + + int offset_exponent = biased_exponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 + *exponent = offset_exponent - 127; // get the real value + + // Blend the offset_exponent with the original input (do this in + // int for now, until I decide if float can have & and ¬) + int blended = (int_version & nonexponent_mask) | (exponent_neg1); + *reduced = __int_as_float(blended); //floatbits(blended); +} + + +__device__ static inline float __Logf(const float x_full) +{ +#if 1 + return __logf(x_full); +#else + float reduced; + int exponent; + + const int NaN_bits = 0x7fc00000; + const int Neg_Inf_bits = 0xFF800000; + const float NaN = __int_as_float(NaN_bits); //floatbits(NaN_bits); + const float neg_inf = __int_as_float(Neg_Inf_bits); //floatbits(Neg_Inf_bits); + bool use_nan = x_full < 0.f; + bool use_inf = x_full == 0.f; + bool exceptional = use_nan || use_inf; + const float one = 1.0f; + + float patched = exceptional ? one : x_full; + __range_reduce_log(patched, &reduced, &exponent); + + const float ln2 = 0.693147182464599609375f; + + float x1 = one - reduced; + const float c1 = 0.50000095367431640625f; + const float c2 = 0.33326041698455810546875f; + const float c3 = 0.2519190013408660888671875f; + const float c4 = 0.17541764676570892333984375f; + const float c5 = 0.3424419462680816650390625f; + const float c6 = -0.599632322788238525390625f; + const float c7 = +1.98442304134368896484375f; + const float c8 = -2.4899270534515380859375f; + const float c9 = +1.7491014003753662109375f; + + float result = x1 * c9 + c8; + result = x1 * result + c7; + result = x1 * result + c6; + result = x1 * result + c5; + result = x1 * result + c4; + result = x1 * result + c3; + result = x1 * result + c2; + result = x1 * result + c1; + result = x1 * result + one; + + // Equation was for -(ln(red)/(1-red)) + result *= -x1; + result += (float)(exponent) * ln2; + + return exceptional ? (use_nan ? NaN : neg_inf) : result; +#endif +} + +__device__ static inline float __Expf(const float x_full) +{ +#if 1 + return __expf(x_full); +#else + const float ln2_part1 = 0.6931457519f; + const float ln2_part2 = 1.4286067653e-6f; + const float one_over_ln2 = 1.44269502162933349609375f; + + float scaled = x_full * one_over_ln2; + float k_real = floor(scaled); + int k = (int)k_real; + + // Reduced range version of x + float x = x_full - k_real * ln2_part1; + x -= k_real * ln2_part2; + + // These coefficients are for e^x in [0, ln(2)] + const float one = 1.f; + const float c2 = 0.4999999105930328369140625f; + const float c3 = 0.166668415069580078125f; + const float c4 = 4.16539050638675689697265625e-2f; + const float c5 = 8.378830738365650177001953125e-3f; + const float c6 = 1.304379315115511417388916015625e-3f; + const float c7 = 2.7555381529964506626129150390625e-4f; + + float result = x * c7 + c6; + result = x * result + c5; + result = x * result + c4; + result = x * result + c3; + result = x * result + c2; + result = x * result + one; + result = x * result + one; + + // Compute 2^k (should differ for float and double, but I'll avoid + // it for now and just do floats) + const int fpbias = 127; + int biased_n = k + fpbias; + bool overflow = k > fpbias; + // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) + // we've got underflow. -127 * ln(2) -> -88.02. So the most + // negative float input that doesn't result in zero is like -88. 
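+    // The 2^k factor is rebuilt further down by writing k + 127 straight into
+    // the exponent field: e.g. k = 3 gives biased_n = 130, and 130 << 23
+    // reinterpreted as a float is exactly 8.0f.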
+ bool underflow = (biased_n <= 0); + const int InfBits = 0x7f800000; + biased_n <<= 23; + // Reinterpret this thing as float + float two_to_the_n = __int_as_float(biased_n); //floatbits(biased_n); + // Handle both doubles and floats (hopefully eliding the copy for float) + float elemtype_2n = two_to_the_n; + result *= elemtype_2n; +// result = overflow ? floatbits(InfBits) : result; + result = overflow ? __int_as_float(InfBits) : result; + result = underflow ? 0.0f : result; + return result; +#endif +} + +// Cumulative normal distribution function +// +__device__ +static inline float +CND(float X) { + float L = fabsf(X); + + float k = 1.0f / (1.0f + 0.2316419f * L); + float k2 = k*k; + float k3 = k2*k; + float k4 = k2*k2; + float k5 = k3*k2; + + const float invSqrt2Pi = 0.39894228040f; + float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + + -1.821255978f * k4 + 1.330274429f * k5); + w *= invSqrt2Pi * __Expf(-L * L * .5f); + + if (X > 0.f) + w = 1.0f - w; + return w; +} + +__global__ +void bs_task( float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + if (taskIndex >= taskCount) return; + int first = taskIndex * (count/taskCount); + int last = min(count, (int)((taskIndex+1) * (count/taskCount))); + + for (int i = programIndex + first; i < last; i += programCount) + if (i < last) + { + float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; + + float d1 = (__Logf(S/X) + (r + v * v * .5f) * T) / (v * sqrtf(T)); + float d2 = d1 - v * sqrtf(T); + + result[i] = S * CND(d1) - X * __Expf(-r * T) * CND(d2); + } +} + +extern "C" +__global__ void +black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); + launch(nTasks,1,1,bs_task) + (Sa, Xa, Ta, ra, va, result, count); + cudaDeviceSynchronize(); +} +extern "C" +__host__ void +black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + black_scholes_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count); + cudaDeviceSynchronize(); +} + +/********/ + + +template +struct loop +{ + __device__ static void op1(float V[], const float u, const float X, const float S) + { + const int j = NBEG; + float upow = powf(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = max(0.0f, X - S * upow); + loop::op1(V,u,X,S); + } + __device__ static void op2(float V[], const float Pu, const float disc) + { + const int j = NBEG; +#pragma unroll + for ( int k = 0; k < j; ++k) + V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc; + loop::op2(V, Pu,disc); + } +}; + +template +struct loop +{ + __device__ static void op1(float V[], const float u, const float X, const float S) {} + __device__ static void op2(float V[], const float Pu, const float disc) {} +}; + +__device__ +static inline float +binomial_put(float S, float X, float T, float r, float v) +{ + + float V[BINOMIAL_NUM]; + + float dt = T / BINOMIAL_NUM; + float u = exp(v * sqrt(dt)); + float d = 1.f / u; + float disc = exp(r * dt); + float Pu = (disc - d) / (u - d); + +#if 0 /* slow */ + for ( int j = 0; j < BINOMIAL_NUM; ++j) { + float upow = powf(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = max(0.0f, X - S * upow); + } + for ( int j = BINOMIAL_NUM-1; j >= 0; --j) + for ( int k = 0; k < j; ++k) + V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc; +#else /* with loop unrolling, stores resutls in registers */ + loop<0,BINOMIAL_NUM,1>::op1(V,u,X,S); + loop::op2(V, Pu, disc); 
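+    /* The recursive loop<> helpers above play the same role as the OP()/OP10()
+       macros in options.ispc: they force complete unrolling of the BINOMIAL_NUM
+       iterations so V[] can stay in registers instead of spilling to local
+       memory. */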
+#endif + return V[0]; +} + + + +__global__ void +binomial_task( float Sa[], float Xa[], + float Ta[], float ra[], + float va[], float result[], + int count) +{ + int first = taskIndex * (count/taskCount); + int last = min(count, (int)((taskIndex+1) * (count/taskCount))); + + for (int i = programIndex + first; i < last; i += programCount) + if (i < last) + { + float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; + result[i] = binomial_put(S, X, T, r, v); + } +} + + +extern "C" __global__ void +binomial_put_ispc_tasks___export( float Sa[], float Xa[], + float Ta[], float ra[], + float va[], float result[], + int count) { + int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); + launch(nTasks,1,1,binomial_task) + (Sa, Xa, Ta, ra, va, result, count); + cudaDeviceSynchronize(); +} +extern "C" +__host__ void +binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + + cudaDeviceSetCacheConfig (cudaFuncCachePreferL1); + binomial_put_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count); + cudaDeviceSynchronize(); +} diff --git a/examples/portable/options/options.ispc b/examples/portable/options/options.ispc new file mode 100644 index 00000000..61505cf4 --- /dev/null +++ b/examples/portable/options/options.ispc @@ -0,0 +1,211 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "options_defs.h" + +// Cumulative normal distribution function +static inline float +CND(float X) { + float L = abs(X); + + float k = 1.0 / (1.0 + 0.2316419 * L); + float k2 = k*k; + float k3 = k2*k; + float k4 = k2*k2; + float k5 = k3*k2; + + const float invSqrt2Pi = 0.39894228040f; + float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + + -1.821255978f * k4 + 1.330274429f * k5); + w *= invSqrt2Pi * exp(-L * L * .5f); + + if (X > 0.f) + w = 1.0 - w; + return w; +} + +task void +bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[], + uniform float ra[], uniform float va[], + uniform float result[], uniform int count) { + uniform int first = taskIndex * (count/taskCount); + uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount))); + + foreach (i = first ... last) { + float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; + + float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); + float d2 = d1 - v * sqrt(T); + + result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2); + } +} + +export void +black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[], + uniform float ra[], uniform float va[], + uniform float result[], uniform int count) { + uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); + launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count); +} + +/********/ + + +export void +black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], + uniform float ra[], uniform float va[], + uniform float result[], uniform int count) { + foreach (i = 0 ... count) { + float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; + + float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); + float d2 = d1 - v * sqrt(T); + + result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2); + } +} + + +static inline float +binomial_put(float S, float X, float T, float r, float v) { + float V[BINOMIAL_NUM]; + + float dt = T / BINOMIAL_NUM; + float u = exp(v * sqrt(dt)); + float d = 1. / u; + float disc = exp(r * dt); + float Pu = (disc - d) / (u - d); + +#ifndef __NVPTX__ + + for (uniform int j = 0; j < BINOMIAL_NUM; ++j) { + float upow = pow(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = max(0., X - S * upow); + } + for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j) + for (uniform int k = 0; k < j; ++k) + V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; + +#else + + /* loop unrolling helps NVVM to place V -> registers therefore boosting performance */ + /* takes looong time to compile... */ +#if BINOMIAL_NUM != 64 +#error "Cannot unroll. Please use generic version above" +#endif + + // with PTX target unroll loops which will store data in registers.. 
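+    // OP10(k) below expands to OP(k+0) ... OP(k+9), so OP10(0)..OP10(50) plus
+    // OP(60)..OP(63) cover j = 0..63 with no runtime loop; the second set then
+    // walks j back down from 63 to 0, mirroring the j-descending loop of the
+    // generic version above.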
+ + /* first loop */ + +#define OP(j) { \ + float upow = pow(u, (float)(2*(j)-BINOMIAL_NUM)); \ + V[j] = max(0., X - S * upow); } +#define OP10(k) \ + OP(k+0); OP(k+1); OP(k+2); OP(k+3); OP(k+4) \ + OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9); + OP10(0) + OP10(10) + OP10(20) + OP10(30) + OP10(40) + OP10(50) + OP(60) + OP(61) + OP(62) + OP(63) +#undef OP10 +#undef OP + + /* second loop */ + +#define OP(j) {\ + for (uniform int k = 0; k < (j); ++k) \ + V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; } +#define OP10(k) \ + OP(k+9); OP(k+8); OP(k+7); OP(k+6); OP(k+5); \ + OP(k+4); OP(k+3); OP(k+2); OP(k+1); OP(k+0); + OP(63) + OP(62) + OP(61) + OP(60) + OP10(50) + OP10(40) + OP10(30) + OP10(20) + OP10(10) + OP10(0) +#undef OP10 +#undef OP + +#endif + return V[0]; +} + + +export void +binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], + uniform float ra[], uniform float va[], + uniform float result[], uniform int count) { + foreach (i = 0 ... count) { + float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; + result[i] = binomial_put(S, X, T, r, v); + } +} + + +task void +binomial_task(uniform float Sa[], uniform float Xa[], + uniform float Ta[], uniform float ra[], + uniform float va[], uniform float result[], + uniform int count) { + uniform int first = taskIndex * (count/taskCount); + uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount))); + + foreach (i = first ... last) { + float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; + result[i] = binomial_put(S, X, T, r, v); + } +} + + +export void +binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[], + uniform float Ta[], uniform float ra[], + uniform float va[], uniform float result[], + uniform int count) { + uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); + launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count); +} diff --git a/examples/portable/options/options_defs.h b/examples/portable/options/options_defs.h new file mode 100644 index 00000000..4286a276 --- /dev/null +++ b/examples/portable/options/options_defs.h @@ -0,0 +1,40 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef OPTIONS_DEFS_H +#define OPTIONS_DEFS_H 1 + +#define BINOMIAL_NUM 64 + + +#endif // OPTIONS_DEFS_H diff --git a/examples/portable/radixSort/Makefile_cpu b/examples/portable/radixSort/Makefile_cpu new file mode 100644 index 00000000..1d3808dc --- /dev/null +++ b/examples/portable/radixSort/Makefile_cpu @@ -0,0 +1,9 @@ + +EXAMPLE=radixSort +CPP_SRC=radixSort.cpp +ISPC_SRC=radixSort.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon +#ISPC_FLAGS=-DDEBUG -g + +include ../common_cpu.mk diff --git a/examples/portable/radixSort/Makefile_knc b/examples/portable/radixSort/Makefile_knc new file mode 100644 index 00000000..1204364f --- /dev/null +++ b/examples/portable/radixSort/Makefile_knc @@ -0,0 +1,7 @@ +EXAMPLE=radixSort +CXX_SRC=radixSort.cpp +ISPC_SRC=radixSort.ispc +ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h +ISPC_TARGET=generic-16 + +include ../common_knc.mk diff --git a/examples/portable/radixSort/Makefile_ptx b/examples/portable/radixSort/Makefile_ptx new file mode 100644 index 00000000..da7494e4 --- /dev/null +++ b/examples/portable/radixSort/Makefile_ptx @@ -0,0 +1,15 @@ +PROG=radixSort +ISPC_SRC=radixSort.ispc + +CU_SRC=radixSort.cu +# NVCC_FLAGS=-Xptxas=-O1 +CXX_SRC=radixSort.cpp radixSort.cpp +PTXCC_REGMAX=64 + +LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_ptx.mk + + + diff --git a/examples/portable/radixSort/radixSort.cpp b/examples/portable/radixSort/radixSort.cpp new file mode 100644 index 00000000..b9f9dcca --- /dev/null +++ b/examples/portable/radixSort/radixSort.cpp @@ -0,0 +1,154 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include +#include "timing.h" +#include "ispc_malloc.h" +#include "radixSort_ispc.h" + +static void progressBar(const int x, const int n, const int width = 50) +{ + assert(n > 1); + assert(x >= 0 && x < n); + assert(width > 10); + const float f = static_cast(x)/(n-1); + const int w = static_cast(f * width); + + // print bar + std::string bstr("["); + for (int i = 0; i < width; i++) + bstr += i < w ? '=' : ' '; + bstr += "]"; + + // print percentage + char pstr0[32]; + sprintf(pstr0, " %2d %c ", static_cast(f*100.0),'%'); + const std::string pstr(pstr0); + std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2)); + + std::cout << bstr; + std::cout << (x == n-1 ? "\n" : "\r") << std::flush; +} + +struct Key +{ + int32_t key,val; +}; + +int main (int argc, char *argv[]) +{ + int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX; + double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0; + Key *keys = new Key [n]; + Key *keys_orig = new Key [n]; + unsigned int *keys_gold = new unsigned int [n]; + + srand48(rtc()*65536); + + int sortBits = 32; + assert(sortBits <= 32); + +#pragma omp parallel for + for (int i = 0; i < n; i++) + { + keys[i].key = ((int)(drand48() * (1<<30))) & ((1ULL << sortBits) - 1); + keys[i].val = i; + } + + std::random_shuffle(keys, keys + n); + +#pragma omp parallel for + for (int i = 0; i < n; i++) + { + keys_gold[i] = keys[i].key; + keys_orig[i] = keys[i]; + } + + ispcSetMallocHeapLimit(1024*1024*1024); + + ispc::radixSort_alloc(n); + + tISPC2 = 1e30; + for (i = 0; i < m; i ++) + { + ispcMemcpy(keys, keys_orig, n*sizeof(Key)); + reset_and_start_timer(); + ispc::radixSort(n, (int64_t*)keys, sortBits); + tISPC2 = std::min(tISPC2, get_elapsed_msec()); + if (argc != 3) + progressBar (i, m); + } + + ispc::radixSort_free(); + + printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2); + + std::sort(keys_gold, keys_gold + n); + for (int i = 0; i < n; i++) + assert(keys[i].key == keys_gold[i]); + + +#if 0 + for (i = 0; i < m; i ++) + { + ispcMemcpy(code, code_orig, n*sizeof(unsigned int)); + + reset_and_start_timer(); + + sort_serial (n, code, order); + + tSerial += get_elapsed_msec(); + + if (argc != 3) + progressBar (i, m); + } + + printf("[sort serial]:\t\t[%.3f] msec [%.3f Mpair/s]\n", tSerial, 1.0e-3*n*m/tSerial); + +#ifndef _CUDA_ + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2); +#else + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", tSerial/tISPC2); +#endif +#endif + + delete keys; + delete keys_orig; + delete keys_gold; + return 0; +} diff --git a/examples/portable/radixSort/radixSort.cu b/examples/portable/radixSort/radixSort.cu new file mode 100644 index 00000000..d7e8439b --- /dev/null +++ b/examples/portable/radixSort/radixSort.cu @@ -0,0 +1,401 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. 
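The #include directives at the top of radixSort.cpp appear without header names in this patch text. Judging from what the file uses (assert, printf/sprintf, drand48, std::string, std::cout, std::copy, std::random_shuffle, std::sort, std::min), a plausible set is sketched below; treat it as an assumption rather than the original list. The same caveat applies to the bare includes in rt.cpp and volume.cpp later in this diff.

#include <cstdio>      // printf, sprintf, fopen
#include <cstdlib>     // atoi, drand48/srand48
#include <cstring>     // raw memory helpers
#include <cassert>     // assert
#include <string>      // std::string in progressBar
#include <iostream>    // std::cout
#include <algorithm>   // std::sort, std::random_shuffle, std::min, std::copy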
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on radixSort from http://www.moderngpu.com + */ + +#include "cuda_helpers.cuh" +#include + +#define NUMBITS 8 +#define NUMDIGITS (1<> bit); + atomic_add_global(&counts[key], 1); + } + +#pragma unroll 8 + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + atomic_add_global(&countsGlobal[digit], counts[digit]); +} + +__global__ +void sortPass( + Key keysAll[], + Key sorted[], + int bit, + int numElements, + int digitOffsetsAll[]) +{ + const int blkIdx = taskIndex; + const int numBlocks = taskCount; + + const int blkDim = (numElements + numBlocks - 1) / numBlocks; + + + const int keyIndex = blkIdx * blkDim; + Key * keys = keysAll + keyIndex; + + + const int nloc = min(numElements - keyIndex, blkDim); + + const int mask = (1 << NUMBITS) - 1; + + /* copy digit offset from Gmem to Lmem */ +#if 1 + __shared__ int digitOffsets_sh[NUMDIGITS*4]; + volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS; + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit]; +#else + int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS]; +#endif + + + for (int i = programIndex; i < nloc; i += programCount) + if (i < nloc) + { + const int key = mask & ((unsigned int)keys[i] >> bit); + int scatter; + /* not a vector friendly loop */ +#pragma unroll 1 /* needed, otherwise compiler unroll and optimizes the result :S */ + for (int iv = 0; iv < programCount; iv++) + if (programIndex == iv) + scatter = digitOffsets[key]++; + sorted [scatter] = keys[i]; + } +} + +__global__ +void partialScanLocal( + int numBlocks, + int excScanAll[], + int countsAll[], + int partialSumAll[]) +{ + const int blkIdx = taskIndex; + + const int blkDim = (numBlocks+taskCount-1)/taskCount; + const int bbeg = blkIdx * blkDim; + const int bend = min(bbeg + blkDim, numBlocks); + + int (* countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll; + int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; + int (* partialSum)[NUMDIGITS] = ( int 
(*)[NUMDIGITS])partialSumAll; + +#pragma unroll 8 + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + { + int prev = bbeg == 0 ? excScanBlock[0][digit] : 0; + for ( int block = bbeg; block < bend; block++) + { + const int y = countsBlock[block][digit]; + excScanBlock[block][digit] = prev; + prev += y; + } + partialSum[blkIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit]; + } +} + +__global__ +void partialScanGlobal( + const int numBlocks, + int partialSumAll[], + int prefixSumAll[]) +{ + int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll; + int (* prefixSum)[NUMDIGITS] = ( int (*)[NUMDIGITS]) prefixSumAll; + const int digit = taskIndex; + int carry = 0; + for (int block = programIndex; block < numBlocks; block += programCount) + { + const int value = partialSum[block][digit]; + const int scan = exclusive_scan_add(value); + if (block < numBlocks) + prefixSum[block][digit] = scan + carry; + carry += __shfl(scan+value, programCount-1); + } +} + +__global__ +void completeScanGlobal( + int numBlocks, + int excScanAll[], + int carryValueAll[]) +{ + const int blkIdx = taskIndex; + const int blkDim = (numBlocks+taskCount-1)/taskCount; + const int bbeg = blkIdx * blkDim; + const int bend = min(bbeg + blkDim, numBlocks); + + int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; + int (* carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll; + +#pragma unroll 8 + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + { + const int carry = carryValue[blkIdx][digit]; + for ( int block = bbeg; block < bend; block++) + excScanBlock[block][digit] += carry; + } +} + +__device__ static +inline void radixExclusiveScan( + const int numBlocks, + int excScanPtr[], + int countsPtr[], + int partialSum[], + int prefixSum[]) +{ + const int scale = 8; + launch (numBlocks/scale, 1,1, partialScanLocal)(numBlocks, excScanPtr, countsPtr, partialSum); + sync; + + launch (NUMDIGITS,1,1,partialScanGlobal) (numBlocks/scale, partialSum, prefixSum); + sync; + + launch (numBlocks/scale,1,1, completeScanGlobal) (numBlocks, excScanPtr, prefixSum); + sync; +} + +__device__ static int * memoryPool = NULL; +__device__ static int numBlocks; +__device__ static int nSharedCounts; +__device__ static int nCountsGlobal; +__device__ static int nExcScan; +__device__ static int nCountsBlock; +__device__ static int nPartialSum; +__device__ static int nPrefixSum; + +__device__ static int * sharedCounts; +__device__ static int * countsGlobal; +__device__ static int * excScan; +__device__ static int * counts; +__device__ static int * partialSum; +__device__ static int * prefixSum; + +__device__ static int numElementsBuf = 0; +__device__ static Key * bufKeys; + +__global__ +void radixSort_alloc___export(const int n) +{ + assert(memoryPool == NULL); + numBlocks = 13*32*4; + nSharedCounts = NUMDIGITS*numBlocks; + nCountsGlobal = NUMDIGITS; + nExcScan = NUMDIGITS*numBlocks; + nCountsBlock = NUMDIGITS*numBlocks; + nPartialSum = NUMDIGITS*numBlocks; + nPrefixSum = NUMDIGITS*numBlocks; + + + const int nalloc = + nSharedCounts + + nCountsGlobal + + nExcScan + + nCountsBlock + + nPartialSum + + nPrefixSum; + + if (programIndex == 0) + memoryPool = new int[nalloc]; + + sharedCounts = memoryPool; + countsGlobal = sharedCounts + nSharedCounts; + excScan = countsGlobal + nCountsGlobal; + counts = excScan + nExcScan; + partialSum = counts + nCountsBlock; + prefixSum = partialSum + nPartialSum; +} + +extern "C" +void radixSort_alloc(const int n) +{ + 
radixSort_alloc___export<<<1,32>>>(n); + sync; +} + + +__device__ static +void radixSort_freeBufKeys() +{ + if (numElementsBuf > 0) + { + if (programIndex == 0) + delete bufKeys; + numElementsBuf = 0; + } +} + +__global__ void radixSort_free___export() +{ + assert(memoryPool != NULL); + if (programIndex == 0) + delete memoryPool; + memoryPool = NULL; + + radixSort_freeBufKeys(); +} +extern "C" +void radixSort_free() +{ + radixSort_free___export<<<1,32>>>(); + sync; +} + +__global__ void radixSort___export( + const int numElements, + Key keys[], + const int nBits) +{ +#ifdef __NVPTX__ + assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */ +#endif + + if (numElementsBuf < numElements) + radixSort_freeBufKeys(); + if (numElementsBuf == 0) + { + numElementsBuf = numElements; + if (programIndex == 0) + bufKeys = new Key[numElementsBuf]; + } + + const int blkDim = (numElements + numBlocks - 1) / numBlocks; + + for ( int bit = 0; bit < nBits; bit += NUMBITS) + { + /* initialize histogram for each digit */ + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + countsGlobal[digit] = 0; + + /* compute histogram for each digit */ + launch (numBlocks,1,1, countPass)(keys, bufKeys, bit, numElements, counts, countsGlobal); + sync; + + /* exclusive scan on global histogram */ + int carry = 0; + excScan[0] = 0; +#pragma unroll 8 + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + { + const int value = countsGlobal[digit]; + const int scan = exclusive_scan_add(value); + excScan[digit] = scan + carry; + carry += __shfl(scan+value, programCount-1); + } + + /* computing offsets for each digit */ + radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum); + + /* sorting */ + launch (numBlocks,1,1, + sortPass)( + bufKeys, + keys, + bit, + numElements, + excScan); + sync; + } +} + +extern "C" +void radixSort( + const int numElements, + Key keys[], + const int nBits) +{ + cudaDeviceSetCacheConfig ( cudaFuncCachePreferEqual ); + radixSort___export<<<1,32>>>(numElements, keys, nBits); + sync; +} diff --git a/examples/portable/radixSort/radixSort.ispc b/examples/portable/radixSort/radixSort.ispc new file mode 100644 index 00000000..5ddc8e73 --- /dev/null +++ b/examples/portable/radixSort/radixSort.ispc @@ -0,0 +1,337 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
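radixSort___export above consumes the keys NUMBITS (8) bits per pass; each pass is split into countPass (per-block digit histograms), radixExclusiveScan (histograms to per-block scatter offsets) and sortPass (the scatter itself). A single-threaded C++ sketch of one such pass may make that decomposition easier to follow. It treats each element as 64 bits whose low half carries the sort key, mirroring the host-side cast of Key* to int64_t*; the device-side Key typedef is not visible in this hunk, so that packing is an assumption.

#include <cstdint>
#include <vector>

// One radix pass (8 bits -> 256 digits): histogram, exclusive scan, scatter.
// "in" and "out" play the roles of keys[] and bufKeys[] in the kernels above.
static void radixPassRef(const std::vector<uint64_t> &in,
                         std::vector<uint64_t> &out, int bit) {
    const int NUMDIGITS = 1 << 8;
    out.resize(in.size());
    int counts[NUMDIGITS] = {0};

    // countPass: histogram of the digit currently being sorted on
    for (uint64_t k : in)
        counts[((uint32_t)k >> bit) & (NUMDIGITS - 1)]++;

    // radixExclusiveScan: digit counts -> starting offset of each digit
    int offsets[NUMDIGITS];
    int sum = 0;
    for (int d = 0; d < NUMDIGITS; ++d) {
        offsets[d] = sum;
        sum += counts[d];
    }

    // sortPass: stable scatter into the output buffer
    for (uint64_t k : in)
        out[offsets[((uint32_t)k >> bit) & (NUMDIGITS - 1)]++] = k;
}

Running this for bit = 0, 8, 16, 24 (swapping in and out between passes) reproduces the ordering the task-parallel version produces for 32-bit keys.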
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on radixSort from http://www.moderngpu.com + */ + +#define NUMBITS 8 +#define NUMDIGITS (1<> bit); +#ifdef __NVPTX__ + atomic_add_global(&counts[key], 1); +#else + atomic_add_local(&counts[key], 1); +#endif + } + + foreach (digit = 0 ... NUMDIGITS) + atomic_add_global(&countsGlobal[digit], counts[digit]); +} + +task +void sortPass( + uniform Key keysAll[], + uniform Key sorted[], + uniform int bit, + uniform int numElements, + uniform int digitOffsetsAll[]) +{ + const uniform int blockIdx = taskIndex; + const uniform int numBlocks = taskCount; + + const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks; + + + const uniform int keyIndex = blockIdx * blockDim; + uniform Key * uniform keys = keysAll + keyIndex; + + + const uniform int nloc = min(numElements - keyIndex, blockDim); + + const uniform int mask = (1 << NUMBITS) - 1; + + /* copy digit offset from Gmem to Lmem */ +#if 1 + uniform int digitOffsets[NUMDIGITS]; + foreach (digit = 0 ... NUMDIGITS) + digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit]; +#else + uniform int * uniform digitOffsets = &digitOffsetsAll[blockIdx*NUMDIGITS]; +#endif + + foreach (i = 0 ... nloc) + { + const int key = mask & ((unsigned int)keys[i] >> bit); + int scatter; + /* not a vector friendly loop */ + foreach_active(iv) + scatter = digitOffsets[key]++; + sorted[scatter] = keys[i]; + } +} + +task +void partialScanLocal( + uniform int numBlocks, + uniform int excScanAll[], + uniform int countsAll[], + uniform int partialSumAll[]) +{ + const uniform int blockIdx = taskIndex; + + const uniform int blockDim = (numBlocks+taskCount-1)/taskCount; + const uniform int bbeg = blockIdx * blockDim; + const uniform int bend = min(bbeg + blockDim, numBlocks); + + uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll; + uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll; + uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll; + + foreach (digit = 0 ... NUMDIGITS) + { + int prev = bbeg == 0 ? excScanBlock[0][digit] : 0; + for (uniform int block = bbeg; block < bend; block++) + { + const int y = countsBlock[block][digit]; + excScanBlock[block][digit] = prev; + prev += y; + } + partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit]; + } +} + +task +void partialScanGlobal( + const uniform int numBlocks, + uniform int partialSumAll[], + uniform int prefixSumAll[]) +{ + uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll; + uniform int (* uniform prefixSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS]) prefixSumAll; + const uniform int digit = taskIndex; + int carry = 0; + foreach (block = 0 ... 
numBlocks) + { + const int value = partialSum[block][digit]; + const int scan = exclusive_scan_add(value); + prefixSum[block][digit] = scan + carry; + carry += broadcast(scan+value, programCount-1); + } +} + +task +void completeScanGlobal( + uniform int numBlocks, + uniform int excScanAll[], + uniform int carryValueAll[]) +{ + const uniform int blockIdx = taskIndex; + const uniform int blockDim = (numBlocks+taskCount-1)/taskCount; + const uniform int bbeg = blockIdx * blockDim; + const uniform int bend = min(bbeg + blockDim, numBlocks); + + uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll; + uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll; + + foreach (digit = 0 ... NUMDIGITS) + { + const int carry = carryValue[blockIdx][digit]; + for (uniform int block = bbeg; block < bend; block++) + excScanBlock[block][digit] += carry; + } +} + +static +inline void radixExclusiveScan( + const uniform int numBlocks, + uniform int excScanPtr[], + uniform int countsPtr[], + uniform int partialSum[], + uniform int prefixSum[]) +{ + const uniform int scale = 8; + launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum); + sync; + + launch [NUMDIGITS] partialScanGlobal(numBlocks/scale, partialSum, prefixSum); + sync; + + launch [numBlocks/scale] completeScanGlobal(numBlocks, excScanPtr, prefixSum); + sync; +} + +static uniform int * uniform memoryPool = NULL; +static uniform int numBlocks; +static uniform int nSharedCounts; +static uniform int nCountsGlobal; +static uniform int nExcScan; +static uniform int nCountsBlock; +static uniform int nPartialSum; +static uniform int nPrefixSum; + +static uniform int * uniform sharedCounts; +static uniform int * uniform countsGlobal; +static uniform int * uniform excScan; +static uniform int * uniform counts; +static uniform int * uniform partialSum; +static uniform int * uniform prefixSum; + +static uniform int numElementsBuf = 0; +static uniform Key * uniform bufKeys; + +export void radixSort_alloc(const uniform int n) +{ + assert(memoryPool == NULL); + numBlocks = num_cores()*4; +#ifdef __NVPTX__ + numBlocks = 13*32*4; //num_cores()*4; +#endif + nSharedCounts = NUMDIGITS*numBlocks; + nCountsGlobal = NUMDIGITS; + nExcScan = NUMDIGITS*numBlocks; + nCountsBlock = NUMDIGITS*numBlocks; + nPartialSum = NUMDIGITS*numBlocks; + nPrefixSum = NUMDIGITS*numBlocks; + + + const uniform int nalloc = + nSharedCounts + + nCountsGlobal + + nExcScan + + nCountsBlock + + nPartialSum + + nPrefixSum; + + memoryPool = uniform new uniform int[nalloc]; + + sharedCounts = memoryPool; + countsGlobal = sharedCounts + nSharedCounts; + excScan = countsGlobal + nCountsGlobal; + counts = excScan + nExcScan; + partialSum = counts + nCountsBlock; + prefixSum = partialSum + nPartialSum; +} + +static +void radixSort_freeBufKeys() +{ + if (numElementsBuf > 0) + { + delete bufKeys; + numElementsBuf = 0; + } +} + +export void radixSort_free() +{ + assert(memoryPool != NULL); + delete memoryPool; + memoryPool = NULL; + + radixSort_freeBufKeys(); +} + +export void radixSort( + const uniform int numElements, + uniform Key keys[], + const uniform int nBits) +{ +#ifdef __NVPTX__ + assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */ +#endif + + if (numElementsBuf < numElements) + radixSort_freeBufKeys(); + if (numElementsBuf == 0) + { + numElementsBuf = numElements; + bufKeys = uniform new uniform Key[numElementsBuf]; + } + + const uniform int blockDim = (numElements + numBlocks - 1) 
/ numBlocks; + + for (uniform int bit = 0; bit < nBits; bit += NUMBITS) + { + /* initialize histogram for each digit */ + foreach (digit = 0 ... NUMDIGITS) + countsGlobal[digit] = 0; + + /* compute histogram for each digit */ + launch [numBlocks] countPass(keys, bufKeys, bit, numElements, counts, countsGlobal); + sync; + + /* exclusive scan on global histogram */ + int carry = 0; + excScan[0] = 0; + foreach (digit = 0 ... NUMDIGITS) + { + const int value = countsGlobal[digit]; + const int scan = exclusive_scan_add(value); + excScan[digit] = scan + carry; + carry += broadcast(scan+value, programCount-1); + } + + /* computing offsets for each digit */ + radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum); + + /* sorting */ + launch [numBlocks] + sortPass( + bufKeys, + keys, + bit, + numElements, + excScan); + sync; + } + +} diff --git a/examples/portable/rt/.gitignore b/examples/portable/rt/.gitignore new file mode 100644 index 00000000..5a95423b --- /dev/null +++ b/examples/portable/rt/.gitignore @@ -0,0 +1,2 @@ +rt +*.ppm diff --git a/examples/portable/rt/Makefile_cpu b/examples/portable/rt/Makefile_cpu new file mode 100644 index 00000000..9cf3de47 --- /dev/null +++ b/examples/portable/rt/Makefile_cpu @@ -0,0 +1,8 @@ + +EXAMPLE=rt +CPP_SRC=rt.cpp +ISPC_SRC=rt.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon + +include ../common_cpu.mk diff --git a/examples/portable/rt/Makefile_knc b/examples/portable/rt/Makefile_knc new file mode 100644 index 00000000..188acad7 --- /dev/null +++ b/examples/portable/rt/Makefile_knc @@ -0,0 +1,7 @@ +EXAMPLE=rt +CXX_SRC=rt.cpp +ISPC_SRC=rt.ispc +ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h +ISPC_TARGET=generic-16 + +include ../common_knc.mk diff --git a/examples/portable/rt/Makefile_ptx b/examples/portable/rt/Makefile_ptx new file mode 100644 index 00000000..45eae8c6 --- /dev/null +++ b/examples/portable/rt/Makefile_ptx @@ -0,0 +1,13 @@ +PROG=rt +ISPC_SRC=rt.ispc +CU_SRC=rt.cu +CXX_SRC=rt.cpp +PTXCC_REGMAX=32 + +#LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_ptx.mk + + + diff --git a/examples/portable/rt/cornell.bvh b/examples/portable/rt/cornell.bvh new file mode 120000 index 00000000..61b2c8af --- /dev/null +++ b/examples/portable/rt/cornell.bvh @@ -0,0 +1 @@ +../../rt/cornell.bvh \ No newline at end of file diff --git a/examples/portable/rt/cornell.camera b/examples/portable/rt/cornell.camera new file mode 120000 index 00000000..a1cdfd79 --- /dev/null +++ b/examples/portable/rt/cornell.camera @@ -0,0 +1 @@ +../../rt/cornell.camera \ No newline at end of file diff --git a/examples/portable/rt/rt.cpp b/examples/portable/rt/rt.cpp new file mode 100644 index 00000000..d4ad1672 --- /dev/null +++ b/examples/portable/rt/rt.cpp @@ -0,0 +1,229 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
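The exclusive-scan loop near the top of this hunk (and its CUDA counterpart earlier) builds a prefix sum over the 256 digit counts one gang width at a time: exclusive_scan_add gives the within-chunk scan, and broadcast(scan + value, programCount - 1) reads the chunk total out of the last lane so it can be carried into the next chunk. A scalar C++ emulation of that idiom, with W standing in for programCount, is sketched below.

#include <algorithm>
#include <cstddef>
#include <vector>

// Chunked exclusive prefix sum, emulating the exclusive_scan_add + broadcast
// carry idiom; W plays the role of the gang size (programCount).
static std::vector<int> exclusiveScanChunked(const std::vector<int> &v, std::size_t W) {
    std::vector<int> out(v.size());
    int carry = 0;                        // total of all previous chunks
    for (std::size_t base = 0; base < v.size(); base += W) {
        int chunkSum = 0;                 // exclusive scan within the chunk
        for (std::size_t i = base; i < std::min(v.size(), base + W); ++i) {
            out[i] = chunkSum + carry;    // excScan[digit] = scan + carry
            chunkSum += v[i];             // after the loop: the chunk total
        }
        carry += chunkSum;                // broadcast(scan + value, last lane)
    }
    return out;
}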
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#include +#include +#include "timing.h" +#include "rt_ispc.h" +#include "ispc_malloc.h" + +using namespace ispc; + +typedef unsigned int uint; + +static void writeImage(int *idImage, float *depthImage, int width, int height, + const char *filename) { + FILE *f = fopen(filename, "wb"); + if (!f) { + perror(filename); + exit(1); + } + + fprintf(f, "P6\n%d %d\n255\n", width, height); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + // use the bits from the object id of the hit object to make a + // random color + int id = idImage[y * width + x]; + unsigned char r = 0, g = 0, b = 0; + + for (int i = 0; i < 8; ++i) { + // extract bit 3*i for red, 3*i+1 for green, 3*i+2 for blue + int rbit = (id & (1 << (3*i))) >> (3*i); + int gbit = (id & (1 << (3*i+1))) >> (3*i+1); + int bbit = (id & (1 << (3*i+2))) >> (3*i+2); + // and then set the bits of the colors starting from the + // high bits... 
+ r |= rbit << (7-i); + g |= gbit << (7-i); + b |= bbit << (7-i); + } + fputc(r, f); + fputc(g, f); + fputc(b, f); + } + } + fclose(f); + printf("Wrote image file %s\n", filename); +} + + +static void usage() { + fprintf(stderr, "rt [--scale=] [ispc iterations] [tasks iterations] [serial iterations]\n"); + exit(1); +} + + +int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 7, 1}; + float scale = 1.f; + const char *filename = NULL; + if (argc < 2) usage(); + filename = argv[1]; + if (argc > 2) { + if (strncmp(argv[2], "--scale=", 8) == 0) { + scale = atof(argv[2] + 8); + } + } + if ((argc == 6) || (argc == 5)) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[argc - 3 + i]); + } + } + +#define READ(var, n) \ + if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \ + fprintf(stderr, "Unexpected EOF reading scene file\n"); \ + return 1; \ + } else /* eat ; */ + + // + // Read the camera specification information from the camera file + // + char fnbuf[1024]; + sprintf(fnbuf, "%s.camera", filename); + FILE *f = fopen(fnbuf, "rb"); + if (!f) { + perror(fnbuf); + return 1; + } + + // + // Nothing fancy, and trouble if we run on a big-endian system, just + // fread in the bits + // + int baseWidth, baseHeight; +// float camera2world[4][4], raster2camera[4][4]; + float *camera2world_ispc = new float[4*4]; + float *raster2camera_ispc = new float[4*4]; + float (*camera2world )[4] = (float (*)[4])camera2world_ispc; + float (*raster2camera)[4] = (float (*)[4])raster2camera_ispc; + READ(baseWidth, 1); + READ(baseHeight, 1); + READ(camera2world[0][0], 16); + READ(raster2camera[0][0], 16); + + // + // Read in the serialized BVH + // + sprintf(fnbuf, "%s.bvh", filename); + f = fopen(fnbuf, "rb"); + if (!f) { + perror(fnbuf); + return 1; + } + + // The BVH file starts with an int that gives the total number of BVH + // nodes + uint nNodes; + READ(nNodes, 1); + + LinearBVHNode *nodes = new LinearBVHNode[nNodes]; + for (unsigned int i = 0; i < nNodes; ++i) { + // Each node is 6x floats for a boox, then an integer for an offset + // to the second child node, then an integer that encodes the type + // of node, the total number of int it if a leaf node, etc. 
+ float b[6]; + READ(b[0], 6); + nodes[i].bounds[0][0] = b[0]; + nodes[i].bounds[0][1] = b[1]; + nodes[i].bounds[0][2] = b[2]; + nodes[i].bounds[1][0] = b[3]; + nodes[i].bounds[1][1] = b[4]; + nodes[i].bounds[1][2] = b[5]; + READ(nodes[i].offset, 1); + READ(nodes[i].nPrimitives, 1); + READ(nodes[i].splitAxis, 1); + READ(nodes[i].pad, 1); + } + + // And then read the triangles + uint nTris; + READ(nTris, 1); + Triangle *triangles = new Triangle[nTris]; + for (uint i = 0; i < nTris; ++i) { + // 9x floats for the 3 vertices + float v[9]; + READ(v[0], 9); + float *vp = v; + for (int j = 0; j < 3; ++j) { + triangles[i].p[j][0] = *vp++; + triangles[i].p[j][1] = *vp++; + triangles[i].p[j][2] = *vp++; + } + // And create an object id + triangles[i].id = i+1; + } + fclose(f); + + int height = int(baseHeight * scale); + int width = int(baseWidth * scale); + + // allocate images; one to hold hit object ids, one to hold depth to + // the first interseciton + int *id = new int[width*height]; + float *image = new float[width*height]; + + ispc_memset(id, 0, width*height*sizeof(int)); + ispc_memset(image, 0, width*height*sizeof(float)); + + // + // Run 3 iterations with ispc + 1 core, record the minimum time + // + double minTimeISPCtasks = 1e30; + for (int i = 0; i < test_iterations[1]; ++i) { + reset_and_start_timer(); + raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera, + camera2world, image, id, nodes, triangles); + double dt = get_elapsed_msec(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt); + minTimeISPCtasks = std::min(dt, minTimeISPCtasks); + } + printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n", + minTimeISPCtasks, width, height); + + writeImage(id, image, width, height, "rt-ispc-tasks.ppm"); + + return 0; +} diff --git a/examples/portable/rt/rt.cu b/examples/portable/rt/rt.cu new file mode 100644 index 00000000..352edcb8 --- /dev/null +++ b/examples/portable/rt/rt.cu @@ -0,0 +1,373 @@ +#include "cuda_helpers.cuh" + +#define float3 Float3 +struct Float3 +{ + float x,y,z; + __device__ friend Float3 operator+(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x+b.x; + c.y = a.y+b.y; + c.z = a.z+b.z; + return c; + } + __device__ friend Float3 operator-(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x-b.x; + c.y = a.y-b.y; + c.z = a.z-b.z; + return c; + } + __device__ friend Float3 operator/(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x/b.x; + c.y = a.y/b.y; + c.z = a.z/b.z; + return c; + } + __device__ friend Float3 operator/(const float a, const Float3 b) + { + Float3 c; + c.x = a/b.x; + c.y = a/b.y; + c.z = a/b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x*b.x; + c.y = a.y*b.y; + c.z = a.z*b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const float b) + { + Float3 c; + c.x = a.x*b; + c.y = a.y*b; + c.z = a.z*b; + return c; + } +}; + +#define int8 char +#define int16 short + +struct Ray { + float3 origin, dir, invDir; + unsigned int dirIsNeg0, dirIsNeg1, dirIsNeg2; + float mint, maxt; + int hitId; +}; + +struct Triangle { + float p[3][4]; + int id; + int pad[3]; +}; + +struct LinearBVHNode { + float bounds[2][3]; + unsigned int offset; // num primitives for leaf, second child for interior + unsigned int8 nPrimitives; + unsigned int8 splitAxis; + unsigned int16 pad; +}; + +__device__ +static inline float3 Cross(const float3 v1, const float3 v2) { + float v1x = v1.x, v1y = v1.y, v1z = v1.z; + float v2x = v2.x, v2y = v2.y, 
v2z = v2.z; + float3 ret; + ret.x = (v1y * v2z) - (v1z * v2y); + ret.y = (v1z * v2x) - (v1x * v2z); + ret.z = (v1x * v2y) - (v1y * v2x); + return ret; +} + +__device__ +static inline float Dot(const float3 a, const float3 b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +__device__ +inline +static void generateRay( const float raster2camera[4][4], + const float camera2world[4][4], + float x, float y, Ray &ray) { + ray.mint = 0.f; + ray.maxt = 1e30f; + + ray.hitId = 0; + + // transform raster coordinate (x, y, 0) to camera space + float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; + float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; + float camz = raster2camera[2][3]; + float camw = raster2camera[3][3]; + camx /= camw; + camy /= camw; + camz /= camw; + + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + + camera2world[0][2] * camz; + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + + camera2world[1][2] * camz; + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + + camera2world[2][2] * camz; + + ray.origin.x = camera2world[0][3] / camera2world[3][3]; + ray.origin.y = camera2world[1][3] / camera2world[3][3]; + ray.origin.z = camera2world[2][3] / camera2world[3][3]; + + ray.invDir = 1.f / ray.dir; + +#if 0 + ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0; + ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0; + ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0; +#else + ray.dirIsNeg0 = any(ray.invDir.x < 0) ? 1 : 0; + ray.dirIsNeg1 = any(ray.invDir.y < 0) ? 1 : 0; + ray.dirIsNeg2 = any(ray.invDir.z < 0) ? 1 : 0; +#endif +} + +__device__ +inline +static bool BBoxIntersect(const float bounds[2][3], + const Ray &ray) { + float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] }; + float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] }; + float t0 = ray.mint, t1 = ray.maxt; + + // Check all three axis-aligned slabs. Don't try to early out; it's + // not worth the trouble + float3 tNear = (bounds0 - ray.origin) * ray.invDir; + float3 tFar = (bounds1 - ray.origin) * ray.invDir; + if (tNear.x > tFar.x) { + float tmp = tNear.x; + tNear.x = tFar.x; + tFar.x = tmp; + } + t0 = max(tNear.x, t0); + t1 = min(tFar.x, t1); + + if (tNear.y > tFar.y) { + float tmp = tNear.y; + tNear.y = tFar.y; + tFar.y = tmp; + } + t0 = max(tNear.y, t0); + t1 = min(tFar.y, t1); + + if (tNear.z > tFar.z) { + float tmp = tNear.z; + tNear.z = tFar.z; + tFar.z = tmp; + } + t0 = max(tNear.z, t0); + t1 = min(tFar.z, t1); + + return (t0 <= t1); +} + + +__device__ +inline +static bool TriIntersect(const Triangle &tri, Ray &ray) { + float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] }; + float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] }; + float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] }; + float3 e1 = p1 - p0; + float3 e2 = p2 - p0; + + float3 s1 = Cross(ray.dir, e2); + float divisor = Dot(s1, e1); + bool hit = true; + + if (divisor == 0.) + hit = false; + float invDivisor = 1.f / divisor; + + // Compute first barycentric coordinate + float3 d = ray.origin - p0; + float b1 = Dot(d, s1) * invDivisor; + if (b1 < 0. || b1 > 1.) + hit = false; + + // Compute second barycentric coordinate + float3 s2 = Cross(d, e1); + float b2 = Dot(ray.dir, s2) * invDivisor; + if (b2 < 0. || b1 + b2 > 1.) 
+ hit = false; + + // Compute _t_ to intersection point + float t = Dot(e2, s2) * invDivisor; + if (t < ray.mint || t > ray.maxt) + hit = false; + + if (hit) { + ray.maxt = t; + ray.hitId = tri.id; + } + return hit; +} + +__device__ +inline +bool BVHIntersect(const LinearBVHNode nodes[], + const Triangle tris[], Ray &r, + int todo[]) { + Ray ray = r; + bool hit = false; + // Follow ray through BVH nodes to find primitive intersections + int todoOffset = 0, nodeNum = 0; + + while (true) { + // Check ray against BVH node + LinearBVHNode node = nodes[nodeNum]; + if (any(BBoxIntersect(node.bounds, ray))) { + unsigned int nPrimitives = node.nPrimitives; + if (nPrimitives > 0) { + // Intersect ray with primitives in leaf BVH node + unsigned int primitivesOffset = node.offset; + for ( unsigned int i = 0; i < nPrimitives; ++i) { + if (TriIntersect(tris[primitivesOffset+i], ray)) + hit = true; + } + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + else { + // Put far BVH node on _todo_ stack, advance to near node + int dirIsNeg; + if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0; + if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1; + if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2; + if (dirIsNeg) { + todo[todoOffset++] = nodeNum + 1; + nodeNum = node.offset; + } + else { + todo[todoOffset++] = node.offset; + nodeNum = nodeNum + 1; + } + } + } + else { + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + } + r.maxt = ray.maxt; + r.hitId = ray.hitId; + + return hit; +} + +__device__ +inline +static void raytrace_tile( int x0, int x1, + int y0, int y1, + int width, int height, + int baseWidth, int baseHeight, + const float raster2camera[4][4], + const float camera2world[4][4], + float image[], int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + float widthScale = (float)(baseWidth) / (float)(width); + float heightScale = (float)(baseHeight) / (float)(height); + +#if 0 + int * todo = new int[64]; +#define ALLOC +#else + int todo[64]; +#endif + + for (int y = y0 ;y < y1; y++) + for (int x = x0 + programIndex; x < x1; x += programCount) + if (x < x1) + { + Ray ray; + generateRay(raster2camera, camera2world, x*widthScale, + y*heightScale, ray); + BVHIntersect(nodes, triangles, ray, todo); + + int offset = y * width + x; + image[offset] = ray.maxt; + id[offset] = ray.hitId; + } + +#ifdef ALLOC + delete todo; +#endif +} + + + +__global__ +void raytrace_tile_task( int width, int height, + int baseWidth, int baseHeight, + const float raster2camera[4][4], + const float camera2world[4][4], + float image[], int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + int dx = 64, dy = 8; // must match dx, dy below + int xBuckets = (width + (dx-1)) / dx; + int x0 = (taskIndex % xBuckets) * dx; + int x1 = min(x0 + dx, width); + int y0 = (taskIndex / xBuckets) * dy; + int y1 = min(y0 + dy, height); + + raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, + raster2camera, camera2world, image, + id, nodes, triangles); +} + + +extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height, + int baseWidth, int baseHeight, + const float raster2camera[4][4], + const float camera2world[4][4], + float image[], int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + int dx = 64, dy = 8; + int xBuckets = (width + (dx-1)) / dx; + int yBuckets = (height + (dy-1)) / dy; + int nTasks = xBuckets * yBuckets; + launch(nTasks,1,1,raytrace_tile_task) + (width, height, baseWidth, baseHeight, + raster2camera, 
camera2world, + image, id, nodes, triangles); + cudaDeviceSynchronize(); +} + + + +extern "C" __host__ void raytrace_ispc_tasks( int width, int height, + int baseWidth, int baseHeight, + const float raster2camera[4][4], + const float camera2world[4][4], + float image[], int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + raytrace_ispc_tasks___export<<<1,32>>>( width, height, + baseWidth, baseHeight, + raster2camera, + camera2world, + image, id, + nodes, + triangles); + cudaDeviceSynchronize(); +} diff --git a/examples/portable/rt/rt.ispc b/examples/portable/rt/rt.ispc new file mode 100644 index 00000000..67ebd4a9 --- /dev/null +++ b/examples/portable/rt/rt.ispc @@ -0,0 +1,351 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
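raytrace_ispc_tasks (and the CUDA launcher just above) issues one task per dx x dy screen tile and lets each task recover its tile bounds from taskIndex, laid out row-major over the tile grid. The mapping is small enough to restate as a host-side C++ helper; tileForTask is an illustrative name, not something defined in this diff.

#include <algorithm>

struct Tile { int x0, x1, y0, y1; };

// taskIndex -> tile bounds, matching the dx = 64, dy = 8 bucketing in
// raytrace_tile_task above (tiles are numbered row by row).
static Tile tileForTask(int taskIndex, int width, int height) {
    const int dx = 64, dy = 8;
    const int xBuckets = (width + dx - 1) / dx;   // tiles per image row
    Tile t;
    t.x0 = (taskIndex % xBuckets) * dx;
    t.x1 = std::min(t.x0 + dx, width);
    t.y0 = (taskIndex / xBuckets) * dy;
    t.y1 = std::min(t.y0 + dy, height);
    return t;
}
// Total tasks launched: xBuckets * ((height + dy - 1) / dy).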
+*/ + +#if 1 +typedef int bool_t; +#else +typedef bool bool_t; +#endif +typedef float<3> float3; + +#ifdef __NVPTX__ +#define uniform_t varying +#else +#define uniform_t uniform +#endif + + + +struct int3 +{ + int x,y,z; +}; + +struct Ray { + float3 origin, dir, invDir; + uniform unsigned int dirIsNeg[3]; + float mint, maxt; + int hitId; +}; + +struct Triangle { + float p[3][4]; + int id; + int pad[3]; +}; + +struct LinearBVHNode { + float bounds[2][3]; + unsigned int offset; // num primitives for leaf, second child for interior + unsigned int8 nPrimitives; + unsigned int8 splitAxis; + unsigned int16 pad; +}; + +static inline float3 Cross(const float3 v1, const float3 v2) { + float v1x = v1.x, v1y = v1.y, v1z = v1.z; + float v2x = v2.x, v2y = v2.y, v2z = v2.z; + float3 ret; + ret.x = (v1y * v2z) - (v1z * v2y); + ret.y = (v1z * v2x) - (v1x * v2z); + ret.z = (v1x * v2y) - (v1y * v2x); + return ret; +} + +static inline float Dot(const float3 a, const float3 b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + + +#if 1 +inline +#endif +static void generateRay(uniform const float raster2camera[4][4], + uniform const float camera2world[4][4], + float x, float y, Ray &ray) { + ray.mint = 0.f; + ray.maxt = 1e30f; + + ray.hitId = 0; + + // transform raster coordinate (x, y, 0) to camera space + float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; + float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; + float camz = raster2camera[2][3]; + float camw = raster2camera[3][3]; + camx /= camw; + camy /= camw; + camz /= camw; + + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + + camera2world[0][2] * camz; + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + + camera2world[1][2] * camz; + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + + camera2world[2][2] * camz; + + ray.origin.x = camera2world[0][3] / camera2world[3][3]; + ray.origin.y = camera2world[1][3] / camera2world[3][3]; + ray.origin.z = camera2world[2][3] / camera2world[3][3]; + + ray.invDir = 1.f / ray.dir; + + ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0; + ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0; + ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0; +} + + +#if 1 +inline +#endif +static bool_t BBoxIntersect(const uniform float bounds[2][3], + const Ray &ray) { + const uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] }; + const uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] }; + float t0 = ray.mint, t1 = ray.maxt; + + // Check all three axis-aligned slabs. 
Don't try to early out; it's + // not worth the trouble + float3 tNear = (bounds0 - ray.origin) * ray.invDir; + float3 tFar = (bounds1 - ray.origin) * ray.invDir; + if (tNear.x > tFar.x) { + float tmp = tNear.x; + tNear.x = tFar.x; + tFar.x = tmp; + } + t0 = max(tNear.x, t0); + t1 = min(tFar.x, t1); + + if (tNear.y > tFar.y) { + float tmp = tNear.y; + tNear.y = tFar.y; + tFar.y = tmp; + } + t0 = max(tNear.y, t0); + t1 = min(tFar.y, t1); + + if (tNear.z > tFar.z) { + float tmp = tNear.z; + tNear.z = tFar.z; + tFar.z = tmp; + } + t0 = max(tNear.z, t0); + t1 = min(tFar.z, t1); + + return (t0 <= t1); +} + + + +#if 1 +inline +#endif +static bool_t TriIntersect(const uniform_t Triangle tri, Ray &ray) { + const uniform_t float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] }; + const uniform_t float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] }; + const uniform_t float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] }; + const uniform_t float3 e1 = p1 - p0; + const uniform_t float3 e2 = p2 - p0; + + float3 s1 = Cross(ray.dir, e2); + float divisor = Dot(s1, e1); + bool_t hit = true; + + if (divisor == 0.) + hit = false; + float invDivisor = 1.f / divisor; + + // Compute first barycentric coordinate + float3 d = ray.origin - p0; + float b1 = Dot(d, s1) * invDivisor; + if (b1 < 0. || b1 > 1.) + hit = false; + + // Compute second barycentric coordinate + float3 s2 = Cross(d, e1); + float b2 = Dot(ray.dir, s2) * invDivisor; + if (b2 < 0. || b1 + b2 > 1.) + hit = false; + + // Compute _t_ to intersection point + float t = Dot(e2, s2) * invDivisor; + if (t < ray.mint || t > ray.maxt) + hit = false; + + if (hit) { + ray.maxt = t; + ray.hitId = tri.id; + } + return hit; +} + + +#if 1 +inline +#endif +bool_t +BVHIntersect(const uniform LinearBVHNode nodes[], + const uniform Triangle tris[], Ray &r) { + Ray ray = r; + bool_t hit = false; + // Follow ray through BVH nodes to find primitive intersections + uniform int todoOffset = 0, nodeNum = 0; + uniform int todo[64]; + + while (true) { + // Check ray against BVH node + const uniform LinearBVHNode node = nodes[nodeNum]; + if (any(BBoxIntersect(node.bounds, ray))) { + const uniform unsigned int nPrimitives = node.nPrimitives; + if (nPrimitives > 0) { + // Intersect ray with primitives in leaf BVH node + const uniform unsigned int primitivesOffset = node.offset; + for (uniform_t unsigned int i = 0; i < nPrimitives; ++i) { + if (TriIntersect(tris[primitivesOffset+i], ray)) + hit = true; + } + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + else { + // Put far BVH node on _todo_ stack, advance to near node +#if 0 /* fails */ + int dirIsNeg = r.dirIsNeg[node.splitAxis]; +#else + int dirIsNeg; + if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg[0]; + if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg[1]; + if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg[2]; +#endif + if (dirIsNeg) { + todo[todoOffset++] = nodeNum + 1; + nodeNum = node.offset; + } + else { + todo[todoOffset++] = node.offset; + nodeNum = nodeNum + 1; + } + } + } + else { + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + } + r.maxt = ray.maxt; + r.hitId = ray.hitId; + + return hit; +} + + +#if 1 +inline +#endif +static void raytrace_tile(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int width, uniform int height, + uniform int baseWidth, uniform int baseHeight, + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform float image[], uniform int id[], + const uniform LinearBVHNode nodes[], + const 
uniform Triangle triangles[]) { + const uniform float widthScale = (float)(baseWidth) / (float)(width); + const uniform float heightScale = (float)(baseHeight) / (float)(height); + + foreach_tiled (y = y0 ... y1, x = x0 ... x1) { + Ray ray; + generateRay(raster2camera, camera2world, x*widthScale, + y*heightScale, ray); + BVHIntersect(nodes, triangles, ray); + + int offset = y * width + x; + image[offset] = ray.maxt; + id[offset] = ray.hitId; + } +} + + +export void raytrace_ispc(uniform int width, uniform int height, + uniform int baseWidth, uniform int baseHeight, + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform float image[], uniform int id[], + const uniform LinearBVHNode nodes[], + const uniform Triangle triangles[]) { + raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight, + raster2camera, camera2world, image, + id, nodes, triangles); +} + + +task void raytrace_tile_task(uniform int width, uniform int height, + uniform int baseWidth, uniform int baseHeight, + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform float image[], uniform int id[], + const uniform LinearBVHNode nodes[], + const uniform Triangle triangles[]) { + const uniform int dx = 64, dy = 8; // must match dx, dy below + const uniform int xBuckets = (width + (dx-1)) / dx; + const uniform int x0 = (taskIndex % xBuckets) * dx; + const uniform int x1 = min(x0 + dx, width); + const uniform int y0 = (taskIndex / xBuckets) * dy; + const uniform int y1 = min(y0 + dy, height); + + raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, + raster2camera, camera2world, image, + id, nodes, triangles); +} + + +export void raytrace_ispc_tasks(uniform int width, uniform int height, + uniform int baseWidth, uniform int baseHeight, + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform float image[], uniform int id[], + const uniform LinearBVHNode nodes[], + const uniform Triangle triangles[]) { + const uniform int dx = 64, dy = 8; + const uniform int xBuckets = (width + (dx-1)) / dx; + const uniform int yBuckets = (height + (dy-1)) / dy; + const uniform int nTasks = xBuckets * yBuckets; + launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight, + raster2camera, camera2world, + image, id, nodes, triangles); +} + diff --git a/examples/portable/rt/sponza.bvh b/examples/portable/rt/sponza.bvh new file mode 120000 index 00000000..57f10d00 --- /dev/null +++ b/examples/portable/rt/sponza.bvh @@ -0,0 +1 @@ +../../rt/sponza.bvh \ No newline at end of file diff --git a/examples/portable/rt/sponza.camera b/examples/portable/rt/sponza.camera new file mode 120000 index 00000000..3d98a622 --- /dev/null +++ b/examples/portable/rt/sponza.camera @@ -0,0 +1 @@ +../../rt/sponza.camera \ No newline at end of file diff --git a/examples/portable/rt/teapot.bvh b/examples/portable/rt/teapot.bvh new file mode 120000 index 00000000..31d05739 --- /dev/null +++ b/examples/portable/rt/teapot.bvh @@ -0,0 +1 @@ +../../rt/teapot.bvh \ No newline at end of file diff --git a/examples/portable/rt/teapot.camera b/examples/portable/rt/teapot.camera new file mode 120000 index 00000000..7e0951c4 --- /dev/null +++ b/examples/portable/rt/teapot.camera @@ -0,0 +1 @@ +../../rt/teapot.camera \ No newline at end of file diff --git a/examples/portable/volume_rendering/.gitignore b/examples/portable/volume_rendering/.gitignore new file mode 100644 index 00000000..c2471c27 --- /dev/null +++ 
b/examples/portable/volume_rendering/.gitignore @@ -0,0 +1,2 @@ +mandelbrot +*.ppm diff --git a/examples/portable/volume_rendering/Makefile_cpu b/examples/portable/volume_rendering/Makefile_cpu new file mode 100644 index 00000000..73c61719 --- /dev/null +++ b/examples/portable/volume_rendering/Makefile_cpu @@ -0,0 +1,8 @@ + +EXAMPLE=volume +CPP_SRC=volume.cpp +ISPC_SRC=volume.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon + +include ../common_cpu.mk diff --git a/examples/portable/volume_rendering/Makefile_knc b/examples/portable/volume_rendering/Makefile_knc new file mode 100644 index 00000000..3056ef7a --- /dev/null +++ b/examples/portable/volume_rendering/Makefile_knc @@ -0,0 +1,7 @@ +EXAMPLE=volume +CXX_SRC=volume.cpp +ISPC_SRC=volume.ispc +ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h +ISPC_TARGET=generic-16 + +include ../common_knc.mk diff --git a/examples/portable/volume_rendering/Makefile_ptx b/examples/portable/volume_rendering/Makefile_ptx new file mode 100644 index 00000000..6aef695a --- /dev/null +++ b/examples/portable/volume_rendering/Makefile_ptx @@ -0,0 +1,13 @@ +PROG=volume +ISPC_SRC=volume.ispc +CU_SRC=volume.cu +CXX_SRC=volume.cpp +PTXCC_REGMAX=64 + +#LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_ptx.mk + + + diff --git a/examples/portable/volume_rendering/camera.dat b/examples/portable/volume_rendering/camera.dat new file mode 100644 index 00000000..555ac769 --- /dev/null +++ b/examples/portable/volume_rendering/camera.dat @@ -0,0 +1,11 @@ +896 1184 + +0.000155 0.000000 0.000000 -0.069927 +0.000000 -0.000155 0.000000 0.093236 +0.000000 0.000000 0.000000 1.000000 +0.000000 0.000000 -99.999001 100.000000 + +1.000000 0.000000 0.000000 1.000000 +0.000000 0.980129 -0.198360 2.900000 +0.000000 0.198360 0.980129 -10.500000 +0.000000 0.000000 0.000000 1.000000 diff --git a/examples/portable/volume_rendering/density_highres.vol b/examples/portable/volume_rendering/density_highres.vol new file mode 120000 index 00000000..08de6cc6 --- /dev/null +++ b/examples/portable/volume_rendering/density_highres.vol @@ -0,0 +1 @@ +../../volume_rendering/density_highres.vol \ No newline at end of file diff --git a/examples/portable/volume_rendering/density_lowres.vol b/examples/portable/volume_rendering/density_lowres.vol new file mode 120000 index 00000000..f5911247 --- /dev/null +++ b/examples/portable/volume_rendering/density_lowres.vol @@ -0,0 +1 @@ +../../volume_rendering/density_lowres.vol \ No newline at end of file diff --git a/examples/portable/volume_rendering/volume.cpp b/examples/portable/volume_rendering/volume.cpp new file mode 100644 index 00000000..3b67e77e --- /dev/null +++ b/examples/portable/volume_rendering/volume.cpp @@ -0,0 +1,183 @@ +/* + Copyright (c) 2011-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include "timing.h" +#include "ispc_malloc.h" +#include "volume_ispc.h" +using namespace ispc; + +/* Write a PPM image file with the image */ +static void +writePPM(float *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + float v = buf[i] * 255.f; + if (v < 0.f) v = 0.f; + else if (v > 255.f) v = 255.f; + unsigned char c = (unsigned char)v; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); + printf("Wrote image file %s\n", fn); +} + + +/* Load image and viewing parameters from a camera data file. + FIXME: we should add support to be able to specify viewing parameters + in the program here directly. */ +static void +loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4], + float camera2world[4][4]) { + FILE *f = fopen(fn, "r"); + if (!f) { + perror(fn); + exit(1); + } + if (fscanf(f, "%d %d", width, height) != 2) { + fprintf(stderr, "Unexpected end of file in camera file\n"); + exit(1); + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + if (fscanf(f, "%f", &raster2camera[i][j]) != 1) { + fprintf(stderr, "Unexpected end of file in camera file\n"); + exit(1); + } + } + } + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + if (fscanf(f, "%f", &camera2world[i][j]) != 1) { + fprintf(stderr, "Unexpected end of file in camera file\n"); + exit(1); + } + } + } + fclose(f); +} + + +/* Load a volume density file. Expects the number of x, y, and z samples + as the first three values (as integer strings), then x*y*z + floating-point values (also as strings) to give the densities. 
*/ +static float * +loadVolume(const char *fn, int n[3]) { + FILE *f = fopen(fn, "r"); + if (!f) { + perror(fn); + exit(1); + } + + if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) { + fprintf(stderr, "Couldn't find resolution at start of density file\n"); + exit(1); + } + + int count = n[0] * n[1] * n[2]; + float *v = new float[count]; + for (int i = 0; i < count; ++i) { + if (fscanf(f, "%f", &v[i]) != 1) { + fprintf(stderr, "Unexpected end of file at %d'th density value\n", i); + exit(1); + } + } + + return v; +} + + +int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 7, 1}; + if (argc < 3) { + fprintf(stderr, "usage: volume [ispc iterations] [tasks iterations] [serial iterations]\n"); + return 1; + } + if (argc == 6) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[3 + i]); + } + } + + // + // Load viewing data and the volume density data + // + int width, height; + + float *camera2world_ispc = new float[4*4]; + float *raster2camera_ispc = new float[4*4]; + float (*camera2world )[4] = (float (*)[4])camera2world_ispc; + float (*raster2camera)[4] = (float (*)[4])raster2camera_ispc; + + loadCamera(argv[1], &width, &height, raster2camera, camera2world); + float *image = new float[width*height]; + + int *n = new int[3]; + float *density = loadVolume(argv[2], n); + + // Clear out the buffer + for (int i = 0; i < width * height; ++i) + image[i] = 0.; + + // + // Compute the image using the ispc implementation that also uses + // tasks; report the minimum time of three runs. + // + double minISPCtasks = 1e30; + for (int i = 0; i < test_iterations[1]; ++i) { + reset_and_start_timer(); + volume_ispc_tasks(density, n, raster2camera, camera2world, + width, height, image); + double dt = get_elapsed_msec(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt); + minISPCtasks = std::min(minISPCtasks, dt); + } + + printf("[volume ispc + tasks]:\t\t[%.3f] msec\n", minISPCtasks); + writePPM(image, width, height, "volume-ispc-tasks.ppm"); + + return 0; +} diff --git a/examples/portable/volume_rendering/volume.cu b/examples/portable/volume_rendering/volume.cu new file mode 100644 index 00000000..c0d37bf1 --- /dev/null +++ b/examples/portable/volume_rendering/volume.cu @@ -0,0 +1,454 @@ +/* + Copyright (c) 2011-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "cuda_helpers.cuh" +__device__ static inline float clamp(float v, float low, float high) +{ + return min(max(v, low), high); +} + + +#define float3 Float3 +struct Float3 +{ + float x,y,z; + __device__ friend Float3 operator+(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x+b.x; + c.y = a.y+b.y; + c.z = a.z+b.z; + return c; + } + __device__ friend Float3 operator-(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x-b.x; + c.y = a.y-b.y; + c.z = a.z-b.z; + return c; + } + __device__ friend Float3 operator/(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x/b.x; + c.y = a.y/b.y; + c.z = a.z/b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x*b.x; + c.y = a.y*b.y; + c.z = a.z*b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const float b) + { + Float3 c; + c.x = a.x*b; + c.y = a.y*b; + c.z = a.z*b; + return c; + } +}; + +struct Ray { + float3 origin, dir; +}; + + +__device__ static void +generateRay(const float raster2camera[4][4], + const float camera2world[4][4], + float x, float y, Ray &ray) { + // transform raster coordinate (x, y, 0) to camera space + float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; + float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; + float camz = raster2camera[2][3]; + float camw = raster2camera[3][3]; + camx /= camw; + camy /= camw; + camz /= camw; + + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz; + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz; + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz; + + ray.origin.x = camera2world[0][3] / camera2world[3][3]; + ray.origin.y = camera2world[1][3] / camera2world[3][3]; + ray.origin.z = camera2world[2][3] / camera2world[3][3]; +} + + +__device__ static inline bool +Inside(float3 p, float3 pMin, float3 pMax) { + return (p.x >= pMin.x && p.x <= pMax.x && + p.y >= pMin.y && p.y <= pMax.y && + p.z >= pMin.z && p.z <= pMax.z); +} + + +__device__ static bool +IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) { + float t0 = -1e30f, t1 = 1e30f; + + float3 tNear = (pMin - ray.origin) / ray.dir; + float3 tFar = (pMax - ray.origin) / ray.dir; + if (tNear.x > tFar.x) { + float tmp = tNear.x; + tNear.x = tFar.x; + tFar.x = tmp; + } + t0 = max(tNear.x, t0); + t1 = min(tFar.x, t1); + + if (tNear.y > tFar.y) { + float tmp = tNear.y; + tNear.y = tFar.y; + tFar.y = tmp; + } + t0 = max(tNear.y, t0); + t1 = min(tFar.y, t1); + + if (tNear.z > tFar.z) { + float tmp = tNear.z; + tNear.z = tFar.z; + tFar.z = tmp; + } + t0 = max(tNear.z, t0); + t1 = min(tFar.z, t1); + + if (t0 <= t1) { + hit0 = t0; + hit1 = t1; + return true; + } + else + return false; +} + + +__device__ static inline float Lerp(float t, float a, float b) { + return (1.f - t) * a + t * b; +} + + 
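+// D() looks up the raw density sample at integer voxel (x, y, z); indices are
+// clamped to the grid edges, and the grid is stored flattened with x varying
+// fastest: index = (z*nVoxels[1] + y)*nVoxels[0] + x.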
+__device__ static inline float D(int x, int y, int z, int nVoxels[3], + float density[]) { + x = clamp(x, 0, nVoxels[0]-1); + y = clamp(y, 0, nVoxels[1]-1); + z = clamp(z, 0, nVoxels[2]-1); + + return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x]; +} + + +__device__ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) { + return (p - pMin) / (pMax - pMin); +} + + +__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax, + float density[], int nVoxels[3]) { + if (!Inside(Pobj, pMin, pMax)) + return 0; + // Compute voxel coordinates and offsets for _Pobj_ + float3 vox = Offset(Pobj, pMin, pMax); + vox.x = vox.x * nVoxels[0] - .5f; + vox.y = vox.y * nVoxels[1] - .5f; + vox.z = vox.z * nVoxels[2] - .5f; + int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z); + float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz; + + // Trilinearly interpolate density values to compute local density + float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density), + D(vx+1, vy, vz, nVoxels, density)); + float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density), + D(vx+1, vy+1, vz, nVoxels, density)); + float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density), + D(vx+1, vy, vz+1, nVoxels, density)); + float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), + D(vx+1, vy+1, vz+1, nVoxels, density)); + float d0 = Lerp(dy, d00, d10); + float d1 = Lerp(dy, d01, d11); + return Lerp(dz, d0, d1); +} + + +/* Returns the transmittance between two points p0 and p1, in a volume + with extent (pMin,pMax) with transmittance coefficient sigma_t, + defined by nVoxels[3] voxels in each dimension in the given density + array. */ +__device__ static inline float +transmittance(float3 p0, float3 p1, float3 pMin, + float3 pMax, float sigma_t, + float density[], int nVoxels[3]) { + float rayT0, rayT1; + Ray ray; + ray.origin = p1; + ray.dir = p0 - p1; + + // Find the parametric t range along the ray that is inside the volume. 
+ if (!IntersectP(ray, pMin, pMax, rayT0, rayT1)) + return 1.f; + + rayT0 = max(rayT0, 0.f); + + // Accumulate beam transmittance in tau + float tau = 0.0f; + float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y + + ray.dir.z * ray.dir.z); + float stepDist = 0.2f; + float stepT = stepDist / rayLength; + + float t = rayT0; + float3 pos = ray.origin + ray.dir * rayT0; + float3 dirStep = ray.dir * stepT; + while (t < rayT1) { + tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels); + pos = pos + dirStep; + t += stepT; + } + + return exp(-tau); +} + + +__device__ static inline float +distanceSquared(float3 a, float3 b) { + float3 d = a-b; + return d.x*d.x + d.y*d.y + d.z*d.z; +} + + +__device__ static inline float +raymarch(float density[], int nVoxels[3], Ray ray) { + float rayT0, rayT1; + float3 pMin = {.3f, -.2f, .3f}, pMax = {1.8f, 2.3f, 1.8f}; + float3 lightPos = { -1.f, 4., 1.5f }; + + if (!IntersectP(ray, pMin, pMax, rayT0, rayT1)) + return 0.f; + + rayT0 = max(rayT0, 0.f); + + // Parameters that define the volume scattering characteristics and + // sampling rate for raymarching + float Le = .25f; // Emission coefficient + float sigma_a = 10.f; // Absorption coefficient + float sigma_s = 10.f; // Scattering coefficient + float stepDist = 0.025f; // Ray step amount + float lightIntensity = 40.0f; // Light source intensity + + float tau = 0.f; // accumulated beam transmittance + float L = 0.f; // radiance along the ray + float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y + + ray.dir.z * ray.dir.z); + float stepT = stepDist / rayLength; + + float t = rayT0; + float3 pos = ray.origin + ray.dir * rayT0; + float3 dirStep = ray.dir * stepT; + while (t < rayT1) + { + float d = Density(pos, pMin, pMax, density, nVoxels); + + // terminate once attenuation is high + float atten = exp(-tau); + if (atten < .005f) + break; + + // direct lighting + float Li = lightIntensity / distanceSquared(lightPos, pos) * + transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s, + density, nVoxels); + L += stepDist * atten * d * sigma_s * (Li + Le); + + // update beam transmittance + tau += stepDist * (sigma_a + sigma_s) * d; + + pos = pos + dirStep; + t += stepT; + } + + // Gamma correction + return pow(L, 1.f / 2.2f); +} + + +/* Utility routine used by both the task-based and the single-core entrypoints. + Renders a tile of the image, covering [x0,x0) * [y0, y1), storing the + result into the image[] array. + */ +__device__ static void +volume_tile(int x0, int y0, int x1, + int y1, float density[], int nVoxels[3], + const float raster2camera[4][4], + const float camera2world[4][4], + int width, int height, float image[]) { + // Work on 4x4=16 pixel big tiles of the image. This function thus + // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble + // by 4. 
+ for (int y = y0; y < y1; y += 8) { + for (int x = x0; x < x1; x += 8) { + for (int ob = 0; ob < 64; ob += programCount) + { + const int o = ob + programIndex; + + + // These two arrays encode the mapping from [0,15] to + // offsets within the 4x4 pixel block so that we render + // each pixel inside the block + const int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3, + 0, 1, 0, 1, 2, 3, 2, 3 }; + const int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1, + 2, 2, 3, 3, 2, 2, 3, 3 }; + + const int xblock[4] = {0, 4, 0, 4}; + const int yblock[4] = {0, 0, 4, 4}; + + // Figure out the pixel to render for this program instance + const int xo = x + xblock[o/16] + xoffsets[o&15]; + const int yo = y + yblock[o/16] + yoffsets[o&15]; + + // Use viewing parameters to compute the corresponding ray + // for the pixel + Ray ray; + generateRay(raster2camera, camera2world, xo, yo, ray); + + // And raymarch through the volume to compute the pixel's + // value + int offset = yo * width + xo; + if (xo < x1 && yo < y1) + image[offset] = raymarch(density, nVoxels, ray); + } + } + } +} + + +__global__ void +volume_task(float density[], int _nVoxels[3], + const float _raster2camera[4][4], + const float _camera2world[4][4], + int width, int height, float image[]) { + if (taskIndex0 >= taskCount0) return; + +#if 0 + int nVoxels[3]; + nVoxels[0] = _nVoxels[0]; + nVoxels[1] = _nVoxels[1]; + nVoxels[2] = _nVoxels[2]; + + float raster2camera[4][4]; + raster2camera[0][0] = _raster2camera[0][0]; + raster2camera[0][1] = _raster2camera[0][1]; + raster2camera[0][2] = _raster2camera[0][2]; + raster2camera[0][3] = _raster2camera[0][3]; + raster2camera[1][0] = _raster2camera[1][0]; + raster2camera[1][1] = _raster2camera[1][1]; + raster2camera[1][2] = _raster2camera[1][2]; + raster2camera[1][3] = _raster2camera[1][3]; + raster2camera[2][0] = _raster2camera[2][0]; + raster2camera[2][1] = _raster2camera[2][1]; + raster2camera[2][2] = _raster2camera[2][2]; + raster2camera[2][3] = _raster2camera[2][3]; + raster2camera[3][0] = _raster2camera[3][0]; + raster2camera[3][1] = _raster2camera[3][1]; + raster2camera[3][2] = _raster2camera[3][2]; + raster2camera[3][3] = _raster2camera[3][3]; + + float camera2world[4][4]; + camera2world[0][0] = _camera2world[0][0]; + camera2world[0][1] = _camera2world[0][1]; + camera2world[0][2] = _camera2world[0][2]; + camera2world[0][3] = _camera2world[0][3]; + camera2world[1][0] = _camera2world[1][0]; + camera2world[1][1] = _camera2world[1][1]; + camera2world[1][2] = _camera2world[1][2]; + camera2world[1][3] = _camera2world[1][3]; + camera2world[2][0] = _camera2world[2][0]; + camera2world[2][1] = _camera2world[2][1]; + camera2world[2][2] = _camera2world[2][2]; + camera2world[2][3] = _camera2world[2][3]; + camera2world[3][0] = _camera2world[3][0]; + camera2world[3][1] = _camera2world[3][1]; + camera2world[3][2] = _camera2world[3][2]; + camera2world[3][3] = _camera2world[3][3]; +#else +#define nVoxels _nVoxels +#define raster2camera _raster2camera +#define camera2world _camera2world +#endif + + int dx = 8, dy = 8; // must match value in volume_ispc_tasks + int xbuckets = (width + (dx-1)) / dx; + int ybuckets = (height + (dy-1)) / dy; + + int x0 = (taskIndex % xbuckets) * dx; + int y0 = (taskIndex / xbuckets) * dy; + int x1 = x0 + dx, y1 = y0 + dy; + x1 = min(x1, width); + y1 = min(y1, height); + + volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera, + camera2world, width, height, image); +} + + +extern "C" +__global__ void +volume_ispc_tasks___export( float density[], int nVoxels[3], + const float 
raster2camera[4][4], + const float camera2world[4][4], + int width, int height, float image[]) { + // Launch tasks to work on (dx,dy)-sized tiles of the image + int dx = 8, dy = 8; + int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy); + launch(nTasks,1,1,volume_task) + (density, nVoxels, raster2camera, camera2world, + width, height, image); + cudaDeviceSynchronize(); +} + +extern "C" +__host__ void +volume_ispc_tasks( float density[], int nVoxels[3], + const float raster2camera[4][4], + const float camera2world[4][4], + int width, int height, float image[]) { + volume_ispc_tasks___export<<<1,32>>>(density, nVoxels, raster2camera, camera2world, width, height,image); + cudaDeviceSynchronize(); +} diff --git a/examples/portable/volume_rendering/volume.ispc b/examples/portable/volume_rendering/volume.ispc new file mode 100644 index 00000000..2f5c6bfe --- /dev/null +++ b/examples/portable/volume_rendering/volume.ispc @@ -0,0 +1,413 @@ +/* + Copyright (c) 2011-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + + + +typedef float<3> float3; + +struct Ray { + float3 origin, dir; +}; + + +static inline void +generateRay(const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + float x, float y, Ray &ray) { + // transform raster coordinate (x, y, 0) to camera space + float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; + float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; + float camz = raster2camera[2][3]; + float camw = raster2camera[3][3]; + camx /= camw; + camy /= camw; + camz /= camw; + + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz; + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz; + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz; + + ray.origin.x = camera2world[0][3] / camera2world[3][3]; + ray.origin.y = camera2world[1][3] / camera2world[3][3]; + ray.origin.z = camera2world[2][3] / camera2world[3][3]; +} + + +static inline bool +Inside(float3 p, float3 pMin, float3 pMax) { + return (p.x >= pMin.x && p.x <= pMax.x && + p.y >= pMin.y && p.y <= pMax.y && + p.z >= pMin.z && p.z <= pMax.z); +} + + +static inline bool +IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) { + float t0 = -1e30, t1 = 1e30; + + float3 tNear = (pMin - ray.origin) / ray.dir; + float3 tFar = (pMax - ray.origin) / ray.dir; + if (tNear.x > tFar.x) { + float tmp = tNear.x; + tNear.x = tFar.x; + tFar.x = tmp; + } + t0 = max(tNear.x, t0); + t1 = min(tFar.x, t1); + + if (tNear.y > tFar.y) { + float tmp = tNear.y; + tNear.y = tFar.y; + tFar.y = tmp; + } + t0 = max(tNear.y, t0); + t1 = min(tFar.y, t1); + + if (tNear.z > tFar.z) { + float tmp = tNear.z; + tNear.z = tFar.z; + tFar.z = tmp; + } + t0 = max(tNear.z, t0); + t1 = min(tFar.z, t1); + + if (t0 <= t1) { + hit0 = t0; + hit1 = t1; + return true; + } + else + return false; +} + + +static inline float Lerp(float t, float a, float b) { + return (1.f - t) * a + t * b; +} + + +static inline float D(int x, int y, int z, uniform int nVoxels[3], + uniform float density[]) { + x = clamp(x, 0, nVoxels[0]-1); + y = clamp(y, 0, nVoxels[1]-1); + z = clamp(z, 0, nVoxels[2]-1); + + return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x]; +} + + +static inline float3 Offset(float3 p, float3 pMin, float3 pMax) { + return (p - pMin) / (pMax - pMin); +} + + +static inline float Density(float3 Pobj, float3 pMin, float3 pMax, + uniform float density[], uniform int nVoxels[3]) { + if (!Inside(Pobj, pMin, pMax)) + return 0; + // Compute voxel coordinates and offsets for _Pobj_ + float3 vox = Offset(Pobj, pMin, pMax); + vox.x = vox.x * nVoxels[0] - .5f; + vox.y = vox.y * nVoxels[1] - .5f; + vox.z = vox.z * nVoxels[2] - .5f; + int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z); + float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz; + + // Trilinearly interpolate density values to compute local density + float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density), + D(vx+1, vy, vz, nVoxels, density)); + float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density), + D(vx+1, vy+1, vz, nVoxels, density)); + float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density), + D(vx+1, vy, vz+1, nVoxels, density)); + float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), + D(vx+1, vy+1, vz+1, nVoxels, density)); + float d0 = Lerp(dy, d00, d10); + float d1 = Lerp(dy, d01, d11); + return Lerp(dz, d0, d1); +} + + +/* Returns the transmittance between two 
points p0 and p1, in a volume + with extent (pMin,pMax) with transmittance coefficient sigma_t, + defined by nVoxels[3] voxels in each dimension in the given density + array. */ +static inline float +transmittance(uniform float3 p0, float3 p1, uniform float3 pMin, + uniform float3 pMax, uniform float sigma_t, + uniform float density[], uniform int nVoxels[3]) { + float rayT0, rayT1; + Ray ray; + ray.origin = p1; + ray.dir = p0 - p1; + + // Find the parametric t range along the ray that is inside the volume. + if (!IntersectP(ray, pMin, pMax, rayT0, rayT1)) + return 1.; + + rayT0 = max(rayT0, 0.f); + + // Accumulate beam transmittance in tau + float tau = 0; + float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y + + ray.dir.z * ray.dir.z); + const uniform float stepDist = 0.2; + float stepT = stepDist / rayLength; + + float t = rayT0; + float3 pos = ray.origin + ray.dir * rayT0; + float3 dirStep = ray.dir * stepT; + while (t < rayT1) { + tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels); + pos = pos + dirStep; + t += stepT; + } + + return exp(-tau); +} + + +static inline float +distanceSquared(float3 a, float3 b) { + float3 d = a-b; + return d.x*d.x + d.y*d.y + d.z*d.z; +} + + +static inline float +raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) { + float rayT0, rayT1; + const uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8}; + const uniform float3 lightPos = { -1, 4, 1.5 }; + + if (!IntersectP(ray, pMin, pMax, rayT0, rayT1)) + return 0.; + + rayT0 = max(rayT0, 0.f); + + // Parameters that define the volume scattering characteristics and + // sampling rate for raymarching + const uniform float Le = .25; // Emission coefficient + const uniform float sigma_a = 10; // Absorption coefficient + const uniform float sigma_s = 10; // Scattering coefficient + const uniform float stepDist = 0.025; // Ray step amount + const uniform float lightIntensity = 40; // Light source intensity + + float tau = 0.f; // accumulated beam transmittance + float L = 0; // radiance along the ray + float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y + + ray.dir.z * ray.dir.z); + float stepT = stepDist / rayLength; + + float t = rayT0; + float3 pos = ray.origin + ray.dir * rayT0; + float3 dirStep = ray.dir * stepT; + while (t < rayT1) + { + float d = Density(pos, pMin, pMax, density, nVoxels); + + // terminate once attenuation is high + float atten = exp(-tau); + if (atten < .005) + break; + + // direct lighting + float Li = lightIntensity / distanceSquared(lightPos, pos) * + transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s, + density, nVoxels); + L += stepDist * atten * d * sigma_s * (Li + Le); + + // update beam transmittance + tau += stepDist * (sigma_a + sigma_s) * d; + + pos = pos + dirStep; + t += stepT; + } + + // Gamma correction + return pow(L, 1.f / 2.2f); +} + + +/* Utility routine used by both the task-based and the single-core entrypoints. + Renders a tile of the image, covering [x0,x0) * [y0, y1), storing the + result into the image[] array. + */ +static inline void +volume_tile(uniform int x0, uniform int y0, uniform int x1, + uniform int y1, uniform float density[], uniform int nVoxels[3], + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform int width, uniform int height, uniform float image[]) { + // Work on 4x4=16 pixel big tiles of the image. This function thus + // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble + // by 4. 
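+    // Two variants are kept below: the disabled "#if 0" branch mirrors the
+    // CUDA port and walks explicit 8x8 blocks of pixels, while the enabled
+    // branch simply iterates foreach_tiled over the whole [x0,x1) x [y0,y1)
+    // range, letting ispc map one pixel to each program instance.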
+#if 0 + for (uniform int y = y0; y < y1; y += 8) + for (uniform int x = x0; x < x1; x += 8) + foreach (o = 0 ... 64) + { + // These two arrays encode the mapping from [0,15] to + // offsets within the 4x4 pixel block so that we render + // each pixel inside the block + const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3, + 0, 1, 0, 1, 2, 3, 2, 3 }; + const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1, + 2, 2, 3, 3, 2, 2, 3, 3 }; + + const uniform int xblock[4] = {0, 4, 0, 4}; + const uniform int yblock[4] = {0, 0, 4, 4}; + + // Figure out the pixel to render for this program instance + const int xo = x + xblock[o/16] + xoffsets[o&15]; + const int yo = y + yblock[o/16] + yoffsets[o&15]; + + // Use viewing parameters to compute the corresponding ray + // for the pixel + Ray ray; + generateRay(raster2camera, camera2world, xo, yo, ray); + + // And raymarch through the volume to compute the pixel's + // value + int offset = yo * width + xo; + if (xo < x1 && yo < y1) + image[offset] = raymarch(density, nVoxels, ray); + } +#else + foreach_tiled (y = y0 ... y1, x = x0 ... x1) + { + // Use viewing parameters to compute the corresponding ray + // for the pixel + Ray ray; + generateRay(raster2camera, camera2world, x, y, ray); + + // And raymarch through the volume to compute the pixel's + // value + int offset = y * width + x; + image[offset] = raymarch(density, nVoxels, ray); + } +#endif +} + + +task void +volume_task(uniform float density[], uniform int _nVoxels[3], + const uniform float _raster2camera[4][4], + const uniform float _camera2world[4][4], + uniform int width, uniform int height, uniform float image[]) +{ + if (taskIndex >= taskCount) return; + +#if 1 /* cannot pass shared memory pointers to functions, need to find a way to solve this one :S */ + uniform int nVoxels[3]; + nVoxels[0] = _nVoxels[0]; + nVoxels[1] = _nVoxels[1]; + nVoxels[2] = _nVoxels[2]; + + uniform float raster2camera[4][4]; + raster2camera[0][0] = _raster2camera[0][0]; + raster2camera[0][1] = _raster2camera[0][1]; + raster2camera[0][2] = _raster2camera[0][2]; + raster2camera[0][3] = _raster2camera[0][3]; + raster2camera[1][0] = _raster2camera[1][0]; + raster2camera[1][1] = _raster2camera[1][1]; + raster2camera[1][2] = _raster2camera[1][2]; + raster2camera[1][3] = _raster2camera[1][3]; + raster2camera[2][0] = _raster2camera[2][0]; + raster2camera[2][1] = _raster2camera[2][1]; + raster2camera[2][2] = _raster2camera[2][2]; + raster2camera[2][3] = _raster2camera[2][3]; + raster2camera[3][0] = _raster2camera[3][0]; + raster2camera[3][1] = _raster2camera[3][1]; + raster2camera[3][2] = _raster2camera[3][2]; + raster2camera[3][3] = _raster2camera[3][3]; + + uniform float camera2world[4][4]; + camera2world[0][0] = _camera2world[0][0]; + camera2world[0][1] = _camera2world[0][1]; + camera2world[0][2] = _camera2world[0][2]; + camera2world[0][3] = _camera2world[0][3]; + camera2world[1][0] = _camera2world[1][0]; + camera2world[1][1] = _camera2world[1][1]; + camera2world[1][2] = _camera2world[1][2]; + camera2world[1][3] = _camera2world[1][3]; + camera2world[2][0] = _camera2world[2][0]; + camera2world[2][1] = _camera2world[2][1]; + camera2world[2][2] = _camera2world[2][2]; + camera2world[2][3] = _camera2world[2][3]; + camera2world[3][0] = _camera2world[3][0]; + camera2world[3][1] = _camera2world[3][1]; + camera2world[3][2] = _camera2world[3][2]; + camera2world[3][3] = _camera2world[3][3]; +#else +#define nVoxels _nVoxels +#define raster2camera _raster2camera +#define camera2world _camera2world +#endif + + const uniform 
int dx = 8, dy = 8; // must match value in volume_ispc_tasks + const uniform int xbuckets = (width + (dx-1)) / dx; + const uniform int ybuckets = (height + (dy-1)) / dy; + + const uniform int x0 = (taskIndex % xbuckets) * dx; + const uniform int y0 = (taskIndex / xbuckets) * dy; + const uniform int x1 = min(x0 + dx, width); + const uniform int y1 = min(y0 + dy, height); + + volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera, + camera2world, width, height, image); +} + + +export void +volume_ispc(uniform float density[], uniform int nVoxels[3], + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform int width, uniform int height, uniform float image[]) { + volume_tile(0, 0, width, height, density, nVoxels, raster2camera, + camera2world, width, height, image); +} + + +export void +volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform int width, uniform int height, uniform float image[]) { + // Launch tasks to work on (dx,dy)-sized tiles of the image + const uniform int dx = 8, dy = 8; + const uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy); + launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world, + width, height, image); + sync; +} diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 20221d90..69e537a1 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "../timing.h" #include "sort_ispc.h" @@ -45,26 +46,28 @@ using namespace ispc; extern void sort_serial (int n, unsigned int code[], int order[]); -/* progress bar by Ross Hemsley; - * http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */ -static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50) +static void progressBar(const int x, const int n, const int width = 50) { - if (n < 100) - { - x *= 100/n; - n = 100; - } + assert(n > 1); + assert(x >= 0 && x < n); + assert(width > 10); + const float f = static_cast(x)/(n-1); + const int w = static_cast(f * width); - if ((x != n) && (x % (n/100) != 0)) return; + // print bar + std::string bstr("["); + for (int i = 0; i < width; i++) + bstr += i < w ? 
'=' : ' '; + bstr += "]"; - using namespace std; - float ratio = x/(float)n; - int c = ratio * w; + // print percentage + char pstr0[32]; + sprintf(pstr0, " %2d %c ", static_cast(f*100.0),'%'); + const std::string pstr(pstr0); + std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2)); - cout << setw(3) << (int)(ratio*100) << "% ["; - for (int x=0; xfunc(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } + } } inline void diff --git a/examples/timing.h b/examples/timing.h index 8569d439..5254a787 100644 --- a/examples/timing.h +++ b/examples/timing.h @@ -58,6 +58,7 @@ __inline__ uint64_t rdtsc() { #ifdef WIN32 #include +double rtc(); #define rdtsc __rdtsc #else // WIN32 __inline__ uint64_t rdtsc() { @@ -72,14 +73,30 @@ __inline__ uint64_t rdtsc() { __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); return (uint64_t)high << 32 | low; } + +#include +static inline double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} + #endif // !WIN32 #endif // !__arm__ -static uint64_t start, end; +static uint64_t start, end; +static double tstart, tend; static inline void reset_and_start_timer() { start = rdtsc(); + tstart = rtc(); } /* Returns the number of millions of elapsed processor cycles since the @@ -89,3 +106,9 @@ static inline double get_elapsed_mcycles() end = rdtsc(); return (end-start) / (1024. * 1024.); } + +static inline double get_elapsed_msec() +{ + tend = rtc(); + return (tend - tstart)*1e3; +} diff --git a/examples/util/cuda_helpers.cuh b/examples/util/cuda_helpers.cuh new file mode 100644 index 00000000..e6257b75 --- /dev/null +++ b/examples/util/cuda_helpers.cuh @@ -0,0 +1,58 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount0 (gridDim.x*4) +#define taskIndex1 (blockIdx.y) +#define taskCount1 (gridDim.y) +#define taskIndex2 (blockIdx.z) +#define taskCount2 (gridDim.z) +#define taskIndex (taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2)) +#define taskCount (taskCount0*taskCount1*taskCount2) +#define warpIdx (threadIdx.x >> 5) +#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<>> +#define sync cudaDeviceSynchronize() +#define cif if +__device__ __forceinline__ static double __shfl(double x, int lane) +{ + return __hiloint2double( + __shfl_xor(__double2hiint(x), lane), + __shfl_xor(__double2loint(x), lane)); + +} +#define shuffle(x,y) __shfl(x,y) +#define broadcast(x,y) __shfl(x,y) diff --git a/examples/util/ispc_malloc.cpp b/examples/util/ispc_malloc.cpp new file mode 100644 index 00000000..7956b3d0 --- /dev/null +++ b/examples/util/ispc_malloc.cpp @@ -0,0 +1,87 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include "ispc_malloc.h" + +#ifdef _CUDA_ + +void * operator new(size_t size) throw(std::bad_alloc) +{ + void *ptr; + ispc_malloc(&ptr, size); + return ptr; +} +void operator delete(void *ptr) throw() +{ + ispc_free(ptr); +} + +#else + +void ispc_malloc(void **ptr, const size_t size) +{ + *ptr = malloc(size); +} +void ispc_free(void *ptr) +{ + free(ptr); +} +void ispc_memset(void *ptr, int value, size_t size) +{ + memset(ptr, value, size); +} +void ispcSetMallocHeapLimit(size_t value) +{ +} +void ispcSetStackLimit(size_t value) +{ +} +unsigned long long ispcGetMallocHeapLimit() +{ + return -1; +} +unsigned long long ispcGetStackLimit() +{ + return -1; +} +void * ispcMemcpy(void *dest, void *src, size_t num) +{ + memcpy(dest, src, num); + return dest; +} + +#endif diff --git a/examples/util/ispc_malloc.h b/examples/util/ispc_malloc.h new file mode 100644 index 00000000..9378d3d5 --- /dev/null +++ b/examples/util/ispc_malloc.h @@ -0,0 +1,43 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern void ispc_malloc(void **ptr, const size_t size); +extern void ispc_free(void *ptr); +extern void ispc_memset(void *ptr, int value, size_t size); +extern void ispcSetMallocHeapLimit(size_t value); +extern void ispcSetStackLimit(size_t value); +extern unsigned long long ispcGetMallocHeapLimit(); +extern unsigned long long ispcGetStackLimit(); +extern void * ispcMemcpy(void *dest, void *src, size_t num); diff --git a/examples/util/nvcc_helpers.cu b/examples/util/nvcc_helpers.cu new file mode 100644 index 00000000..cb5a18c9 --- /dev/null +++ b/examples/util/nvcc_helpers.cu @@ -0,0 +1,76 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _CUDA_ +#error "Something went wrong..." +#endif + +void ispc_malloc(void **ptr, const size_t size) +{ + cudaMallocManaged(ptr, size); +} +void ispc_free(void *ptr) +{ + cudaFree(ptr); +} +void ispc_memset(void *ptr, int value, size_t size) +{ + cudaMemset(ptr, value, size); +} +void ispcSetMallocHeapLimit(size_t value) +{ + cudaDeviceSetLimit(cudaLimitMallocHeapSize,value); +} +void ispcSetStackLimit(size_t value) +{ + cudaDeviceSetLimit(cudaLimitStackSize,value); +} +unsigned long long ispcGetMallocHeapLimit() +{ + size_t value; + cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize); + return value; +} +unsigned long long ispcGetStackLimit() +{ + size_t value; + cudaDeviceGetLimit(&value, cudaLimitStackSize); + return value; +} +void * ispcMemcpy(void *dest, void *src, size_t num) +{ + cudaMemcpy(dest, src, num, cudaMemcpyDefault); + return dest; +} + + diff --git a/expr.cpp b/expr.cpp index 80398ab5..69056426 100644 --- a/expr.cpp +++ b/expr.cpp @@ -7872,6 +7872,14 @@ SizeOfExpr::TypeCheck() { "struct type \"%s\".", type->GetString().c_str()); return NULL; } +#ifdef ISPC_NVPTX_ENABLED + if (type != NULL) + if (g->target->getISA() == Target::NVPTX && type->IsVaryingType()) + { + Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target."); + return NULL; + } +#endif /* ISPC_NVPTX_ENABLED */ return this; } @@ -8704,6 +8712,13 @@ NewExpr::TypeCheck() { AssertPos(pos, m->errorCount > 0); return NULL; } +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType()) + { + Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target."); + return NULL; + } +#endif /* ISPC_NVPTX_ENABLED */ if (CastType(allocType) != NULL) { Error(pos, "Can't dynamically allocate storage for declared " "but not defined type \"%s\".", allocType->GetString().c_str()); diff --git a/fail_db.txt b/fail_db.txt index b7c1ad74..46d29f3d 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -511,3 +511,1396 @@ ./tests/psubus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * ./tests/psubus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * ./tests/psubus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/acos.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/asin.ispc runfail x86-64 
knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/foreach-25.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/foreach-26.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/foreach-27.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int8-2.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int8-4.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gs-improve-progindex.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/aossoa-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/aossoa-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/aossoa-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/aossoa-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/aossoa-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/aossoa-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/aossoa-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/aossoa-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/atomics-13.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/atomics-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/atomics-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/atomics-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/avg-down-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/avg-down-uint16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/avg-down-uint8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/avg-up-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/avg-up-uint16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/avg-up-uint8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/bool-float-typeconv.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/broadcast-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/broadcast-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/double-consts.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/double-max-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/double-max.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/double-min-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/double-min.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/double-sqrt.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/foreach-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * 
+./tests/foreach-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/foreach-12.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/foreach-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/foreach-unique-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/funcptr-uniform-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/funcptr-varying-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/funcptr-varying-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-double-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-double-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-double-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-double-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-double-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-double-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-double-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-double-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int16-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int16-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int16-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int16-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int16-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int16-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int16-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int32-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int32-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int32-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int32-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int8-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int8-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int8-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/gather-int8-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/half-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/half-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/half-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/half.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/idiv.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/int16-wrap.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/int64-max-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/int64-max.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/int64-min-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/int64-min.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/int8-wrap.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/load-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/load-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/load-int8-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 
* +./tests/load-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/local-atomics-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/local-atomics-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/max-double-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/max-double-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/memcpy-varying.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/memmove-varying.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/memset-uniform.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/memset-varying.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/min-double-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/min-double-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/padds_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/padds_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/padds_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/paddus_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pass-varying-lvalue-to-ref.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pdivs_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pdivs_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pdivs_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pdivus_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pdivus_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pdivus_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pmuls_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pmuls_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pmuls_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pmulus_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pmulus_vi32.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pmulus_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/pmulus_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/popcnt-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/prefetch.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/psubs_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/psubs_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/psubs_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/psubus_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/ptr-15.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/ptr-22.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/ptr-24.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/ptr-25.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail 
x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/ptr-cmp-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-int-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-int.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-int64-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-int64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-int8-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-uint-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-uint.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-uint64-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-add-uint64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-max-int.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-max-int64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-max-uint.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-max-uint64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-min-int.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-min-int64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-min-uint.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/reduce-min-uint64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/rotate-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/rotate-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/rotate-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/rotate-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/scatter-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/scatter-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/scatter-int8-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/scatter-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shift-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle-flatten.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * 
+./tests/shuffle2-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2-9.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/shuffle2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/soa-16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/soa-17.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/soa-18.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/soa-21.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/soa-22.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/soa-23.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/soa-24.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/soa-25.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/store-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/store-int16-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/store-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/store-int8-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/store-int8-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/store-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/struct-nested-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/test-103.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/test-105.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/test-107.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/test-148.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/uint64-max-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/uint64-max.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/uint64-min-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/uint64-min.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O2 * +./tests/acos.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/asin.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/funcptr-varying-6.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/funcptr-varying-7.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/funcptr-varying-8.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ldexp-double.ispc runfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/aossoa-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/aossoa-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/aossoa-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/aossoa-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/aossoa-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/aossoa-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/aossoa-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/aossoa-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-12.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-13.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-14.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-3.ispc compfail x86-64 knc Linux LLVM 3.4 
icpc15.0 -O0 * +./tests/atomics-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-9.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/avg-down-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/avg-down-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/avg-down-uint16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/avg-down-uint8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/avg-up-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/avg-up-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/avg-up-uint16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/avg-up-uint8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/bool-float-typeconv.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/broadcast-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/broadcast-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/double-consts.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/double-max-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/double-max.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/double-min-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/double-min.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/double-sqrt.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/foreach-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/foreach-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/foreach-12.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/foreach-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/foreach-double-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/foreach-unique-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/frexp-double-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/frexp-double.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/funcptr-uniform-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-double-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-double-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * 
+./tests/gather-double-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-double-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-double-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-double-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-double-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-double-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int16-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int16-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int16-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int16-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int16-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int16-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int16-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int32-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int32-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int32-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int32-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int8-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int8-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int8-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int8-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int8-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int8-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int8-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/gather-int8-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/half-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/half-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/half-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/half.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/idiv.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/insert-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/int16-wrap.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/int64-max-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/int64-max.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/int64-min-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/int64-min.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/int8-wrap.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/load-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/load-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/load-int8-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/load-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/local-atomics-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/local-atomics-13.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/local-atomics-14.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * 
+./tests/local-atomics-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/local-atomics-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/local-atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/max-double-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/max-double-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/memcpy-varying.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/memmove-varying.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/memset-uniform.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/memset-varying.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/min-double-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/min-double-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/padds_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/padds_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/padds_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/paddus_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/paddus_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/paddus_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pass-varying-lvalue-to-ref.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pdivs_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pdivs_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pdivs_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pdivus_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pdivus_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pdivus_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pmuls_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pmuls_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pmuls_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pmulus_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pmulus_vi32.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pmulus_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/pmulus_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/popcnt-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/prefetch.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/psubs_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/psubs_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/psubs_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/psubus_vi64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-15.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-19.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-24.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-25.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-assign-lhs-math-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-cast-complex.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-cmp-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-int-1.ispc compfail x86-64 knc 
Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-int-null-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-null-func-arg.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/ptr-varying-unif-index.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-double-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-int-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-int.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-int64-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-int64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-int8-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-uint-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-uint.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-uint64-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-add-uint64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-max-int.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-max-int64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-max-uint.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-max-uint64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-min-int.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-min-int64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-min-uint.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/reduce-min-uint64.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/rotate-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/rotate-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/rotate-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/rotate-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/scatter-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/scatter-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/scatter-int8-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/scatter-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shift-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shift-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shift-3.ispc 
compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle-flatten.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-3.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-4.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-6.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-7.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2-9.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/shuffle2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/soa-16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/soa-17.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/soa-18.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/soa-19.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/soa-21.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/soa-22.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/soa-23.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/soa-24.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/soa-25.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/store-int16-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/store-int16-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/store-int16.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/store-int8-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/store-int8-2.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/store-int8.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/struct-nested-5.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/test-103.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/test-105.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/test-107.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/test-148.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/uint64-max-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/uint64-max.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/uint64-min-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/uint64-min.ispc compfail x86-64 knc Linux LLVM 3.4 icpc15.0 -O0 * +./tests/acos.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/asin.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/foreach-25.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/foreach-26.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/foreach-27.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int8-2.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int8-4.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * 
+./tests/gs-improve-progindex.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/ptr-diff-3.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/ptr-diff-5.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/ptr-diff-6.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/aossoa-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/aossoa-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/aossoa-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/aossoa-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/aossoa-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/aossoa-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/aossoa-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/aossoa-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/atomics-13.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/atomics-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/atomics-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/atomics-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/atomics-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/avg-down-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/avg-down-uint16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/avg-down-uint8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/avg-up-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/avg-up-uint16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/avg-up-uint8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/bool-float-typeconv.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/broadcast-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/broadcast-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/double-consts.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/double-max-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/double-max.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/double-min-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/double-min.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/double-sqrt.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/foreach-1.ispc 
compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/foreach-11.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/foreach-12.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/foreach-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/foreach-unique-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/funcptr-uniform-10.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/funcptr-varying-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/funcptr-varying-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-double-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-double-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-double-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-double-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-double-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-double-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-double-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-double-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int16-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int16-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int16-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int16-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int16-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int16-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int16-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int32-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int32-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int32-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int32-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int8-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int8-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int8-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/gather-int8-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/half-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/half-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/half-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/half.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/idiv.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/int16-wrap.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/int64-max-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/int64-max.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/int64-min-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/int64-min.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/int8-wrap.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/load-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/load-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * 
+./tests/load-int8-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/load-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/local-atomics-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/local-atomics-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/max-double-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/max-double-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/memcpy-varying.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/memmove-varying.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/memset-uniform.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/memset-varying.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/min-double-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/min-double-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/padds_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/padds_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/padds_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/paddus_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pass-varying-lvalue-to-ref.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pdivs_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pdivs_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pdivs_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pdivus_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pdivus_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pdivus_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmuls_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmuls_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmuls_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmulus_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmulus_vi32.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmulus_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/pmulus_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/popcnt-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/prefetch.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/psubs_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/psubs_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/psubs_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/psubus_vi64.ispc compfail 
x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/ptr-15.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/ptr-22.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/ptr-24.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/ptr-25.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/ptr-cmp-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-int-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-int.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-int64-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-int64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-int8-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-uint-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-uint.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-uint64-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-add-uint64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-max-int.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-max-int64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-max-uint.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-max-uint64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-min-int.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-min-int64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-min-uint.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/reduce-min-uint64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/rotate-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/rotate-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/rotate-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/rotate-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/scatter-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/scatter-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/scatter-int8-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/scatter-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shift-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle-flatten.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-10.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-11.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * 
+./tests/shuffle2-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2-9.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/shuffle2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/soa-16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/soa-17.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/soa-18.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/soa-21.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/soa-22.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/soa-23.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/soa-24.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/soa-25.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/store-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/store-int16-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/store-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/store-int8-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/store-int8-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/store-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/struct-nested-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/test-103.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/test-105.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/test-107.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/test-148.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/uint64-max-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/uint64-max.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/uint64-min-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/uint64-min.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O2 * +./tests/acos.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/asin.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/funcptr-varying-6.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/funcptr-varying-7.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/funcptr-varying-8.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ldexp-double.ispc runfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/aossoa-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/aossoa-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/aossoa-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/aossoa-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/aossoa-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/aossoa-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/aossoa-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/aossoa-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-10.ispc compfail x86-64 knc Linux LLVM 3.5 
icpc15.0 -O0 * +./tests/atomics-11.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-12.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-13.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-14.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-9.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/avg-down-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/avg-down-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/avg-down-uint16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/avg-down-uint8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/avg-up-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/avg-up-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/avg-up-uint16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/avg-up-uint8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/bool-float-typeconv.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/broadcast-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/broadcast-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/double-consts.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/double-max-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/double-max.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/double-min-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/double-min.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/double-sqrt.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/foreach-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/foreach-11.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/foreach-12.ispc compfail 
x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/foreach-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/foreach-double-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/foreach-unique-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/frexp-double-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/frexp-double.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/funcptr-uniform-10.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-double-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-double-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-double-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-double-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-double-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-double-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-double-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-double-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int16-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int16-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int16-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int16-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int16-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int16-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int16-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int32-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int32-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int32-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int32-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int8-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int8-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int8-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int8-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int8-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int8-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int8-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/gather-int8-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/half-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/half-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/half-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/half.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/idiv.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/insert-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/int16-wrap.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/int64-max-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/int64-max.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/int64-min-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/int64-min.ispc 
compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/int8-wrap.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/load-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/load-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/load-int8-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/load-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/local-atomics-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/local-atomics-13.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/local-atomics-14.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/local-atomics-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/local-atomics-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/local-atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/max-double-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/max-double-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/memcpy-varying.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/memmove-varying.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/memset-uniform.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/memset-varying.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/min-double-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/min-double-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/padds_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/padds_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/padds_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/paddus_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/paddus_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/paddus_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pass-varying-lvalue-to-ref.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pdivs_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pdivs_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pdivs_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pdivus_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pdivus_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pdivus_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pmuls_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pmuls_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pmuls_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pmulus_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pmulus_vi32.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pmulus_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/pmulus_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/popcnt-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/prefetch.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/psubs_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/psubs_vi64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/psubs_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/psubus_vi64.ispc compfail x86-64 knc Linux 
LLVM 3.5 icpc15.0 -O0 * +./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-15.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-19.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-24.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-25.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-assign-lhs-math-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-cast-complex.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-cmp-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-int-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-int-null-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-null-func-arg.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/ptr-varying-unif-index.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-double-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-int-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-int.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-int64-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-int64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-int8-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-uint-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-uint.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-uint64-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-add-uint64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-max-int.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-max-int64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-max-uint.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-max-uint64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-min-int.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-min-int64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-min-uint.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/reduce-min-uint64.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/rotate-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/rotate-4.ispc compfail 
x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/rotate-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/rotate-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/scatter-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/scatter-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/scatter-int8-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/scatter-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shift-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shift-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shift-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle-flatten.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-10.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-11.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-3.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-4.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-6.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-7.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2-9.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/shuffle2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/soa-16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/soa-17.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/soa-18.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/soa-19.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/soa-21.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/soa-22.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/soa-23.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/soa-24.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/soa-25.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/store-int16-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/store-int16-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/store-int16.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/store-int8-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/store-int8-2.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/store-int8.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/struct-nested-5.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/test-103.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/test-105.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/test-107.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/test-148.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/uint64-max-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/uint64-max.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * 
+./tests/uint64-min-1.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/uint64-min.ispc compfail x86-64 knc Linux LLVM 3.5 icpc15.0 -O0 * +./tests/acos.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/asin.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/foreach-25.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/foreach-26.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/foreach-27.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int8-2.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int8-4.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gs-improve-progindex.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/ptr-diff-3.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/ptr-diff-5.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/ptr-diff-6.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/aossoa-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/aossoa-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/aossoa-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/aossoa-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/aossoa-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/aossoa-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/aossoa-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/aossoa-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/atomics-13.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/atomics-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/atomics-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/atomics-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/atomics-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/avg-down-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/avg-down-uint16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/avg-down-uint8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/avg-up-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/avg-up-uint16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/avg-up-uint8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/bool-float-typeconv.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/broadcast-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/broadcast-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/count-leading-trailing-zeros-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/double-consts.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 
-O2 * +./tests/double-max-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/double-max.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/double-min-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/double-min.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/double-sqrt.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/foreach-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/foreach-11.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/foreach-12.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/foreach-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/foreach-unique-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/funcptr-uniform-10.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/funcptr-varying-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/funcptr-varying-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-double-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-double-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-double-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-double-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-double-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-double-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-double-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-double-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int16-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int16-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int16-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int16-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int16-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int16-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int16-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int32-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int32-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int32-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int32-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int8-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int8-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int8-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/gather-int8-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/half-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/half-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/half-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/half.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/idiv.ispc compfail 
x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/int16-wrap.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/int64-max-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/int64-max.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/int64-min-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/int64-min.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/int8-wrap.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/load-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/load-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/load-int8-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/load-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/local-atomics-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/local-atomics-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/max-double-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/max-double-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/memcpy-varying.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/memmove-varying.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/memset-uniform.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/memset-varying.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/min-double-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/min-double-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/padds_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/padds_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/padds_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/paddus_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pass-varying-lvalue-to-ref.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pdivs_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pdivs_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pdivs_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pdivus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pdivus_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pdivus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmuls_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmuls_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmuls_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmulus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmulus_vi32.ispc compfail x86-64 knc Linux LLVM 3.6 
icpc15.0 -O2 * +./tests/pmulus_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/pmulus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/popcnt-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/prefetch.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/psubs_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/psubs_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/psubs_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/psubus_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/ptr-15.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/ptr-22.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/ptr-24.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/ptr-25.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/ptr-cmp-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-int-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-int.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-int64-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-int64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-int8-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-uint-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-uint.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-uint64-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-add-uint64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-max-int.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-max-int64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-max-uint.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-max-uint64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-min-int.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-min-int64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-min-uint.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/reduce-min-uint64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/rotate-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/rotate-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/rotate-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/rotate-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/scatter-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/scatter-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/scatter-int8-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/scatter-int8.ispc compfail 
x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shift-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle-flatten.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-10.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-11.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2-9.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/shuffle2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/soa-16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/soa-17.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/soa-18.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/soa-21.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/soa-22.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/soa-23.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/soa-24.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/soa-25.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/store-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/store-int16-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/store-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/store-int8-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/store-int8-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/store-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/struct-nested-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/test-103.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/test-105.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/test-107.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/test-148.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/uint64-max-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/uint64-max.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/uint64-min-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/uint64-min.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O2 * +./tests/acos.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/asin.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/funcptr-varying-6.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/funcptr-varying-7.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/funcptr-varying-8.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ldexp-double.ispc runfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/aossoa-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * 
+./tests/aossoa-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/aossoa-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/aossoa-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/aossoa-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/aossoa-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/aossoa-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/aossoa-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-10.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-11.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-12.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-13.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-14.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-9.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/avg-down-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/avg-down-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/avg-down-uint16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/avg-down-uint8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/avg-up-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/avg-up-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/avg-up-uint16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/avg-up-uint8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/bool-float-typeconv.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/broadcast-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/broadcast-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/count-leading-trailing-zeros-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/double-consts.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/double-max-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/double-max.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/double-min-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/double-min.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/double-sqrt.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 
knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/foreach-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/foreach-11.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/foreach-12.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/foreach-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/foreach-double-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/foreach-unique-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/frexp-double-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/frexp-double.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/funcptr-uniform-10.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-double-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-double-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-double-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-double-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-double-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-double-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-double-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-double-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int16-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int16-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int16-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int16-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int16-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int16-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int16-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int32-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int32-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int32-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int32-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int8-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int8-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int8-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int8-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int8-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int8-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int8-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/gather-int8-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/half-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/half-2.ispc compfail x86-64 
knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/half-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/half.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/idiv.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/insert-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/int16-wrap.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/int64-max-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/int64-max.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/int64-min-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/int64-min.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/int8-wrap.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/load-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/load-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/load-int8-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/load-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/local-atomics-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/local-atomics-13.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/local-atomics-14.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/local-atomics-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/local-atomics-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/local-atomics-swap.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/max-double-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/max-double-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/memcpy-varying.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/memmove-varying.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/memset-uniform.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/memset-varying.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/min-double-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/min-double-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/padds_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/padds_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/padds_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/paddus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/paddus_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/paddus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pass-varying-lvalue-to-ref.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pdivs_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pdivs_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pdivs_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pdivus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pdivus_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pdivus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pmuls_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pmuls_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pmuls_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pmulus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pmulus_vi32.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * 
+./tests/pmulus_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/pmulus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/popcnt-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/prefetch.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/psubs_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/psubs_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/psubs_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/psubus_vi64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-15.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-19.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-24.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-25.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-assign-lhs-math-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-cast-complex.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-cmp-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-int-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-int-null-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-null-func-arg.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/ptr-varying-unif-index.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-double-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-int-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-int.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-int64-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-int64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-int8-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-uint-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-uint.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-uint64-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-add-uint64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-max-int.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * 
+./tests/reduce-max-int64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-max-uint.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-max-uint64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-min-int.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-min-int64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-min-uint.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/reduce-min-uint64.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/rotate-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/rotate-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/rotate-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/rotate-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/scatter-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/scatter-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/scatter-int8-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/scatter-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shift-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shift-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shift-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle-flatten.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-10.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-11.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-3.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-4.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-7.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2-9.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/shuffle2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/soa-16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/soa-17.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/soa-18.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/soa-19.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/soa-21.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/soa-22.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/soa-23.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/soa-24.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/soa-25.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/store-int16-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/store-int16-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/store-int16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/store-int8-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * 
+./tests/store-int8-2.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/store-int8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/struct-nested-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/test-103.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/test-105.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/test-107.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/test-148.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/uint64-max-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/uint64-max.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/uint64-min-1.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * +./tests/uint64-min.ispc compfail x86-64 knc Linux LLVM 3.6 icpc15.0 -O0 * diff --git a/func.cpp b/func.cpp index 7412c560..0cfc5ded 100644 --- a/func.cpp +++ b/func.cpp @@ -47,6 +47,9 @@ #include #if defined(LLVM_3_2) +#ifdef ISPC_NVPTX_ENABLED + #include +#endif /* ISPC_NVPTX_ENABLED */ #include #include #include @@ -54,6 +57,9 @@ #include #include #else +#ifdef ISPC_NVPTX_ENABLED + #include +#endif /* ISPC_NVPTX_ENABLED */ #include #include #include @@ -129,7 +135,11 @@ Function::Function(Symbol *s, Stmt *c) { sym->parentFunction = this; } - if (type->isTask) { + if (type->isTask +#ifdef ISPC_NVPTX_ENABLED + && (g->target->getISA() != Target::NVPTX) +#endif + ){ threadIndexSym = m->symbolTable->LookupVariable("threadIndex"); Assert(threadIndexSym); threadCountSym = m->symbolTable->LookupVariable("threadCount"); @@ -240,7 +250,11 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, #endif const FunctionType *type = CastType(sym->type); Assert(type != NULL); - if (type->isTask == true) { + if (type->isTask == true +#ifdef ISPC_NVPTX_ENABLED + && (g->target->getISA() != Target::NVPTX) +#endif + ){ // For tasks, there should always be three parameters: the // pointer to the structure that holds all of the arguments, the // thread index, and the thread count variables. 
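For reference, the task calling convention that the func.cpp hunk above tests for on CPU targets can be pictured as a minimal C++ sketch (illustrative only; the function and parameter names are hypothetical, the three-parameter shape follows the comment in Function::emitCode):

    // Lowered shape of a task-qualified function on CPU targets (sketch):
    //   args        - pointer to a struct holding all of the task's arguments
    //   threadIndex - index of the thread running this task instance
    //   threadCount - total number of threads available to the task system
    void example_task_entry(void *args, int threadIndex, int threadCount);

On the nvptx target the added guards above skip the threadIndex/threadCount symbol lookup entirely, so task functions are not given this extra-parameter form there.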
@@ -338,6 +352,18 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, ctx->SetFunctionMask(argIter); Assert(++argIter == function->arg_end()); } +#ifdef ISPC_NVPTX_ENABLED + if (type->isTask == true && g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(LLVMInt32(1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } +#endif /* ISPC_NVPTX_ENABLED */ } // Finally, we can generate code for the function @@ -499,6 +525,21 @@ Function::GenerateIR() { std::string functionName = sym->name; if (g->mangleFunctionsWithTarget) functionName += std::string("_") + g->target->GetISAString(); +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */ +#if 0 + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); +#endif + } +#endif /* ISPC_NVPTX_ENABLED */ llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); appFunction->setDoesNotThrow(); @@ -536,6 +577,18 @@ Function::GenerateIR() { FATAL("Function verificication failed"); } } +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(appFunction); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } +#endif /* ISPC_NVPTX_ENABLED */ } } } diff --git a/ispc.cpp b/ispc.cpp index ad1bd455..cf310f89 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -243,6 +243,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : arch = "arm"; else #endif +#ifdef ISPC_NVPTX_ENABLED + if(!strncmp(isa, "nvptx", 5)) + arch = "nvptx64"; + else +#endif /* ISPC_NVPTX_ENABLED */ arch = "x86-64"; } @@ -582,6 +587,23 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskBitCount = 32; } #endif +#ifdef ISPC_NVPTX_ENABLED + else if (!strcasecmp(isa, "nvptx")) + { + this->m_isa = Target::NVPTX; + this->m_cpu = "sm_35"; + this->m_nativeVectorWidth = 32; + this->m_nativeVectorAlignment = 32; + this->m_vectorWidth = 1; + this->m_hasHalf = true; + this->m_maskingIsFree = true; + this->m_maskBitCount = 1; + this->m_hasTranscendentals = true; + this->m_hasTrigonometry = true; + this->m_hasGather = this->m_hasScatter = false; + cpuFromIsa = "sm_35"; + } +#endif /* ISPC_NVPTX_ENABLED */ else { Error(SourcePos(), "Target \"%s\" is unknown. 
Choices are: %s.", isa, SupportedTargets()); @@ -679,6 +701,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : "i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-" "f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128"; } +#ifdef ISPC_NVPTX_ENABLED + else if (m_isa == Target::NVPTX) + { + dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; + } +#endif // 3. Finally set member data m_dataLayout = new llvm::DataLayout(dl_string); @@ -695,6 +723,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; +#ifdef ISPC_NVPTX_ENABLED + if (m_isa != Target::NVPTX) +#endif attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( @@ -742,6 +773,9 @@ Target::SupportedTargets() { return #ifdef ISPC_ARM_ENABLED "neon-i8x16, neon-i16x8, neon-i32x4, " +#endif +#ifdef ISPC_NVPTX_ENABLED + "nvptx, " #endif "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " @@ -777,6 +811,10 @@ Target::GetTripleString() const { triple.setArchName("i386"); else if (m_arch == "x86-64") triple.setArchName("x86_64"); +#ifdef ISPC_NVPTX_ENABLED + else if (m_arch == "nvptx64") + triple = llvm::Triple("nvptx64", "nvidia", "cuda"); +#endif /* ISPC_NVPTX_ENABLED */ else triple.setArchName(m_arch); } @@ -809,6 +847,10 @@ Target::ISAToString(ISA isa) { return "avx2"; case Target::GENERIC: return "generic"; +#ifdef ISPC_NVPTX_ENABLED + case Target::NVPTX: + return "nvptx"; +#endif /* ISPC_NVPTX_ENABLED */ default: FATAL("Unhandled target in ISAToString()"); } @@ -847,6 +889,10 @@ Target::ISAToTargetString(ISA isa) { return "avx2-i32x8"; case Target::GENERIC: return "generic-4"; +#ifdef ISPC_NVPTX_ENABLED + case Target::NVPTX: + return "nvptx"; +#endif /* ISPC_NVPTX_ENABLED */ default: FATAL("Unhandled target in ISAToTargetString()"); } diff --git a/ispc.h b/ispc.h index 9f9447f8..3c917cce 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.7.1dev" +#define ISPC_VERSION "1.8.1dev" #if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) && !defined(LLVM_3_6) #error "Only LLVM 3.2, 3.3, 3.4, 3.5 and the 3.6 development branch are supported" @@ -176,6 +176,9 @@ public: also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. 
*/ enum ISA { +#ifdef ISPC_NVPTX_ENABLED + NVPTX, +#endif #ifdef ISPC_ARM_ENABLED NEON32, NEON16, NEON8, #endif diff --git a/main.cpp b/main.cpp index 3510d548..90b263ff 100644 --- a/main.cpp +++ b/main.cpp @@ -321,6 +321,13 @@ int main(int Argc, char *Argv[]) { LLVMInitializeARMTargetMC(); #endif +#ifdef ISPC_NVPTX_ENABLED + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXAsmPrinter(); + LLVMInitializeNVPTXTargetMC(); +#endif /* ISPC_NVPTX_ENABLED */ + char *file = NULL; const char *headerFileName = NULL; const char *outFileName = NULL; diff --git a/module.cpp b/module.cpp index d9b5ed34..a20d2297 100644 --- a/module.cpp +++ b/module.cpp @@ -58,6 +58,9 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED +#include +#endif /* ISPC_NVPTX_ENABLED */ #ifdef ISPC_IS_WINDOWS #include #include @@ -71,6 +74,9 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED + #include "llvm/Assembly/AssemblyAnnotationWriter.h" +#endif /* ISPC_NVPTX_ENABLED */ #else #include #include @@ -78,6 +84,13 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+ + #include +#else + #include +#endif +#endif /* ISPC_NVPTX_ENABLED */ #endif #include #include @@ -443,6 +456,39 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE return; } +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX && +#if 0 + !type->IsConstType() && +#endif +#if 1 + at != NULL && +#endif + type->IsVaryingType()) + { + Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target."); + return; +#if 0 + int nel = 32; /* warp-size */ + if (type->IsArrayType()) + { + const ArrayType *at = CastType(type); + /* we must scale # elements by 4, because a thread-block will run 4 warps + * or 128 threads. + * ***note-to-me***:please define these value (128threads/4warps) + * in nvptx-target definition + * instead of compile-time constants + */ + nel *= at->GetElementCount(); + assert (!type->IsSOAType()); + type = new ArrayType(at->GetElementType()->GetAsUniformType(), nel); + } + else + type = new ArrayType(type->GetAsUniformType(), nel); +#endif + } +#endif /* ISPC_NVPTX_ENABLED */ + llvm::Type *llvmType = type->LLVMType(g->ctx); if (llvmType == NULL) return; @@ -653,6 +699,22 @@ lCheckExportedParameterTypes(const Type *type, const std::string &name, } } +#ifdef ISPC_NVPTX_ENABLED +static void +lCheckTaskParameterTypes(const Type *type, const std::string &name, + SourcePos pos) { + if (g->target->getISA() != Target::NVPTX) + return; + if (lRecursiveCheckValidParamType(type, false) == false) { + if (CastType(type)) + Error(pos, "Vector-typed parameter \"%s\" is illegal in a task " + "function with \"nvptx\" target.", name.c_str()); + else + Error(pos, "Varying parameter \"%s\" is illegal in a task function with \"nvptx\" target.", + name.c_str()); + } +} +#endif /* ISPC_NVPTX_ENABLED */ /** Given a function type, loop through the function parameters and see if any are StructTypes. If so, issue an error; this is currently broken @@ -810,7 +872,12 @@ Module::AddFunctionDeclaration(const std::string &name, #else // LLVM 3.3+ function->addFnAttr(llvm::Attribute::AlwaysInline); #endif + if (functionType->isTask) +#ifdef ISPC_NVPTX_ENABLED + /* evghenii: fails function verification when "if" executed in nvptx target */ + if (g->target->getISA() != Target::NVPTX) +#endif /* ISPC_NVPTX_ENABLED */ // This also applies transitively to members I think? 
function->setDoesNotAlias(1); @@ -827,6 +894,15 @@ Module::AddFunctionDeclaration(const std::string &name, functionType->GetReturnType()->IsVoidType() == false) Error(pos, "Task-qualified functions must have void return type."); +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX && + Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false && + functionType->isExported) + { + Error(pos, "Export-qualified functions must have void return type with \"nvptx\" target."); + } +#endif /* ISPC_NVPTX_ENABLED */ + if (functionType->isExported || functionType->isExternC) lCheckForStructParameters(functionType, pos); @@ -847,6 +923,12 @@ Module::AddFunctionDeclaration(const std::string &name, lCheckExportedParameterTypes(argType, argName, argPos); } +#ifdef ISPC_NVPTX_ENABLED + if (functionType->isTask) { + lCheckTaskParameterTypes(argType, argName, argPos); + } +#endif /* ISPC_NVPTX_ENABLED */ + // ISPC assumes that no pointers alias. (It should be possible to // specify when this is not the case, but this should be the // default.) Set parameter attributes accordingly. (Only for @@ -968,10 +1050,26 @@ Module::writeOutput(OutputType outputType, const char *outFileName, const char *fileType = NULL; switch (outputType) { case Asm: +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + if (strcasecmp(suffix, "ptx")) + fileType = "assembly"; + } + else +#endif /* ISPC_NVPTX_ENABLED */ if (strcasecmp(suffix, "s")) fileType = "assembly"; break; case Bitcode: +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + if (strcasecmp(suffix, "ll")) + fileType = "LLVM assembly"; + } + else +#endif /* ISPC_NVPTX_ENABLED */ if (strcasecmp(suffix, "bc")) fileType = "LLVM bitcode"; break; @@ -1042,6 +1140,84 @@ Module::writeOutput(OutputType outputType, const char *outFileName, return writeObjectFileOrAssembly(outputType, outFileName); } +#ifdef ISPC_NVPTX_ENABLED +typedef std::vector vecString_t; +static vecString_t +lSplitString(const std::string &s) +{ + std::stringstream ss(s); + std::istream_iterator begin(ss); + std::istream_iterator end; + return vecString_t(begin,end); +} + +static void +lFixAttributes(const vecString_t &src, vecString_t &dst) +{ + dst.clear(); + + std::vector< std::pair > attributePos; + + typedef std::map attributeMap_t; + attributeMap_t attributeMap; + + for (vecString_t::const_iterator it = src.begin(); it != src.end(); it++) + { + const vecString_t words = lSplitString(*it); + if (!words.empty() && words[0] == "attributes" && words[1][0] == '#') + { + const int nWords = words.size(); + assert(nWords > 3); + assert(words[2 ] == "="); + assert(words[3 ] == "{"); + assert(words[nWords-1] == "}"); + std::string attributes; + for (int w = 4; w < nWords-1; w++) + attributes += words[w] + " "; + attributeMap[words[1]] = attributes; + } + } + for (vecString_t::const_iterator it = src.begin(); it != src.end(); it++) + { + vecString_t words = lSplitString(*it); + if (words.size() > 1 && (words[0] == "target" && words[1] == "datalayout")) + { + std::string s = "target datalayout = "; + s += '"'; + s += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; + s += '"'; + dst.push_back(s); + continue; + } + if (!words.empty() && words[0] == "attributes") + continue; + std::string s; + std::map attributeSet; +#if 1 /* this attributed cannot be used in function parametrers, so remove them */ + attributeSet["readnone"] = " "; + attributeSet["readonly"] 
= " "; + attributeSet["readnone,"] = ","; + attributeSet["readonly,"] = ","; +#endif + + + for (vecString_t::iterator w = words.begin(); w != words.end(); w++) + { + if (attributeSet.find(*w) != attributeSet.end()) + *w = attributeSet[*w]; + + if ((*w)[0] == '#') + { + attributeMap_t::iterator m = attributeMap.find(*w); + assert (m != attributeMap.end()); + *w = attributeMap[*w]; + } + s += *w + " "; + } + dst.push_back(s); + } +} +#endif /* ISPC_NVPTX_ENABLED */ bool Module::writeBitcode(llvm::Module *module, const char *outFileName) { @@ -1066,7 +1242,47 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) { } llvm::raw_fd_ostream fos(fd, (fd != 1), false); - llvm::WriteBitcodeToFile(module, fos); +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + /* when using "nvptx" target, emit patched/hacked assembly + * NVPTX only accepts 3.2-style LLVM assembly, where attributes + * must be inlined, rather then referenced by #attribute_d + * As soon as NVVM support 3.3,3.4 style assembly this fix won't be needed + */ + const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; + module->setDataLayout(dl_string); + + std::string s; + llvm::raw_string_ostream out(s); +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+ + std::unique_ptr Annotator; +#else + llvm::OwningPtr Annotator; +#endif + module->print(out, Annotator.get()); + std::istringstream iss(s); + + vecString_t input,output; + while (std::getline(iss,s)) + input.push_back(s); + output = input; + +#if !(defined(LLVM_3_1) || defined(LLVM_3_2)) + /* do not fix attributed with LLVM 3.2, everything is fine there */ + lFixAttributes(input,output); +#endif + + for (vecString_t::iterator it = output.begin(); it != output.end(); it++) + { + *it += "\n"; + fos << *it; + } + } + else +#endif /* ISPC_NVPTX_ENABLED */ + llvm::WriteBitcodeToFile(module, fos); + return true; } @@ -2117,6 +2333,28 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre opts.addMacroDef(g->cppArgs[i].substr(2)); } } +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + opts.addMacroDef("__NVPTX__"); + opts.addMacroDef("programIndex=__programIndex()"); +#if 1 + opts.addMacroDef("cif=if"); + opts.addMacroDef("cfor=for"); + opts.addMacroDef("cwhile=while"); + opts.addMacroDef("ccontinue=continue"); + opts.addMacroDef("cdo=do"); +#endif + opts.addMacroDef("taskIndex0=__taskIndex0()"); + opts.addMacroDef("taskIndex1=__taskIndex1()"); + opts.addMacroDef("taskIndex2=__taskIndex2()"); + opts.addMacroDef("taskIndex=__taskIndex()"); + opts.addMacroDef("taskCount0=__taskCount0()"); + opts.addMacroDef("taskCount1=__taskCount1()"); + opts.addMacroDef("taskCount2=__taskCount2()"); + opts.addMacroDef("taskCount=__taskCount()"); + } +#endif /* ISPC_NVPTX_ENABLED */ inst.getLangOpts().LineComment = 1; @@ -2563,6 +2801,30 @@ lCreateDispatchModule(std::map &functions) return module; } +#ifdef ISPC_NVPTX_ENABLED +static std::string lCBEMangle(const std::string &S) { + std::string Result; + + for (unsigned i = 0, e = S.size(); i != e; ++i) { + if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') || + (S[i] == '<' && S[i+1] == '<'))) { + Result += '_'; + Result += 'A'+(S[i]&15); + Result += 'A'+((S[i]>>4)&15); + Result += '_'; + i++; + } else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') { + Result += S[i]; + } else { + Result += '_'; + Result += 
'A'+(S[i]&15); + Result += 'A'+((S[i]>>4)&15); + Result += '_'; + } + } + return Result; +} +#endif /* ISPC_NVPTX_ENABLED */ int Module::CompileAndOutput(const char *srcFile, @@ -2586,6 +2848,32 @@ Module::CompileAndOutput(const char *srcFile, m = new Module(srcFile); if (m->CompileFile() == 0) { +#ifdef ISPC_NVPTX_ENABLED + /* NVPTX: + * for PTX target replace '.' with '_' in all global variables + * a PTX identifier name must match [a-zA-Z$_][a-zA-Z$_0-9]* + */ + if (g->target->getISA() == Target::NVPTX) + { + /* mangle global variables names */ + { + llvm::Module::global_iterator I = m->module->global_begin(), E = m->module->global_end(); + for (; I != E; I++) + I->setName(lCBEMangle(I->getName())); + } + + /* mangle functions names */ + { + llvm::Module::iterator I = m->module->begin(), E = m->module->end(); + for (; I != E; I++) + { + std::string str = I->getName(); + if (str.find("operator") != std::string::npos) + I->setName(lCBEMangle(str)); + } + } + } +#endif /* ISPC_NVPTX_ENABLED */ if (outputType == CXX) { if (target == NULL || strncmp(target, "generic-", 8) != 0) { Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" " diff --git a/opt.cpp b/opt.cpp index 2715c0fc..135a7c8c 100644 --- a/opt.cpp +++ b/opt.cpp @@ -55,6 +55,9 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED + #include +#endif /* ISPC_NVPTX_ENABLED */ #else #include #include @@ -62,6 +65,9 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED + #include +#endif /* ISPC_NVPTX_ENABLED */ #endif #if !defined(LLVM_3_2) && !defined(LLVM_3_3) // LLVM 3.4+ #include @@ -129,6 +135,9 @@ static llvm::Pass *CreateDebugPass(char * output); static llvm::Pass *CreateReplaceStdlibShiftPass(); static llvm::Pass *CreateFixBooleanSelectPass(); +#ifdef ISPC_NVPTX_ENABLED +static llvm::Pass *CreatePromoteLocalToPrivatePass(); +#endif /* ISPC_NVPTX_ENABLED */ #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ @@ -498,6 +507,12 @@ Optimize(llvm::Module *module, int optLevel) { // take the various __pseudo_* functions it has emitted and turn // them into something that can actually execute. 
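The two helpers added above can be exercised with small sketches (hypothetical inputs; expected output is approximate, not taken from the patch). First, the attribute-inlining fix applied in Module::writeBitcode for the nvptx target: lFixAttributes expands "#N" attribute-group references in the printed LLVM assembly and drops the trailing "attributes #N = { ... }" lines, since NVVM only accepts the older inline form, as the comment in writeBitcode explains.

    // Sketch of lFixAttributes on a hypothetical two-line module dump:
    std::vector<std::string> in, out;
    in.push_back("define void @f() #0 {");
    in.push_back("attributes #0 = { nounwind readnone }");
    lFixAttributes(in, out);
    // out holds a single line, roughly: "define void @f() nounwind readnone {"
    // i.e. the "#0" group is expanded in place and the attributes line is removed.

Second, the PTX name mangling applied in Module::CompileAndOutput: since a PTX identifier must match [a-zA-Z$_][a-zA-Z$_0-9]*, lCBEMangle escapes any other character as '_' plus two letters derived from the character's byte value.

    // Sketch of lCBEMangle on a hypothetical global name containing '.':
    std::string mangled = lCBEMangle("soa.data");   // yields "soa_OC_data"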
optPM.add(CreateImproveMemoryOpsPass(), 100); +#ifdef ISPC_NVPTX_ENABLED + if (g->opt.disableGatherScatterOptimizations == false && + g->target->getVectorWidth() > 1) +#endif /* ISPC_NVPTX_ENABLED */ + optPM.add(CreateImproveMemoryOpsPass(), 100); + if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); @@ -576,7 +591,12 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createGlobalOptimizerPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createIPConstantPropagationPass()); - optPM.add(CreateReplaceStdlibShiftPass(),229); + +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() != Target::NVPTX) +#endif /* ISPC_NVPTX_ENABLED */ + optPM.add(CreateReplaceStdlibShiftPass(),229); + optPM.add(llvm::createDeadArgEliminationPass(),230); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); @@ -688,6 +708,113 @@ Optimize(llvm::Module *module, int optLevel) { // Should be the last optPM.add(CreateFixBooleanSelectPass(), 400); +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + optPM.add(CreatePromoteLocalToPrivatePass()); + optPM.add(llvm::createGlobalDCEPass()); + + optPM.add(llvm::createTypeBasedAliasAnalysisPass()); + optPM.add(llvm::createBasicAliasAnalysisPass()); + optPM.add(llvm::createCFGSimplificationPass()); + // Here clang has an experimental pass SROAPass instead of + // ScalarReplAggregatesPass. We should add it in the future. + optPM.add(llvm::createScalarReplAggregatesPass()); + optPM.add(llvm::createEarlyCSEPass()); + optPM.add(llvm::createLowerExpectIntrinsicPass()); + optPM.add(llvm::createTypeBasedAliasAnalysisPass()); + optPM.add(llvm::createBasicAliasAnalysisPass()); + + // Early optimizations to try to reduce the total amount of code to + // work with if we can + optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createCFGSimplificationPass()); + + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createAggressiveDCEPass()); + + + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + + // On to more serious optimizations + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createGlobalOptimizerPass()); + optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createIPConstantPropagationPass()); + + optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPruneEHPass()); + optPM.add(llvm::createFunctionAttrsPass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createCFGSimplificationPass()); + + optPM.add(llvm::createArgumentPromotionPass()); +#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) + // Starting from 3.4 this functionality was moved to + // InstructionCombiningPass. See r184459 for details. 
+ optPM.add(llvm::createSimplifyLibCallsPass()); +#endif + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createJumpThreadingPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createTailCallEliminationPass()); + + optPM.add(llvm::createInstructionCombiningPass()); + + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createConstantPropagationPass()); + + optPM.add(llvm::createInstructionCombiningPass()); + + optPM.add(llvm::createIPSCCPPass()); + optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createArgumentPromotionPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createLoopRotatePass()); + optPM.add(llvm::createLICMPass()); +// optPM.add(llvm::createLoopUnswitchPass(false)); +#if 1 + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createIndVarSimplifyPass()); + optPM.add(llvm::createLoopIdiomPass()); + optPM.add(llvm::createLoopDeletionPass()); + optPM.add(llvm::createLoopUnrollPass()); + optPM.add(llvm::createGVNPass()); + optPM.add(llvm::createMemCpyOptPass()); + optPM.add(llvm::createSCCPPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createJumpThreadingPass()); + optPM.add(llvm::createCorrelatedValuePropagationPass()); + optPM.add(llvm::createDeadStoreEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createStripDeadPrototypesPass()); + optPM.add(llvm::createGlobalDCEPass()); + optPM.add(llvm::createConstantMergePass()); +#endif + } +#endif /* ISPC_NVPTX_ENABLED */ } // Finish up by making sure we didn't mess anything up in the IR along @@ -5379,4 +5506,94 @@ CreateFixBooleanSelectPass() { return new FixBooleanSelectPass(); } +#ifdef ISPC_NVPTX_ENABLED +/////////////////////////////////////////////////////////////////////////////// +// Detect addrspace(3) +/////////////////////////////////////////////////////////////////////////////// + +class PromoteLocalToPrivatePass: public llvm::BasicBlockPass +{ + public: + static char ID; // Pass identification, replacement for typeid + PromoteLocalToPrivatePass() : BasicBlockPass(ID) {} + + bool runOnBasicBlock(llvm::BasicBlock &BB); +}; + +char PromoteLocalToPrivatePass::ID = 0; + +bool +PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB) +{ + std::vector Allocas; + + bool modifiedAny = false; + +#if 1 +restart: + for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) + { + llvm::Instruction *inst = &*I; + if (llvm::CallInst *ci = llvm::dyn_cast(inst)) + { + llvm::Function *func = ci->getCalledFunction(); + if (func && func->getName() == "llvm.trap") + { + std::vector funcTyArgs; + llvm::FunctionType *funcTy = llvm::FunctionType::get( + /*Result=*/llvm::Type::getVoidTy(*g->ctx), + /*Params=*/funcTyArgs, + /*isVarArg=*/false); + llvm::InlineAsm *trap_ptx = llvm::InlineAsm::get(funcTy, "trap;", "", false); + assert(trap_ptx != NULL); + 
llvm::Instruction *trap_call = llvm::CallInst::Create(trap_ptx); + assert(trap_call != NULL); + llvm::ReplaceInstWithInst(ci, trap_call); + modifiedAny = true; + goto restart; + } + } + } +#endif + +#if 0 + llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var"); + + // Find allocas that are safe to promote, by looking at all instructions in + // the entry node + for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) + { + llvm::Instruction *inst = &*I; + if (llvm::CallInst *ci = llvm::dyn_cast(inst)) + { + llvm::Function *func = ci->getCalledFunction(); + if (cvtFunc && (cvtFunc == func)) + { +#if 0 + fprintf(stderr , "--found cvt-- name= %s \n", + I->getName().str().c_str()); +#endif + llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci); + assert(alloca != NULL); +#if 0 + const int align = 8; // g->target->getNativeVectorAlignment(); + alloca->setAlignment(align); +#endif + ci->replaceAllUsesWith(alloca); + modifiedAny = true; + } + } + } +#endif + return modifiedAny; +} + +static llvm::Pass * +CreatePromoteLocalToPrivatePass() { + return new PromoteLocalToPrivatePass(); +} + + + +#endif /* ISPC_NVPTX_ENABLED */ diff --git a/ptxtools/.gitignore b/ptxtools/.gitignore new file mode 100644 index 00000000..428bf32d --- /dev/null +++ b/ptxtools/.gitignore @@ -0,0 +1,6 @@ +*.hh +*.cc +*.o +ptxcc +ptxgen +ptxgrammar.output diff --git a/ptxtools/Makefile b/ptxtools/Makefile new file mode 100644 index 00000000..1cb3a8d2 --- /dev/null +++ b/ptxtools/Makefile @@ -0,0 +1,81 @@ +# +# Copyright (c) 2014, Evghenii Gaburov +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
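This Makefile builds two small helper tools: ptxcc, assembled from the flex lexer (ptx.ll), the bison grammar (ptxgrammar.yy) and the driver (ptxcc.cpp), and ptxgen, a thin libNVVM front end. As a rough sketch of how the generated pieces fit together at run time, the driver constructs a PTXLexer over the input PTX stream and a PTXParser that writes CUDA stubs to the output stream, then runs the bison-generated ptx::yyparse() until the input is exhausted. The file names below are placeholders; the real driver in ptxcc.cpp later in this patch adds argument handling and the nvcc -dryrun post-processing on top of essentially this loop:

    #include <fstream>
    #include <iostream>
    #include "PTXParser.h"   // also pulls in PTXLexer.h and the generated ptxgrammar.hh

    // Minimal sketch of the ptxcc parsing loop; error handling is omitted.
    int main() {
        std::ifstream inputPTX("kernel.ptx");      // PTX emitted by ispc/ptxgen
        std::ofstream outputCU("kernel.cu");       // CUDA stub source consumed by nvcc

        parser::PTXLexer lexer(&inputPTX, &std::cerr);  // tokens come from the flex scanner
        parser::PTXParser state(outputCU);              // semantic actions print CUDA stubs

        do {
            ptx::yyparse(lexer, state);                 // bison-generated parser
        } while (!inputPTX.eof());
        return 0;
    }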
+ +all: ptxcc ptxgen + +CXX=clang++ +CXXFLAGS += -O3 +CXXFLAGS += -I/opt/local/include + +LD=clang++ +LDFLAGS += -L/opt/local/lib + +FLEX=flex +BISON=bison + +CUDATK=/usr/local/cuda +LIBDEVICE_MAJOR=1 +LIBDEVICE_MINOR=0 + +ptxgrammar.cc : ptxgrammar.yy + $(BISON) -d -v -t ptxgrammar.yy -o ptxgrammar.cc + +ptx.cc: ptx.ll ptxgrammar.cc + $(FLEX) -t ptx.ll > ptx.cc + +%.o: %.cc + $(CXX) $(CXXFLAGS) -c $< -o $@ + +%.o: %.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + +OBJ= ptxcc.o \ + ptx.o \ + ptxgrammar.o + +ptxcc: $(OBJ) + $(LD) $(LDFLAGS) $^ -o $@ + +ptxgen: ptxgen.cpp + $(CXX) $(CXXFLAGS) -o $@ $< \ + -L$(CUDATK)/nvvm/lib64 -lnvvm \ + -I$(CUDATK)/nvvm/include \ + -I$(CUDATK)/include \ + -DLIBDEVICE_MAJOR_VERSION=$(LIBDEVICE_MAJOR) \ + -DLIBDEVICE_MINOR_VERSION=$(LIBDEVICE_MINOR) \ + -DLIBNVVM_HOME=$(CUDATK)/nvvm -Wl,-rpath,$(CUDATK)/nvvm/lib64 + +clean: + /bin/rm -f ptxgen ptxcc $(OBJ) ptxgrammar.hh ptxgrammar.cc ptx.cc ptxgrammar.output + +$(OBJ): ptxgrammar.cc ptx.cc PTXParser.h PTXLexer.h + diff --git a/ptxtools/PTXLexer.h b/ptxtools/PTXLexer.h new file mode 100644 index 00000000..dd1f8504 --- /dev/null +++ b/ptxtools/PTXLexer.h @@ -0,0 +1,77 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + Based on GPU Ocelot PTX parser : https://code.google.com/p/gpuocelot/ + */ + +#pragma once + +#include +#include + +namespace parser +{ + class PTXLexer; + class PTXParser; +} + +#include "ptxgrammar.hh" + +namespace parser +{ + /*! 
\brief A wrapper around yyFlexLexer to allow for a local variable */ + class PTXLexer : public ptxFlexLexer + { + public: + YYSTYPE* yylval; + int column; + int nextColumn; + + public: + PTXLexer( std::istream* arg_yyin, + std::ostream* arg_yyout ) : + yyFlexLexer( arg_yyin, arg_yyout ), yylval( 0 ), column( 0 ), + nextColumn( 0 ) { } + + int yylex(); + int yylexPosition() + { + int token = yylex(); + column = nextColumn; + nextColumn = column + strlen( YYText() ); + return token; + } + + }; +} diff --git a/ptxtools/PTXParser.h b/ptxtools/PTXParser.h new file mode 100644 index 00000000..6f8d1a81 --- /dev/null +++ b/ptxtools/PTXParser.h @@ -0,0 +1,291 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + Based on GPU Ocelot PTX parser : https://code.google.com/p/gpuocelot/ + */ + +#pragma once + +#undef yyFlexLexer +#define yyFlexLexer ptxFlexLexer +#include + + +#include "PTXLexer.h" + +#include +#include +#include +namespace ptx +{ + extern int yyparse( parser::PTXLexer&, parser::PTXParser& ); +} + +namespace parser +{ + /*! 
\brief An implementation of the Parser interface for PTX */ + class PTXParser + { + private: + typedef int token_t; + std::ostream &out; + std::string _identifier; + token_t _dataTypeId; + int _alignment; + + bool isArgumentList, isReturnArgumentList; + struct argument_t + { + token_t type; + std::string name; + int dim; + + argument_t(const token_t _type, const std::string &_name, const int _dim = 1) : + type(_type), name(_name), dim(_dim) {} + }; + std::vector argumentList, returnArgumentList; + std::vector arrayDimensionsList; + + public: + PTXParser(std::ostream &_out) : out(_out) + { + isArgumentList = isReturnArgumentList = false; + _alignment = 1; + } + + void printHeader() + { + std::stringstream s; +#if 0 + s << "template struct __align__(N) b8_t { unsigned char _v[N]; __device__ b8_t() {}; __device__ b8_t (const int value) {}}; \n"; + s << "template struct __align__(2*N) b16_t { unsigned short _v[N]; __device__ b16_t() {}; __device__ b16_t(const int value) {}}; \n"; +#else + s << "template struct b8_t { unsigned char _v[N]; __device__ b8_t() {}; __device__ b8_t (const int value) {}}; \n"; + s << "template struct b16_t { unsigned short _v[N]; __device__ b16_t() {}; __device__ b16_t(const int value) {}}; \n"; +#endif + s << "struct b8d_t { unsigned char _v[1]; }; \n"; + s << "struct b16d_t { unsigned short _v[1]; }; \n"; + + s << "typedef unsigned int b32_t; \n"; + s << "typedef unsigned int u32_t; \n"; + s << "typedef int s32_t; \n"; + + s << "typedef unsigned long long b64_t; \n"; + s << "typedef unsigned long long u64_t; \n"; + s << "typedef long long s64_t; \n"; + + s << "typedef float f32_t; \n"; + s << "typedef double f64_t; \n"; + s << " \n"; + out << s.str(); + } + +#define LOC YYLTYPE& location + + void identifier(const std::string &s) { _identifier = s; } + void dataTypeId(const token_t token) { _dataTypeId = token; } + void argumentListBegin(LOC) { isArgumentList = true; } + void argumentListEnd (LOC) { isArgumentList = false; } + void returnArgumentListBegin(LOC) { isReturnArgumentList = true; } + void returnArgumentListEnd (LOC) { isReturnArgumentList = false; } + void argumentDeclaration(LOC) + { + assert(arrayDimensionsList.size() <= 1); + const int dim = arrayDimensionsList.empty() ? 
1 : arrayDimensionsList[0]; + const argument_t arg(_dataTypeId, _identifier, dim); + if (isArgumentList) + argumentList.push_back(arg); + else if (isReturnArgumentList) + returnArgumentList.push_back(arg); + else + assert(0); + arrayDimensionsList.clear(); + } + void alignment(const int value) { _alignment = value; } + + void arrayDimensions(const int value) + { + arrayDimensionsList.push_back(value); + } + + std::string printArgument(const argument_t arg, const bool printDataType = true) + { + std::stringstream s; + if (printDataType) + s << tokenToDataType(arg.type, arg.dim) << " "; + s << arg.name << " "; + return s.str(); + } + + std::string printArgumentList(const bool printDataType = true) + { + std::stringstream s; + if (argumentList.empty()) return s.str(); + const int n = argumentList.size(); + s << " " << printArgument(argumentList[0], printDataType); + for (int i = 1; i < n; i++) + s << ",\n " << printArgument(argumentList[i], printDataType); + return s.str(); + } + + void visibleEntryDeclaration(const std::string &calleeName, LOC) + { + std::stringstream s; + assert(returnArgumentList.empty()); + s << "extern \"C\" \n"; + s << "__global__ void " << calleeName << " (\n"; + s << printArgumentList(); + s << "\n ) { asm(\" // entry \"); }\n"; + + + /* check if this is an "export" entry */ + const int entryNameLength = calleeName.length(); + const int hostNameLength = std::max(entryNameLength-9,0); + const std::string ___export(&calleeName.c_str()[hostNameLength]); + if (___export.compare("___export") == 0) + { + std::string hostCalleeName; + hostCalleeName.append(calleeName.c_str(), hostNameLength); + s << "/*** host interface ***/\n"; + s << "extern \"C\" \n"; + s << "__host__ void " << hostCalleeName << " (\n"; + s << printArgumentList(); + s << "\n )\n"; + s << "{\n "; +// s << " cudaFuncSetCacheConfig (" << calleeName << ", "; + s << " cudaDeviceSetCacheConfig ("; +#if 1 + s << " cudaFuncCachePreferEqual "; +#elif 1 + s << " cudaFuncCachePreferL1 "; +#else + s << " cudaFuncCachePreferShared "; +#endif + s << ");\n"; + s << calleeName; + s << "<<<1,32>>>(\n"; + s << printArgumentList(false); + s << ");\n"; + s << " cudaDeviceSynchronize(); \n"; + s << "}\n"; + } + s << "\n"; + argumentList.clear(); + + out << s.str(); + } + + void visibleFunctionDeclaration(const std::string &calleeName, LOC) + { + std::stringstream s; + assert(returnArgumentList.size() < 2); + s << "extern \"C\" \n"; + s << "__device__ "; + if (returnArgumentList.empty()) + s << " void "; + else + s << " " << tokenToDataType(returnArgumentList[0].type, returnArgumentList[0].dim); + s << calleeName << " (\n"; + s << printArgumentList(); + + if (returnArgumentList.empty()) + s << "\n ) { asm(\" // function \"); }\n\n"; + else + { + s << "\n ) { asm(\" // function \"); return 0;} /* return value to disable warnings */\n\n"; +// s << "\n ) { asm(\" // function \"); } /* this will generate warrning */\n\n"; + } + + argumentList.clear(); + returnArgumentList.clear(); + + out << s.str(); + } + + void visibleInitializableDeclaration(const std::string &name, LOC) + { + assert(arrayDimensionsList.size() == 1); + std::stringstream s; + s << "extern \"C\" __device__ "; + if (_alignment > 0) + s << "__attribute__((aligned(" << _alignment << "))) "; + s << tokenToDataType(_dataTypeId, 0); + if (arrayDimensionsList[0] == 0) + s << name << ";\n\n"; + else + s << name << "[" << arrayDimensionsList[0] << "] = {0};\n\n"; + out << s.str(); + arrayDimensionsList.clear(); + } + +#undef LOC + + std::string tokenToDataType( token_t 
token , int dim) + { + std::stringstream s; + switch( token ) + { + case TOKEN_B8: + if (dim > 0) s << "b8_t<"< "; + else s << "b8d_t "; + break; + case TOKEN_U8: assert(0); s << "u8_t "; break; + case TOKEN_S8: assert(0); s << "s8_t "; break; + + case TOKEN_B16: + if (dim > 0) s << "b16_t<"< "; + else s << "b16d_t "; + break; + case TOKEN_U16: assert(0); s << "u16_t "; break; + case TOKEN_S16: assert(0); s << "s16_t "; break; + + case TOKEN_B32: assert(dim <= 1); s << "b32_t "; break; + case TOKEN_U32: assert(dim <= 1); s << "u32_t "; break; + case TOKEN_S32: assert(dim <= 1); s << "s32_t "; break; + + case TOKEN_B64: assert(dim <= 1); s << "b64_t "; break; + case TOKEN_U64: assert(dim <= 1); s << "u64_t "; break; + case TOKEN_S64: assert(dim <= 1); s << "s64_t "; break; + + case TOKEN_F32: assert(dim <= 1); s << "f32_t "; break; + case TOKEN_F64: assert(dim <= 1); s << "f64_t "; break; + default: std::cerr << "token= " << token<< std::endl; assert(0); + } + + return s.str(); + } + }; +} + + diff --git a/ptxtools/ptx.ll b/ptxtools/ptx.ll new file mode 100644 index 00000000..56e45ae9 --- /dev/null +++ b/ptxtools/ptx.ll @@ -0,0 +1,115 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +/* + Based on GPU Ocelot PTX parser : https://code.google.com/p/gpuocelot/ + */ + +%option yylineno +%option noyywrap +%option yyclass="parser::PTXLexer" +%option prefix="ptx" +%option c++ + +%{ +#include "PTXLexer.h" +#include +#include +#include +#ifdef LLSETTOKEN +#error "TOKEN is defined" +#endif +#define LLSETTOKEN(tok) yylval->ivalue = tok; return tok; +%} + +COMMENT ("//"[^\n]*) +TAB [\t]* + +%% +{COMMENT} {nextColumn += strlen(yytext); /* lCppComment(&yylloc); */ } +".version" { return TOKEN_VERSION; } +".target" { return TOKEN_TARGET; } +".address_size" { return TOKEN_ADDRESS_SIZE; } +".func" { return TOKEN_FUNC; } +".entry" { return TOKEN_ENTRY; } +".align" { return TOKEN_ALIGN; } +".visible" { return TOKEN_VISIBLE; } +".global" { return TOKEN_GLOBAL; } +".param" { return TOKEN_PARAM; } +".b0" { LLSETTOKEN( TOKEN_B32);} /* fix for buggy llvm-ptx generator */ +".b8" { LLSETTOKEN( TOKEN_B8);} +".b16" { LLSETTOKEN( TOKEN_B16);} +".b32" { LLSETTOKEN( TOKEN_B32);} +".b64" { LLSETTOKEN( TOKEN_B64);} +".u8" { LLSETTOKEN( TOKEN_U8);} +".u16" { LLSETTOKEN( TOKEN_U16);} +".u32" { LLSETTOKEN( TOKEN_U32);} +".u64" { LLSETTOKEN( TOKEN_U64);} +".s8" { LLSETTOKEN( TOKEN_S8);} +".s16" { LLSETTOKEN( TOKEN_S16);} +".s32" { LLSETTOKEN( TOKEN_S32);} +".s64" { LLSETTOKEN( TOKEN_S64);} +".f32" { LLSETTOKEN( TOKEN_F32);} +".f64" { LLSETTOKEN( TOKEN_F64);} +"[" { return '[';} +"]" { return ']';} +"(" { return '(';} +")" { return ')';} +"," { return ',';} +";" { return ';';} +"=" { return '=';} +[0-9]+\.[0-9]+ { yylval->fvalue = atof(yytext); return TOKEN_FLOAT; } +[0-9]+ { yylval->ivalue = atoi(yytext); return TOKEN_INT; } +[a-zA-Z0-9_]+ { strcpy(yylval->svalue, yytext); return TOKEN_STRING;} +\n { + // yylloc.last_line++; +// yylloc.last_column = 1; + nextColumn = 1; +} +. ; +%% + +/** Handle a C++-style comment--eat everything up until the end of the line. + */ +#if 0 +static void +lCppComment(SourcePos *pos) { + char c; + do { + c = yyinput(); + } while (c != 0 && c != '\n'); + if (c == '\n') { + pos->last_line++; + pos->last_column = 1; + } +} +#endif diff --git a/ptxtools/ptxcc.cpp b/ptxtools/ptxcc.cpp new file mode 100644 index 00000000..474ab3ff --- /dev/null +++ b/ptxtools/ptxcc.cpp @@ -0,0 +1,312 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + Based on GPU Ocelot PTX parser : https://code.google.com/p/gpuocelot/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include "PTXParser.h" + + +/* + * The C++ code below is based on the following bash-script: + #!/bin/sh + + PTXSRC=$1__tmp_ptx.ptx + PTXCU=$1___tmp_ptx.cu + PTXSH=$1___tmp_ptx.sh + + NVCCPARM=${@:2} + + DEPTX=dePTX + NVCC=nvcc + + $(cat $1 | sed 's/\.b0/\.b32/g' > $PTXSRC) && + $DEPTX < $PTXSRC > $PTXCU && + $NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \ + sed 's/\#\$//g'| \ + awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH && + sh $PTXSH + + # rm $PTXCU $PTXSH + * + */ + +static char lRandomAlNum() +{ + const char charset[] = + "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + const size_t max_index = (sizeof(charset) - 1); + return charset[ rand() % max_index ]; +} + +static std::string lRandomString(const size_t length) +{ + timeval t1; + gettimeofday(&t1, NULL); + srand(t1.tv_usec * t1.tv_sec); + std::string str(length,0); + std::generate_n( str.begin(), length, lRandomAlNum); + return str; +} + +static void lGetAllArgs(int Argc, char *Argv[], int &argc, char *argv[128]) +{ + // Copy over the command line arguments (passed in) + for (int i = 0; i < Argc; ++i) + argv[i] = Argv[i]; + argc = Argc; +} +const char *lGetExt (const char *fspec) +{ + const char *e = strrchr (fspec, '.'); + return e; +} + +static std::vector lSplitString(const std::string &s, char delim) +{ + std::vector elems; + std::stringstream ss(s); + std::string item; + while (std::getline(ss, item, delim)) { + if (!item.empty()) + elems.push_back(item); + } + return elems; +} + +static void lUsage(const int ret) +{ + fprintf(stdout, "\nusage: ptxcc [options] file.ptx \n"); + fprintf(stdout, " [--help]\t\t\t\t This help\n"); + fprintf(stdout, " [--verbose]\t\t\t\t Be verbose\n"); + fprintf(stdout, " [--arch={%s}]\t\t\t GPU target architecture\n", "sm_35"); + fprintf(stdout, " [-o ]\t\t\t\t Output file name\n"); + fprintf(stdout, " [-Xnvcc=]\t\t Arguments to pass through to \"nvcc\"\n"); + fprintf(stdout, " \n"); + exit(ret); +} + +int main(int _argc, char * _argv[]) +{ + int argc; + char *argv[128]; + lGetAllArgs(_argc, _argv, argc, argv); + + std::string arch="sm_35"; + std::string filePTX; + std::string fileOBJ; + std::string extString = ".ptx"; + bool keepTemporaries = false; + bool verbose = false; + std::string nvccArguments; + + for (int i = 1; i < argc; ++i) + { + if (!strcmp(argv[i], "--help")) + lUsage(0); + else if (!strncmp(argv[i], "--arch=", 7)) + arch = std::string(argv[i]+7); + else if (!strncmp(argv[i], "--keep-temporaries", 11)) + keepTemporaries = true; + else if (!strncmp(argv[i], "--verbose", 9)) + verbose = true; + else if (!strncmp(argv[i], "-Xnvcc=", 7)) + nvccArguments = std::string(argv[i]+7); + else if (!strcmp(argv[i], "-o")) + { + if (++i == argc) + { + fprintf(stderr, "No output file specified after -o 
option.\n"); + lUsage(1); + } + fileOBJ = std::string(argv[i]); + } + else + { + const char * ext = strrchr(argv[i], '.'); + if (ext == NULL) + { + fprintf(stderr, " Unknown argument: %s \n", argv[i]); + lUsage(1); + } + else if (strncmp(ext, extString.c_str(), 4)) + { + fprintf(stderr, " Unkown extension of the input file: %s \n", ext); + lUsage(1); + } + else if (filePTX.empty()) + { + filePTX = std::string(argv[i]); + if (fileOBJ.empty()) + { + char * baseName = argv[i]; + while (baseName != ext) + fileOBJ += std::string(baseName++,1); + } + fileOBJ += ".o"; + } + } + } +#if 0 + fprintf(stderr, " fileOBJ= %s\n", fileOBJ.c_str()); + fprintf(stderr, " arch= %s\n", arch.c_str()); + fprintf(stderr, " file= %s\n", filePTX.empty() ? "$stdin" : filePTX.c_str()); + fprintf(stderr, " num_args= %d\n", (int)nvccArgumentList.size()); + for (int i= 0; i < (int)nvccArgumentList.size(); i++) + fprintf(stderr, " arg= %d : %s \n", i, nvccArgumentList[i].c_str()); +#endif + assert(arch == std::string("sm_35")); + if (filePTX.empty()) + { + fprintf(stderr, "ptxcc fatal : No input file specified; use option --help for more information\n"); + exit(1); + } + + // open a file handle to a particular file: + std::ifstream inputPTX(filePTX.c_str()); + if (!inputPTX) + { + fprintf(stderr, "ptxcc: error: %s: No such file\n", filePTX.c_str()); + exit(1); + } + + std::string randomBaseName = std::string("/tmp/") + lRandomString(8) + "_" + lSplitString(lSplitString(filePTX,'/').back(),'.')[0]; + if (verbose) + fprintf(stderr, "baseFileName= %s\n", randomBaseName.c_str()); + + std::string fileCU= randomBaseName + ".cu"; + std::ofstream outputCU(fileCU.c_str()); + assert(outputCU); + + std::istream & input = inputPTX; + std::ostream & output = outputCU; + std::ostream & error = std::cerr; + parser::PTXLexer lexer(&input, &error); + parser::PTXParser state(output); + + // parse through the input until there is no more: + // + + do { + ptx::yyparse(lexer, state); + } + while (!input.eof()); + + inputPTX.close(); + outputCU.close(); + + // process output from nvcc + // + /* nvcc -dc -arch=$arch -dryrun -argumentlist fileCU */ + + std::string fileSH= randomBaseName + ".sh"; + + std::string nvccExe("nvcc"); + std::string nvccCmd; + nvccCmd += nvccExe + std::string(" "); + nvccCmd += "-dc "; + nvccCmd += std::string("-arch=") + arch + std::string(" "); + nvccCmd += "-dryrun "; + nvccCmd += nvccArguments + std::string(" "); + nvccCmd += std::string("-o ") + fileOBJ + std::string(" "); + nvccCmd += fileCU + std::string(" "); + nvccCmd += std::string("2> ") + fileSH; + if (verbose) + fprintf(stderr , "%s\n", nvccCmd.c_str()); + const int nvccRet = std::system(nvccCmd.c_str()); + if (nvccRet) + fprintf(stderr, "FAIL: %s\n", nvccCmd.c_str()); + + + std::ifstream inputSH(fileSH.c_str()); + assert(inputSH); + std::vector nvccSteps; + while (!inputSH.eof()) + { + nvccSteps.push_back(std::string()); + std::getline(inputSH, nvccSteps.back()); + if (nvccRet) + fprintf(stderr, " %s\n", nvccSteps.back().c_str()); + } + inputSH.close(); + if (nvccRet) + exit(-1); + + + for (int i = 0; i < (int)nvccSteps.size(); i++) + { + std::string cmd = nvccSteps[i]; + for (int j = 0; j < (int)cmd.size()-1; j++) + if (cmd[j] == '#' && cmd[j+1] == '$') + cmd[j] = cmd[j+1] = ' '; + std::vector splitCmd = lSplitString(cmd, ' '); + + if (!splitCmd.empty()) + { + if (splitCmd[0] == std::string("cicc")) + cmd = std::string(" cp ") + filePTX + std::string(" ") + splitCmd.back(); + if (splitCmd[0] == std::string("LIBRARIES=")) + cmd = ""; + } + nvccSteps[i] = 
cmd; + if (verbose) + fprintf(stderr, "%3d: %s\n", i, cmd.c_str()); + const int ret = std::system(cmd.c_str()); + if (ret) + { + fprintf(stderr, " Something went wrong .. \n"); + for (int j = 0; j < i; j++) + fprintf(stderr, "PASS: %s\n", nvccSteps[j].c_str()); + fprintf(stderr, "FAIL: %s\n", nvccSteps[i].c_str()); + exit(-1); + } + } + + if (!keepTemporaries) + { + /* remove temporaries */ + } + + + +} diff --git a/ptxtools/ptxgen.cpp b/ptxtools/ptxgen.cpp new file mode 100644 index 00000000..0dc64194 --- /dev/null +++ b/ptxtools/ptxgen.cpp @@ -0,0 +1,444 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + Based on "ptxgen" NVVM example from CUDA Toolkit + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +template +static std::string lValueToString(const T& value) +{ + std::ostringstream oss; + oss << value; + return oss.str(); +} + +typedef struct stat Stat; + + +#define PTXGENStatus int +enum { + PTXGEN_SUCCESS = 0x0000, + PTXGEN_FILE_IO_ERROR = 0x0001, + PTXGEN_BAD_ALLOC_ERROR = 0x0002, + PTXGEN_LIBNVVM_COMPILATION_ERROR = 0x0004, + PTXGEN_LIBNVVM_ERROR = 0x0008, + PTXGEN_INVALID_USAGE = 0x0010, + PTXGEN_LIBNVVM_HOME_UNDEFINED = 0x0020, + PTXGEN_LIBNVVM_VERIFICATION_ERROR = 0x0040 +}; + +static PTXGENStatus getLibDeviceName(const int computeArch, std::string &libDeviceName) +{ + const char *env = getenv("LIBNVVM_HOME"); +#ifdef LIBNVVM_HOME +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + const std::string libnvvmPath(env ? env : TOSTRING(LIBNVVM_HOME)); +#undef TOSTRING +#undef STRINGIFY +#else + const std::string libnvvmPath(env); +#endif + + if (libnvvmPath.empty()) + { + fprintf(stderr, "The environment variable LIBNVVM_HOME is undefined\n"); + return PTXGEN_LIBNVVM_HOME_UNDEFINED; + } + + /* Use libdevice for compute_20, if the target is not compute_20, compute_30, + * or compute_35. */ + const std::string libdevice = + std::string("/libdevice/libdevice.compute_") + + lValueToString(computeArch)+ "." 
+ + lValueToString(LIBDEVICE_MAJOR_VERSION) + + lValueToString(LIBDEVICE_MINOR_VERSION) + + ".bc"; + + libDeviceName = libnvvmPath + libdevice; + + return PTXGEN_SUCCESS; +} + +static PTXGENStatus addFileToProgram(const std::string &filename, nvvmProgram prog) +{ + char *buffer; + size_t size; + Stat fileStat; + + /* Open the input file. */ + FILE *f = fopen(filename.c_str(), "rb"); + if (f == NULL) { + fprintf(stderr, "Failed to open %s\n", filename.c_str()); + return PTXGEN_FILE_IO_ERROR; + } + + /* Allocate buffer for the input. */ + fstat(fileno(f), &fileStat); + buffer = (char *) malloc(fileStat.st_size); + if (buffer == NULL) { + fprintf(stderr, "Failed to allocate memory\n"); + return PTXGEN_BAD_ALLOC_ERROR; + } + size = fread(buffer, 1, fileStat.st_size, f); + if (ferror(f)) { + fprintf(stderr, "Failed to read %s\n", filename.c_str()); + fclose(f); + free(buffer); + return PTXGEN_FILE_IO_ERROR; + } + fclose(f); + + if (nvvmAddModuleToProgram(prog, buffer, size, filename.c_str()) != NVVM_SUCCESS) { + fprintf(stderr, + "Failed to add the module %s to the compilation unit\n", + filename.c_str()); + free(buffer); + return PTXGEN_LIBNVVM_ERROR; + } + + free(buffer); + return PTXGEN_SUCCESS; +} + +static PTXGENStatus generatePTX( + std::vector nvvmOptions, + std::vector nvvmFiles, + std::ostream &out, + const int computeArch) +{ + nvvmProgram prog; + PTXGENStatus status; + + /* Create the compiliation unit. */ + if (nvvmCreateProgram(&prog) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to create the compilation unit\n"); + return PTXGEN_LIBNVVM_ERROR; + } + + + /* Add libdevice. */ + std::string libDeviceName; + status = getLibDeviceName(computeArch, libDeviceName); + if (status != PTXGEN_SUCCESS) + { + nvvmDestroyProgram(&prog); + return status; + } + status = addFileToProgram(libDeviceName, prog); + if (status != PTXGEN_SUCCESS) + { + fprintf(stderr, "Please double-check LIBNVVM_HOME environmental variable.\n"); + nvvmDestroyProgram(&prog); + return status; + } + + /* Add the module to the compilation unit. */ + for (int i = 0; i < (int)nvvmFiles.size(); ++i) + { + status = addFileToProgram(nvvmFiles[i], prog); + if (status != PTXGEN_SUCCESS) + { + nvvmDestroyProgram(&prog); + return status; + } + } + + const int numOptions = nvvmOptions.size(); + std::vector options(numOptions); + for (int i = 0; i < numOptions; i++) + options[i] = nvvmOptions[i].c_str(); + + /* Verify the compilation unit. */ + if (nvvmVerifyProgram(prog, numOptions, &options[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to verify the compilation unit\n"); + status |= PTXGEN_LIBNVVM_VERIFICATION_ERROR; + } + + /* Print warnings and errors. */ + { + size_t logSize; + if (nvvmGetProgramLogSize(prog, &logSize) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the compilation log size\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + std::string log(logSize,0); + if (nvvmGetProgramLog(prog, &log[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the compilation log\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + fprintf(stderr, "%s\n", log.c_str()); + } + } + } + + if (status & PTXGEN_LIBNVVM_VERIFICATION_ERROR) + { + nvvmDestroyProgram(&prog); + return status; + } + + /* Compile the compilation unit. 
*/ + if (nvvmCompileProgram(prog, numOptions, &options[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to generate PTX from the compilation unit\n"); + status |= PTXGEN_LIBNVVM_COMPILATION_ERROR; + } + else + { + size_t ptxSize; + if (nvvmGetCompiledResultSize(prog, &ptxSize) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the PTX output size\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + std::string ptx(ptxSize,0); + if (nvvmGetCompiledResult(prog, &ptx[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the PTX output\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + out << ptx; + } + } + } + + /* Print warnings and errors. */ + { + size_t logSize; + if (nvvmGetProgramLogSize(prog, &logSize) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the compilation log size\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + std::string log(logSize,0); + if (nvvmGetProgramLog(prog, &log[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the compilation log\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + fprintf(stderr, "%s\n", log.c_str()); + } + } + } + + /* Release the resources. */ + nvvmDestroyProgram(&prog); + + return PTXGEN_SUCCESS; +} + +static void showUsage() +{ + fprintf(stderr,"Usage: ptxgen [OPTION]... [FILE]...\n" + " [FILE] could be a .bc file or a .ll file\n"); +} + +static void lUsage(const int ret) +{ + fprintf(stdout, "\nusage: ptxgen [options] file.[ll,bc] \n"); + fprintf(stdout, " [--help]\t\t This help\n"); + fprintf(stdout, " [--verbose]\t\t Be verbose\n"); + fprintf(stdout, " [--arch={%s}]\t GPU target architecture\n", "sm_35"); + fprintf(stdout, " [-o ]\t\t Output file name\n"); + fprintf(stdout, " [-g]\t\t Enable generation of debuggin information \n"); + fprintf(stdout, " [--opt=]\t\t Optimization parameters \n"); + fprintf(stdout, " \t\t\t 0 - disable optimizations \n"); + fprintf(stdout, " \t\t\t 3 - defalt, enable optimizations \n"); + fprintf(stdout, " [--ftz=]\t\t Flush-to-zero mode when performsing single-precision floating-point operations\n"); + fprintf(stdout, " \t\t\t 0 - default, preserve denormal values\n"); + fprintf(stdout, " \t\t\t 1 - flush denormal values to zero\n"); + fprintf(stdout, " [--prec-sqrt=]\t Precision mode for single-precision floating-point square root\n"); + fprintf(stdout, " \t\t\t 0 - use a faster approximation\n"); + fprintf(stdout, " \t\t\t 1 - default, use IEEE round-to-nearest mode\n"); + fprintf(stdout, " [--prec-div=]\t Precision mode for single-precision floating-point division and reciprocals\n"); + fprintf(stdout, " \t\t\t 0 - use a faster approximation\n"); + fprintf(stdout, " \t\t\t 1 - default, use IEEE round-to-nearest mode\n"); + fprintf(stdout, " [--fma=]\t\t FMA contraction mode \n"); + fprintf(stdout, " \t\t\t 0 - disable\n"); + fprintf(stdout, " \t\t\t 1 - default, enable\n"); + fprintf(stdout, " [--use_fast_math]\t Make use of fast maih. 
Implies --ftz=1 --prec-div=0 --prec-sqrt=0\n"); + fprintf(stdout, " \n"); + exit(ret); +} + +int main(int argc, char *argv[]) +{ + int _opt = 3; + int _ftz = 0; + int _precSqrt = 1; + int _precDiv = 1; + int _fma = 1; + bool _useFastMath = false; + bool _debug = false; + bool _verbose = false; + std::string _arch = "sm_35"; + std::string fileIR, filePTX; + + for (int i = 1; i < argc; ++i) + { + if (!strcmp(argv[i], "--help")) + lUsage(0); + else if (!strncmp(argv[i], "--arch=", 7)) + _arch = std::string(argv[i]+7); + else if (!strncmp(argv[i], "-g", 2)) + _debug = true; + else if (!strncmp(argv[i], "--verbose", 9)) + _verbose = true; + else if (!strncmp(argv[i], "--opt=", 6)) + _opt = atoi(argv[i]+6); + else if (!strncmp(argv[i], "--ftz=", 6)) + _ftz = atoi(argv[i]+6); + else if (!strncmp(argv[i], "--prec-sqrt=", 12)) + _precSqrt = atoi(argv[i]+12); + else if (!strncmp(argv[i], "--prec-div=", 11)) + _precDiv = atoi(argv[i]+11); + else if (!strncmp(argv[i], "--fma=", 6)) + _fma = atoi(argv[i]+6); + else if (!strncmp(argv[i], "--use_fast_math", 15)) + _useFastMath = true; + else if (!strcmp(argv[i], "-o")) + { + if (++i == argc) + { + fprintf(stderr, "No output file specified after -o option.\n"); + lUsage(1); + } + filePTX = std::string(argv[i]); + } + else + { + const char * ext = strrchr(argv[i], '.'); + if (ext == NULL) + { + fprintf(stderr, " Unknown argument: %s \n", argv[i]); + lUsage(1); + } + else if (strncmp(ext, ".ll", 3) && strncmp(ext, ".bc", 3)) + { + fprintf(stderr, " Unkown extension of the input file: %s \n", ext); + lUsage(1); + } + else if (filePTX.empty()) + { + fileIR = std::string(argv[i]); + if (filePTX.empty()) + { + char * baseName = argv[i]; + while (baseName != ext) + filePTX += std::string(baseName++,1); + } + filePTX += ".ptx"; + } + } + } + + if (fileIR.empty()) + { + fprintf(stderr, "ptxgen fatal : No input file specified; use option --help for more information\n"); + exit(1); + } + +#if 0 + fprintf(stderr, "fileIR= %s\n", fileIR.c_str()); + fprintf(stderr, "filePTX= %s\n", filePTX.c_str()); + fprintf(stderr, "arch= %s\n", _arch.c_str()); + fprintf(stderr, "debug= %s\n", _debug ? "true" : "false"); + fprintf(stderr, "verbose= %s\n", _verbose ? "true" : "false"); + fprintf(stderr, "opt= %d\n", _opt); + fprintf(stderr, "ftz= %d\n", _ftz); + fprintf(stderr, "prec-sqrt= %d\n", _precSqrt); + fprintf(stderr, "prec-div= %d\n", _precDiv); + fprintf(stderr, "fma= %d\n", _fma); + fprintf(stderr, "use_fast_math= %s\n", _useFastMath ? "true" : "false"); +#endif + + int computeArch = 35; + assert(_arch == std::string("sm_35")); + + if (_useFastMath) + { + _ftz = 1; + _precSqrt = _precDiv = 0; + } + + std::vector nvvmOptions; + nvvmOptions.push_back("-arch=compute_35"); + nvvmOptions.push_back("-ftz=" + lValueToString(_ftz)); + nvvmOptions.push_back("-prec-sqrt=" + lValueToString(_precSqrt)); + nvvmOptions.push_back("-prec-div=" + lValueToString(_precDiv)); + nvvmOptions.push_back("-fma=" + lValueToString(_fma)); + if (_debug) + nvvmOptions.push_back("-g"); + + std::vector nvvmFiles; + nvvmFiles.push_back(fileIR); + + std::ofstream outputPTX(filePTX.c_str()); + assert(outputPTX); + + const int ret = generatePTX(nvvmOptions, nvvmFiles, outputPTX, computeArch); + outputPTX.open(filePTX.c_str()); + return ret; +} + diff --git a/ptxtools/ptxgrammar.yy b/ptxtools/ptxgrammar.yy new file mode 100644 index 00000000..13120739 --- /dev/null +++ b/ptxtools/ptxgrammar.yy @@ -0,0 +1,250 @@ +/* + Copyright (c) 2014, Evghenii Gaburov + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + Based on GPU Ocelot PTX parser : https://code.google.com/p/gpuocelot/ + */ +%locations + +%{ + #include + #include "PTXParser.h" + #include "PTXLexer.h" + #include + #include + #include + #include + + #define YYERROR_VERBOSE 1 + + #ifdef REPORT_BASE + #undef REPORT_BASE + #endif + + #define REPORT_BASE 0 + + namespace ptx + { + + int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state ); + void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state, char const* message ); + + std::string yyTypeToString( int ); + +%} + +%union +{ + char svalue[1024]; + double fvalue; + int ivalue; + unsigned int uvalue; +} + +%parse-param {parser::PTXLexer& lexer} +%parse-param {parser::PTXParser& state} +%lex-param {parser::PTXLexer& lexer} +%lex-param {parser::PTXParser& state} +%pure-parser + +// define the constant-string tokens: +%token TOKEN_VERSION TOKEN_TARGET TOKEN_ADDRESS_SIZE +%token TOKEN_VISIBLE TOKEN_FUNC TOKEN_ENTRY +%token TOKEN_PARAM TOKEN_ALIGN +%token TOKEN_GLOBAL +%token TOKEN_B8 TOKEN_B16 TOKEN_B32 TOKEN_B64 +%token TOKEN_U8 TOKEN_U16 TOKEN_U32 TOKEN_U64 +%token TOKEN_S8 TOKEN_S16 TOKEN_S32 TOKEN_S64 +%token TOKEN_F32 TOKEN_F64 + +// define the "terminal symbol" token types I'm going to use (in CAPS +// by convention), and associate each with a field of the union: +%token TOKEN_INT +%token TOKEN_FLOAT +%token TOKEN_STRING + +%type identifier +%type arrayDimensionSet +%type alignment + +%start ptxsource + +%% +// the first rule defined is the highest-level rule, which in our +// case is just the concept of a whole "snazzle file": +ptxsource: + header ptxbody; + +header: + version target address_size +{ +// std::cerr << "Done reading PTX \n" << std::endl; + state.printHeader(); +}; + +version: + TOKEN_VERSION TOKEN_FLOAT { assert($2 >= 3.0); } ;//std::cerr << "Reading PTX version " << $2 << std::endl; }; +target: + TOKEN_TARGET TOKEN_STRING { assert(std::string($2) == std::string("sm_35")); } //std::cerr << "Target " << $2 << std::endl; }; +address_size: + 
TOKEN_ADDRESS_SIZE TOKEN_INT { assert($2 == 64); } //std::cerr << "Address_Size " << $2 << std::endl; }; + + +dataTypeId : + TOKEN_U8 | TOKEN_U16 | TOKEN_U32 | TOKEN_U64 + | TOKEN_S8 | TOKEN_S16 | TOKEN_S32 | TOKEN_S64 + | TOKEN_B8 | TOKEN_B16 | TOKEN_B32 | TOKEN_B64 + | TOKEN_F32 | TOKEN_F64; + +dataType: dataTypeId { state.dataTypeId($1); } + +anytoken: + TOKEN_ALIGN +| TOKEN_PARAM +| dataTypeId +| TOKEN_STRING | TOKEN_FLOAT | TOKEN_INT +| TOKEN_FUNC | TOKEN_ENTRY +| TOKEN_GLOBAL +| '[' +| ']' +| '(' +| ')' +| ',' +| ';' +| '=' +; + +ptxbody: + ptxbody visibleFunctionDeclaration | visibleFunctionDeclaration + | ptxbody visibleEntryDeclaration| visibleEntryDeclaration + | ptxbody visibleInitializableDeclaration| visibleInitializableDeclaration + | ptxbody anytoken | anytoken; + + + +arrayDimensionSet : '[' TOKEN_INT ']' { $$ = $2; state.arrayDimensions($2); } +// arrayDimensionSet : arrayDimensionSet '[' TOKEN_INT ']' { $$ = $2; } +// arrayDimensionSet : '[' ']' { $$ = 0; } +arrayDimensions : /* empty string */; +arrayDimensions : arrayDimensionSet; + +identifier: TOKEN_STRING { strcpy($$, $1); state.identifier($1); } +parameter : TOKEN_PARAM; + +alignment : TOKEN_ALIGN TOKEN_INT {$$ = $2; state.alignment($2);} +addressableVariablePrefix : dataType { state.alignment(0); } +addressableVariablePrefix : alignment dataType; + +argumentDeclaration : parameter addressableVariablePrefix identifier arrayDimensions +{ + state.argumentDeclaration(@1); +} + + +argumentListBegin : '(' { state.argumentListBegin(@1); }; +argumentListEnd : ')' {state.argumentListEnd(@1); }; +argumentListBody : argumentDeclaration; +argumentListBody : /* empty string */; +argumentListBody : argumentListBody ',' argumentDeclaration; +argumentList: argumentListBegin argumentListBody argumentListEnd; + +visibleEntryDeclaration: TOKEN_VISIBLE TOKEN_ENTRY identifier argumentList +{ + state.visibleEntryDeclaration($3, @1); +}; + +returnArgumentListBegin : '(' { state.returnArgumentListBegin(@1); } +returnArgumentListEnd : ')' {state.returnArgumentListEnd(@1); } +returnArgumentList : returnArgumentListBegin argumentListBody returnArgumentListEnd; +optionalReturnArgumentList : returnArgumentList | /* empty string */; +visibleFunctionDeclaration: TOKEN_VISIBLE TOKEN_FUNC optionalReturnArgumentList identifier argumentList +{ + state.visibleFunctionDeclaration($4, @1); +}; + +visibleInitializableDeclaration : + TOKEN_VISIBLE TOKEN_GLOBAL addressableVariablePrefix identifier arrayDimensionSet + { state.visibleInitializableDeclaration($4,@1); } +| TOKEN_VISIBLE TOKEN_GLOBAL addressableVariablePrefix identifier ';' + {state.arrayDimensions(0); state.visibleInitializableDeclaration($4,@1); } +| TOKEN_VISIBLE TOKEN_GLOBAL addressableVariablePrefix identifier '=' + {state.arrayDimensions(0); state.visibleInitializableDeclaration($4,@1); } + + +%% + +int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state ) +{ + lexer.yylval = token; + + int tokenValue = lexer.yylexPosition(); + location->first_line = lexer.lineno(); + location->first_column = lexer.column; + +#if 0 + report( " Lexer (" << location->first_line << "," + << location->first_column + << "): " << parser::PTXLexer::toString( tokenValue ) << " \"" + << lexer.YYText() << "\""); +#endif + + return tokenValue; +} + +static std::string toString( YYLTYPE& location, parser::PTXParser& state ) +{ + std::stringstream stream; + stream +#if 0 + << state.fileName +#else + << "ptx " +#endif + << " (" << location.first_line << ", " + << 
location.first_column << "): "; + return stream.str(); +} + +void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state, char const* message ) +{ + std::stringstream stream; + stream << toString( *location, state ) + << " " << message; + fprintf(stderr, "--Parser ERROR-- %s %s \n", toString(*location, state).c_str(), message); + exit(-1); +} + +} diff --git a/ptxtools/runtest_ptxcc.sh b/ptxtools/runtest_ptxcc.sh new file mode 100755 index 00000000..c2133a65 --- /dev/null +++ b/ptxtools/runtest_ptxcc.sh @@ -0,0 +1,58 @@ +#!/bin/sh +# +# Copyright (c) 2014, Evghenii Gaburov +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
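The test script below drives the two tools built above: on the NVVM path it first runs ptxgen to turn the LLVM IR emitted by ispc into PTX, then ptxcc to wrap that PTX into a linkable object, and finally nvcc to link it against the test harness. Condensed from ptxgen.cpp above, the libNVVM sequence behind the ptxgen step looks roughly like the sketch below; error checks and the libdevice lookup are omitted, and the function name and the "module" label are placeholders (the option values mirror ptxgen's defaults):

    #include <nvvm.h>
    #include <string>

    // Rough sketch of what ptxgen does with libNVVM; 'ir' holds the contents of
    // the .ll/.bc module (the real tool adds libdevice to the program first).
    std::string compileToPTX(const std::string &ir) {
        nvvmProgram prog;
        nvvmCreateProgram(&prog);
        nvvmAddModuleToProgram(prog, ir.c_str(), ir.size(), "module");

        const char *options[] = { "-arch=compute_35", "-ftz=0",
                                  "-prec-sqrt=1", "-prec-div=1", "-fma=1" };
        nvvmVerifyProgram(prog, 5, options);     // same options as the compile step
        nvvmCompileProgram(prog, 5, options);

        size_t ptxSize = 0;
        nvvmGetCompiledResultSize(prog, &ptxSize);
        std::string ptx(ptxSize, 0);
        nvvmGetCompiledResult(prog, &ptx[0]);

        nvvmDestroyProgram(&prog);
        return ptx;
    }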
+ +PTXCC=$ISPC_HOME/ptxtools/ptxcc +PTXGEN=$ISPC_HOME/ptxtools/ptxgen +ARGS=${@:2} +TMPDIR=/tmp +fbname=`basename $1` +if [ "$NVVM" == "1" ]; +then +# LLVM32=$HOME/usr/local/llvm/bin-3.2 +# LLVM34=$HOME/usr/local/llvm/bin-3.4 +# LLVMAS=$LLVM34/bin/llvm-as +# LLVMDIS=$LLVM32/bin/llvm-dis +# $($LLVMAS $1 -o $TMPDIR/$fbname.bc) && $($LLVMDIS $TMPDIR/$fbname.bc -o $TMPDIR/$fbname.ll) && $($PTXGEN $TMPDIR/$fbname.ll -o $TMPDIR/$fbname.ptx) && \ + $($PTXGEN $1 -o $TMPDIR/$fbname.ptx) && \ + $($PTXCC $TMPDIR/$fbname.ptx -o $TMPDIR/$fbname.o -Xnvcc="-G") && \ + $(nvcc test_static_nvptx.cpp examples/util/nvcc_helpers.cu examples/util/ispc_malloc.cpp $TMPDIR/$fbname.o -arch=sm_35 -Iexamples/util/ -D_CUDA_ -lcudadevrt $ARGS) && \ + $(/bin/rm -rf $TMPDIR/*$fbname*); +else + $(sed 's/\.b0/\.b32/g' $1 > $TMPDIR/$fbname) && \ + $($PTXCC $TMPDIR/$fbname -o $TMPDIR/$fbname.o -Xnvcc="-G") && \ + $(nvcc test_static_nvptx.cpp examples/util/nvcc_helpers.cu examples/util/ispc_malloc.cpp $TMPDIR/$fbname.o -arch=sm_35 -Iexamples/util/ -D_CUDA_ -lcudadevrt $ARGS) && \ + $(/bin/rm -rf $TMPDIR/*$fbname*); +fi + + + diff --git a/run_tests.py b/run_tests.py index 2b98ee52..c540c020 100755 --- a/run_tests.py +++ b/run_tests.py @@ -214,6 +214,8 @@ def run_test(testname): return (1, 0) else: global is_generic_target + global is_nvptx_target + global is_nvptx_nvvm if is_windows: if is_generic_target: obj_name = "%s.cpp" % os.path.basename(filename) @@ -228,6 +230,13 @@ def run_test(testname): else: if is_generic_target: obj_name = "%s.cpp" % testname + elif is_nvptx_target: + if os.environ.get("NVVM") == "1": + is_nvptx_nvvm = True + obj_name = "%s.ll" % testname + else: + obj_name = "%s.ptx" % testname + is_nvptx_nvvm = False else: obj_name = "%s.o" % testname exe_name = "%s.run" % testname @@ -263,17 +272,47 @@ def run_test(testname): cc_cmd += ' -Wl,-no_pie' if should_fail: cc_cmd += " -DEXPECT_FAILURE" + + if is_nvptx_target: + nvptxcc_exe = "ptxtools/runtest_ptxcc.sh" + nvptxcc_exe_rel = add_prefix(nvptxcc_exe) + cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \ + (nvptxcc_exe_rel, obj_name, match, exe_name) + + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \ + (filename, obj_name, options.arch, options.target) + if (options.target == "knc"): ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \ (filename, obj_name, options.arch, "generic-16") else: ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \ (filename, obj_name, options.arch, options.target) + if options.no_opt: ispc_cmd += " -O0" if is_generic_target: ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file) + + if is_nvptx_target: + filename4ptx = "/tmp/"+os.path.basename(filename)+".parsed.ispc" +# grep_cmd = "grep -v 'export uniform int width' %s > %s " % \ + grep_cmd = "sed 's/export\ uniform\ int\ width/static uniform\ int\ width/g' %s > %s" % \ + (filename, filename4ptx) + if options.verbose: + print "Grepping: %s" % grep_cmd + sp = subprocess.Popen(grep_cmd, shell=True) + sp.communicate() + if is_nvptx_nvvm: + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-llvm --target=%s" % \ + (filename4ptx, obj_name, options.target) + else: + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \ + (filename4ptx, obj_name, options.target) + + + # compile the ispc code, make the executable, and run it... 
(compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd], options.wrapexe + " " + exe_name, \ @@ -309,6 +348,7 @@ def run_tasks_from_queue(queue, queue_ret, queue_error, queue_finish, total_test ispc_exe = glob_var[3] global is_generic_target is_generic_target = glob_var[4] + global is_nvptx_target global run_tests_log run_tests_log = glob_var[5] @@ -551,6 +591,8 @@ def run_tests(options1, args, print_version): if options.target == 'neon': options.arch = 'arm' + if options.target == "nvptx": + options.arch = "nvptx64" # use relative path to not depend on host directory, which may possibly # have white spaces and unicode characters. @@ -580,6 +622,10 @@ def run_tests(options1, args, print_version): is_generic_target = ((options.target.find("generic-") != -1 and options.target != "generic-1" and options.target != "generic-x1") or options.target == "knc") + + global is_nvptx_target + is_nvptx_target = (options.target.find("nvptx") != -1) + if is_generic_target and options.include_file == None: if options.target == "generic-4" or options.target == "generic-x4": error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2) diff --git a/stdlib.ispc b/stdlib.ispc index d2111d72..01aae815 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -57,6 +57,29 @@ #error Unknown value of ISPC_MASK_BITS #endif +/////////////////////////////////////////////////////////////////////////// +// CUDA Specific primitives +// +/***************/ + +__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); } +__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); } +__declspec(safe,cost0) static inline uniform int __warpIndex() { return __warp_index(); } + +/***************/ + +__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); } +__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); } +__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); } +__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); } + +/***************/ + +__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); } +__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); } +__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); } +__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); } + /* Limits of integral types. 
*/ #ifndef INT8_MAX #define INT8_MAX (127) @@ -94,6 +117,7 @@ #ifndef INT64_MIN #define INT64_MIN (-INT64_MAX - 1) #endif + /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -501,7 +525,10 @@ __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes #if (ISPC_MASK_BITS == 1) - return __popcnt_int64(__movmsk(v & __mask)); + if (__is_nvptx_target) + return __popcnt_int64(__movmsk_ptx(v & __mask)); + else + return __popcnt_int64(__movmsk(v & __mask)); #else return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif @@ -1239,6 +1266,11 @@ packed_store_active(uniform int a[], int vals) { return __packed_store_active(a, vals, (IntMaskType)__mask); } +static inline uniform int +packed_store_active(bool active, uniform int a[], int vals) { + return __packed_store_active(a, vals, (IntMaskType)(-(int)active)); +} + static inline uniform int packed_store_active2(uniform int a[], int vals) { return __packed_store_active2(a, vals, (IntMaskType)__mask); @@ -1249,6 +1281,9 @@ packed_store_active2(uniform int a[], int vals) { // System information static inline uniform int num_cores() { + if (__is_nvptx_target) + return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx) + else return __num_cores(); } @@ -1796,7 +1831,7 @@ static inline void memory_barrier() { __memory_barrier(); } -#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \ +#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \ return ret; \ @@ -1807,6 +1842,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ return ret; \ } \ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1817,10 +1856,15 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } \ -#define DEFINE_ATOMIC_SWAP(TA,TB) \ +#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC) \ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform int i = 0; \ TA ret[programCount]; \ TA memVal; \ @@ -1851,6 +1895,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ originally got back from memory... 
*/ \ ret[lastSwap] = memVal; \ return ret[programIndex]; \ + }\ } \ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ uniform TA value) { \ @@ -1858,6 +1903,10 @@ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ return ret; \ } \ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1868,9 +1917,10 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ ret = insert(ret, i, r); \ } \ return ret; \ + }\ } \ -#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \ +#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ uniform TA oneval = reduce_##OPA(value); \ TA ret; \ @@ -1885,6 +1935,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ } \ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1895,57 +1949,58 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } -DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max) -DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType) -DEFINE_ATOMIC_SWAP(int32,int32) +DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64) +DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax) -DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType) -DEFINE_ATOMIC_SWAP(unsigned int32,int32) +DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64) -DEFINE_ATOMIC_SWAP(float,float) +DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64) -DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max) -DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType) -DEFINE_ATOMIC_SWAP(int64,int64) +DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64) +DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax) -DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType) -DEFINE_ATOMIC_SWAP(unsigned int64,int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64) -DEFINE_ATOMIC_SWAP(double,double) +DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64) #undef DEFINE_ATOMIC_OP #undef DEFINE_ATOMIC_MINMAX_OP #undef DEFINE_ATOMIC_SWAP -#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \ +#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC) \ static inline uniform TA atomic_compare_exchange_global( \ uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \ uniform TA ret = \ @@ -1960,6 +2015,10 @@ static inline TA atomic_compare_exchange_global( \ } \ static inline TA atomic_compare_exchange_global( \ uniform TA * varying ptr, TA oldval, TA newval) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1971,14 +2030,15 @@ static inline TA atomic_compare_exchange_global( \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } -ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) -ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) -ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) -ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) +ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64) +ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64) +ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64) #undef ATOMIC_DECL_CMPXCHG @@ -2045,12 +2105,20 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) } \ static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \ TYPE ret; \ + if (__is_nvptx_target) { \ + foreach_active (i) { \ + uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \ + ret = insert(ret, i, *ptr); \ + *ptr = OPFUNC(*ptr, extract(value, i)); \ + } \ + } else { \ uniform TYPE * uniform ptrs[programCount]; \ ptrs[programIndex] = p; \ foreach_active (i) { \ ret = insert(ret, i, *ptrs[i]); \ *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \ } \ + } \ return ret; \ } diff --git a/stmt.cpp b/stmt.cpp index ac22cff8..586cb0fe 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -142,6 +142,64 @@ lHasUnsizedArrays(const Type *type) { 
return lHasUnsizedArrays(at->GetElementType()); } +#ifdef ISPC_NVPTX_ENABLED +static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos, const bool variable = false) +{ + if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) + return value; + llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType()); + const int addressSpace = pt->getAddressSpace(); + if (addressSpace != 3 && addressSpace != 4) + return value; + + llvm::Type *elTy = pt->getElementType(); + + /* convert elTy addrspace(3)* to i64* addrspace(3)* */ + llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace); + value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1"); + + /* convert i64* addrspace(3) to i64* */ + llvm::Function *__cvt2gen = m->module->getFunction( + addressSpace == 3 ? (variable ? "__cvt_loc2gen_var" : "__cvt_loc2gen") : "__cvt_const2gen"); + + std::vector<llvm::Value*> __cvt2gen_args; + __cvt2gen_args.push_back(value); + value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, variable ? "gep2gen_cvt_var" : "gep2gen_cvt", ctx->GetCurrentBasicBlock()); + + /* compute offset */ + if (addressSpace == 3) + { + assert(elTy->isArrayTy()); + const int numElTot = elTy->getArrayNumElements(); + const int numEl = numElTot/4; +#if 0 + fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl); +#endif + llvm::ArrayType *arrTy = llvm::dyn_cast<llvm::ArrayType>(pt->getArrayElementType()); + assert(arrTy != NULL); + llvm::Type *arrElTy = arrTy->getElementType(); +#if 0 + if (arrElTy->isArrayTy()) + Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array"); +#endif + + /* convert i64* to arrElTy* */ + llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0); + value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2"); + + llvm::Function *func_warp_index = m->module->getFunction("__warp_index"); + llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector<llvm::Value*>(), "gep2gen_warp_index"); + llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset"); + value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock()); + } + + /* convert arrElTy* to elTy* */ + llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0); + value = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3"); + + return value; +} +#endif /* ISPC_NVPTX_ENABLED */ void DeclStmt::EmitCode(FunctionEmitContext *ctx) const { @@ -206,6 +264,23 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { } if (sym->storageClass == SC_STATIC) { +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX && !sym->type->IsConstType()) + { + Error(sym->pos, + "Non-constant static variable ""\"%s\" is not supported with ""\"nvptx\" target.", + sym->name.c_str()); + return; + } + if (g->target->getISA() == Target::NVPTX && sym->type->IsVaryingType()) + PerformanceWarning(sym->pos, + "\"const static varying\" variable ""\"%s\" is stored in __global address space with ""\"nvptx\" target.", + sym->name.c_str()); + if (g->target->getISA() == Target::NVPTX && sym->type->IsUniformType()) + PerformanceWarning(sym->pos, + "\"const static uniform\" variable ""\"%s\" is stored in __constant address space with ""\"nvptx\" target.", + sym->name.c_str()); +#endif /* ISPC_NVPTX_ENABLED */ // For static variables, we need a compile-time constant value // for its initializer; if there's no initializer, we use a // zero value.
@@ -235,6 +310,24 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { // Allocate space for the static variable in global scope, so // that it persists across function calls +#ifdef ISPC_NVPTX_ENABLED + int addressSpace = 0; + if (g->target->getISA() == Target::NVPTX && + sym->type->IsConstType() && + sym->type->IsUniformType()) + addressSpace = 4; + sym->storagePtr = + new llvm::GlobalVariable(*m->module, llvmType, + sym->type->IsConstType(), + llvm::GlobalValue::InternalLinkage, cinit, + llvm::Twine("static.") + + llvm::Twine(sym->pos.first_line) + + llvm::Twine(".") + sym->name.c_str(), + NULL, + llvm::GlobalVariable::NotThreadLocal, + addressSpace); + sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos); +#else /* ISPC_NVPTX_ENABLED */ sym->storagePtr = new llvm::GlobalVariable(*m->module, llvmType, sym->type->IsConstType(), @@ -242,16 +335,90 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { llvm::Twine("static.") + llvm::Twine(sym->pos.first_line) + llvm::Twine(".") + sym->name.c_str()); +#endif /* ISPC_NVPTX_ENABLED */ // Tell the FunctionEmitContext about the variable ctx->EmitVariableDebugInfo(sym); } - else { +#ifdef ISPC_NVPTX_ENABLED + else if ((sym->type->IsUniformType() || sym->type->IsSOAType()) && + /* NVPTX: + * only non-constant uniform data types are stored in shared memory + * constant uniform are automatically promoted to varying + */ + !sym->type->IsConstType() && +#if 1 + sym->type->IsArrayType() && +#endif + g->target->getISA() == Target::NVPTX) + { + PerformanceWarning(sym->pos, + "Non-constant \"uniform\" data types might be slow with \"nvptx\" target. " + "Unless data sharing between program instances is desired, try \"const [static] uniform\", \"varying\" or \"uniform new uniform \"+\"delete\" if possible."); + + /* with __shared__ memory everything must be an array */ + int nel = 4; + ArrayType *nat; + bool variable = true; + if (sym->type->IsArrayType()) + { + const ArrayType *at = CastType(sym->type); + /* we must scale # elements by 4, because a thread-block will run 4 warps + * or 128 threads. + * ***note-to-me***:please define these value (128threads/4warps) + * in nvptx-target definition + * instead of compile-time constants + */ + nel *= at->GetElementCount(); + if (sym->type->IsSOAType()) + nel *= sym->type->GetSOAWidth(); + nat = new ArrayType(at->GetElementType(), nel); + variable = false; + } + else + nat = new ArrayType(sym->type, nel); + + llvm::Type *llvmTypeUn = nat->LLVMType(g->ctx); + llvm::Constant *cinit = llvm::UndefValue::get(llvmTypeUn); + + sym->storagePtr = + new llvm::GlobalVariable(*m->module, llvmTypeUn, + sym->type->IsConstType(), + llvm::GlobalValue::InternalLinkage, + cinit, + llvm::Twine("local_") + + llvm::Twine(sym->pos.first_line) + + llvm::Twine("_") + sym->name.c_str(), + NULL, + llvm::GlobalVariable::NotThreadLocal, + /*AddressSpace=*/3); + sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos, variable); + llvm::PointerType *ptrTy = llvm::PointerType::get(sym->type->LLVMType(g->ctx),0); + sym->storagePtr = ctx->BitCastInst(sym->storagePtr, ptrTy, "uniform_decl"); + + // Tell the FunctionEmitContext about the variable; must do + // this before the initializer stuff. + ctx->EmitVariableDebugInfo(sym); + + if (initExpr == 0 && sym->type->IsConstType()) + Error(sym->pos, "Missing initializer for const variable " + "\"%s\".", sym->name.c_str()); + + // And then get it initialized... 
+ sym->parentFunction = ctx->GetFunction(); + InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos); + } +#endif /* ISPC_NVPTX_ENABLED */ + else + { // For non-static variables, allocate storage on the stack sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str()); // Tell the FunctionEmitContext about the variable; must do // this before the initializer stuff. ctx->EmitVariableDebugInfo(sym); + if (initExpr == 0 && sym->type->IsConstType()) + Error(sym->pos, "Missing initializer for const variable " + "\"%s\".", sym->name.c_str()); // And then get it initialized... sym->parentFunction = ctx->GetFunction(); @@ -415,6 +582,19 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const { if (testValue == NULL) return; +#ifdef ISPC_NVPTX_ENABLED +#if 0 + if (!isUniform && g->target->getISA() == Target::NVPTX) + { + /* With "nvptx" target, SIMT hardware takes care of non-uniform + * control flow. We trick ISPC to generate uniform control flow. + */ + testValue = ctx->ExtractInst(testValue, 0); + isUniform = true; + } +#endif +#endif /* ISPC_NVPTX_ENABLED */ + if (isUniform) { ctx->StartUniformIf(); if (doAllCheck) @@ -695,7 +875,17 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, // Do any of the program instances want to run the 'true' // block? If not, jump ahead to bNext. + +#ifdef ISPC_NVPTX_ENABLED +#if 0 + llvm::Value *maskAnyTrueQ = ctx->ExtractInst(ctx->GetFullMask(),0); +#else llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask()); +#endif +#else /* ISPC_NVPTX_ENABLED */ + llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask()); +#endif /* ISPC_NVPTX_ENABLED */ + ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ); // Emit statements for true @@ -712,7 +902,16 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, // Similarly, check to see if any of the instances want to // run the 'false' block... + +#ifdef ISPC_NVPTX_ENABLED +#if 0 + llvm::Value *maskAnyFalseQ = ctx->ExtractInst(ctx->GetFullMask(),0); +#else llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask()); +#endif +#else /* ISPC_NVPTX_ENABLED */ + llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask()); +#endif /* ISPC_NVPTX_ENABLED */ ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ); // Emit code for false @@ -1277,6 +1476,95 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, llvm::Value *uniformCounterPtr, llvm::Value *varyingCounterPtr, const std::vector &spans) { +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + // Smear the uniform counter value out to be varying + llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); + llvm::Value *smearCounter = ctx->BroadcastValue( + counter, LLVMTypes::Int32VectorType, "smear_counter"); + + // Figure out the offsets; this is a little bit tricky. As an example, + // consider a 2D tiled foreach loop, where we're running 8-wide and + // where the inner dimension has a stride of 4 and the outer dimension + // has a stride of 2. For the inner dimension, we want the offsets + // (0,1,2,3,0,1,2,3), and for the outer dimension we want + // (0,0,0,0,1,1,1,1). + int32_t delta[ISPC_MAX_NVEC]; + const int vecWidth = 32; + std::vector constDeltaList; + for (int i = 0; i < vecWidth; ++i) + { + int d = i; + // First, account for the effect of any dimensions at deeper + // nesting levels than the current one. 
+ int prevDimSpanCount = 1; + for (int j = dim; j < nDims-1; ++j) + prevDimSpanCount *= spans[j+1]; + d /= prevDimSpanCount; + + // And now with what's left, figure out our own offset + delta[i] = d % spans[dim]; + constDeltaList.push_back(LLVMInt8(delta[i])); + } + + llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32); + // llvm::PointerType::get(ArrayDelta, 4); /* constant memory */ + + + llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable( + /*Module=*/*m->module, + /*Type=*/ArrayDelta, + /*isConstant=*/true, + /*Linkage=*/llvm::GlobalValue::PrivateLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/"constDeltaForeach"); +#if 0 + /*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal, + /*unsigned AddressSpace=*/4 /*constant*/); +#endif + + + llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList); + + globalDelta->setInitializer(constDelta); + llvm::Function *func_program_index = m->module->getFunction("__program_index"); + llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector(), "foreach__programIndex"); + + std::vector ptr_arrayidx_indices; + ptr_arrayidx_indices.push_back(LLVMInt32(0)); + ptr_arrayidx_indices.push_back(laneIdx); +#if 1 + llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock()); + llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock()); + llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type); + + llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1); + llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2); + + llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create( + // llvm::UndefValue(LLVMInt32Vector), + const_packed_41, + int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock()); +#endif + + + // Add the deltas to compute the varying counter values; store the + // result to memory and then return it directly as well. +#if 0 + llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + LLVMInt32Vector(delta), "iter_val"); +#else + llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + packed_43, "iter_val"); +#endif + ctx->StoreInst(varyingCounter, varyingCounterPtr); + return varyingCounter; + } +#endif /* ISPC_NVPTX_ENABLED */ + // Smear the uniform counter value out to be varying llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); llvm::Value *smearCounter = ctx->BroadcastValue( @@ -1397,7 +1685,13 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { std::vector nExtras, alignedEnd, extrasMaskPtrs; std::vector span(nDims, 0); +#ifdef ISPC_NVPTX_ENABLED + const int vectorWidth = + g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth(); + lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); +#else /* ISPC_NVPTX_ENABLED */ lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]); +#endif /* ISPC_NVPTX_ENABLED */ for (int i = 0; i < nDims; ++i) { // Basic blocks that we'll fill in later with the looping logic for @@ -1996,7 +2290,12 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const { // math...) // Get the "program index" vector value +#ifdef ISPC_NVPTX_ENABLED + llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ? 
+ ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector(); +#else /* ISPC_NVPTX_ENABLED */ llvm::Value *programIndex = ctx->ProgramIndexVector(); +#endif /* ISPC_NVPTX_ENABLED */ // And smear the current lane out to a vector llvm::Value *firstSet32 = @@ -2192,11 +2491,23 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const { // And load the corresponding element value from the temporary // memory storing the value of the varying expr. - llvm::Value *uniqueValuePtr = + llvm::Value *uniqueValue; +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + llvm::Value *firstSet32 = ctx->TruncInst(firstSet, LLVMTypes::Int32Type); + uniqueValue = ctx->Extract(exprValue, firstSet32); + } + else + { +#endif /* ISPC_NVPTX_ENABLED */ + llvm::Value *uniqueValuePtr = ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType, - "unique_index_ptr"); - llvm::Value *uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value"); - + "unique_index_ptr"); + uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value"); +#ifdef ISPC_NVPTX_ENABLED + } +#endif /* ISPC_NVPTX_ENABLED */ // If it's a varying pointer type, need to convert from the int // type we store in the vector to the actual pointer type if (llvm::dyn_cast(symType) != NULL) @@ -3103,7 +3414,12 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const { } // Now we can emit code to call __do_print() +#ifdef ISPC_NVPTX_ENABLED + llvm::Function *printFunc = g->target->getISA() != Target::NVPTX ? + m->module->getFunction("__do_print") : m->module->getFunction("__do_print_nvptx"); +#else /* ISPC_NVPTX_ENABLED */ llvm::Function *printFunc = m->module->getFunction("__do_print"); +#endif /* ISPC_NVPTX_ENABLED */ AssertPos(pos, printFunc); llvm::Value *mask = ctx->GetFullMask(); diff --git a/test_static.cpp b/test_static.cpp index c27e2741..412115d0 100644 --- a/test_static.cpp +++ b/test_static.cpp @@ -66,7 +66,7 @@ extern "C" { void ISPCSync(void *handle); void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); } - + void ISPCLaunch(void **handle, void *f, void *d, int count0, int count1, int count2) { *handle = (void *)0xdeadbeef; typedef void (*TaskFuncType)(void *, int, int, int, int, int, int, int, int, int, int); diff --git a/test_static_cuda.cpp b/test_static_cuda.cpp new file mode 100644 index 00000000..29b9bddd --- /dev/null +++ b/test_static_cuda.cpp @@ -0,0 +1,440 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_WINDOWS +#include +#endif // ISPC_IS_WINDOWS + +#include +#include +#include +#include +#ifdef ISPC_IS_LINUX +#include +#endif + +/******************************/ + +#include +#include +#include +#include "drvapi_error_string.h" +#include "ispc_malloc.h" + +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) +// These are the inline versions for all of the SDK helper functions +void __checkCudaErrors(CUresult err, const char *file, const int line) { + if(CUDA_SUCCESS != err) { + std::cerr << "checkCudaErrors() Driver API error = " << err << "\"" + << getCudaDrvErrorString(err) << "\" from file <" << file + << ", line " << line << "\n"; + exit(-1); + } +} + + +/******************************/ +/**** Basic CUDriver API ****/ +/******************************/ + +CUcontext context; + +static void createContext(const int deviceId = 0, const bool verbose = true) +{ + CUdevice device; + int devCount; + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&devCount)); + assert(devCount > 0); + checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + if (verbose) + std::cout << "Using CUDA Device [0]: " << name << "\n"; + + int devMajor, devMinor; + checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); + if (verbose) + std::cout << "Device Compute Capability: " + << devMajor << "."
<< devMinor << "\n"; + if (devMajor < 2) { + if (verbose) + std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; + exit(1); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); +} +static void destroyContext() +{ + checkCudaErrors(cuCtxDestroy(context)); +} + +static CUmodule loadModule( + const char * module, + const int maxrregcount = 64, + const char cudadevrt_lib[] = "libcudadevrt.a", + const size_t log_size = 32768, + const bool print_log = true + ) +{ + CUmodule cudaModule; + // in this branch we use compilation with parameters + + CUlinkState CUState; + CUlinkState *lState = &CUState; + const int nOptions = 8; + CUjit_option options[nOptions]; + void* optionVals[nOptions]; + float walltime; + size_t logSize = log_size; + char error_log[logSize], + info_log[logSize]; + void *cuOut; + size_t outSize; + int myErr = 0; + + // Setup linker options + // Return walltime from JIT compilation + options[0] = CU_JIT_WALL_TIME; + optionVals[0] = (void*) &walltime; + // Pass a buffer for info messages + options[1] = CU_JIT_INFO_LOG_BUFFER; + optionVals[1] = (void*) info_log; + // Pass the size of the info buffer + options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + optionVals[2] = (void*) logSize; + // Pass a buffer for error message + options[3] = CU_JIT_ERROR_LOG_BUFFER; + optionVals[3] = (void*) error_log; + // Pass the size of the error buffer + options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + optionVals[4] = (void*) logSize; + // Make the linker verbose + options[5] = CU_JIT_LOG_VERBOSE; + optionVals[5] = (void*) 1; + // Max # of registers/pthread + options[6] = CU_JIT_MAX_REGISTERS; + int jitRegCount = maxrregcount; + optionVals[6] = (void *)(size_t)jitRegCount; + // Caching + options[7] = CU_JIT_CACHE_MODE; + optionVals[7] = (void *)CU_JIT_CACHE_OPTION_CA; + // Create a pending linker invocation + + // Create a pending linker invocation + checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState)); + +#if 0 + if (sizeof(void *)==4) + { + // Load the PTX from the string myPtx32 + printf("Loading myPtx32[] program\n"); + // PTX May also be loaded from file, as per below. + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0); + } + else +#endif + { + // Load the PTX from the string myPtx (64-bit) + if (print_log) + fprintf(stderr, "Loading ptx..\n"); + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0); + myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0); + // PTX May also be loaded from file, as per below. + // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0); + } + + // Complete the linker step + myErr = cuLinkComplete(*lState, &cuOut, &outSize); + + if ( myErr != CUDA_SUCCESS ) + { + // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. + fprintf(stderr,"PTX Linker Error:\n%s\n",error_log); + assert(0); + } + + // Linker walltime and info_log were requested in options above. + if (print_log) + fprintf(stderr, "CUDA Link Completed in %fms. 
Linker Output:\n%s\n",walltime,info_log); + + // Load resulting cuBin into module + checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut)); + + // Destroy the linker invocation + checkCudaErrors(cuLinkDestroy(*lState)); + return cudaModule; +} +static void unloadModule(CUmodule &cudaModule) +{ + checkCudaErrors(cuModuleUnload(cudaModule)); +} + +static CUfunction getFunction(CUmodule &cudaModule, const char * function) +{ + CUfunction cudaFunction; + checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function)); + return cudaFunction; +} + +static CUdeviceptr deviceMalloc(const size_t size) +{ + CUdeviceptr d_buf; + checkCudaErrors(cuMemAlloc(&d_buf, size)); + return d_buf; +} +static void deviceFree(CUdeviceptr d_buf) +{ + checkCudaErrors(cuMemFree(d_buf)); +} +static void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size)); +} +static void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); +} +#define deviceLaunch(func,params) \ + checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \ +checkCudaErrors( \ + cuLaunchKernel( \ + (func), \ + 1,1,1, \ + 32, 1, 1, \ + 0, NULL, (params), NULL \ + )); + +typedef CUdeviceptr devicePtr; + + +/**************/ +#include +static std::vector readBinary(const char * filename, const bool print_size = false) +{ + std::vector buffer; + FILE *fp = fopen(filename, "rb"); + if (!fp ) + { + fprintf(stderr, "file %s not found\n", filename); + assert(0); + } + fseek(fp, 0, SEEK_END); + const unsigned long long size = ftell(fp); /*calc the size needed*/ + fseek(fp, 0, SEEK_SET); + buffer.resize(size); + + if (fp == NULL){ /*ERROR detection if file == empty*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); + exit(1); + } + else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); + exit(1); + } + if (print_size) + fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); + return buffer; +} + +static double CUDALaunch( + void **handlePtr, + const char * func_name, + void **func_args, + const bool print_log = true, + const int maxrregcount = 64, + const char kernel_file[] = "__kernels.ptx", + const char cudadevrt_lib[] = "libcudadevrt.a", + const int log_size = 32768) +{ + fprintf(stderr, " launching kernel: %s \n", func_name); + const std::vector module_str = readBinary(kernel_file, print_log); + const char * module = &module_str[0]; + CUmodule cudaModule = loadModule(module, maxrregcount, cudadevrt_lib, log_size, print_log); + CUfunction cudaFunction = getFunction(cudaModule, func_name); + deviceLaunch(cudaFunction, func_args); + checkCudaErrors(cuStreamSynchronize(0)); + unloadModule(cudaModule); + return 0.0; +} +/******************************/ + + +extern "C" { +// extern int width(); + int width() { return 32; } + extern void f_v(float *result); + extern void f_f(float *result, float *a); + extern void f_fu(float *result, float *a, float b); + extern void f_fi(float *result, float *a, int *b); + extern void f_du(float *result, double *a, double b); + extern void f_duf(float *result, double *a, float b); + extern void f_di(float *result, double *a, int *b); + extern void result(float *val); +} + + +#if defined(_WIN32) || defined(_WIN64) +#define ALIGN +#else +#define ALIGN __attribute__((aligned(64))) 
+#endif + +int main(int argc, char *argv[]) { + int w = width(); + assert(w <= 64); + + float returned_result[64] ALIGN; + float vfloat[64] ALIGN; + double vdouble[64] ALIGN; + int vint[64] ALIGN; + int vint2[64] ALIGN; + + const int device = 0; +#if 0 + const bool verbose = true; +#else + const bool verbose = false; +#endif + + /*******************/ + createContext(device, verbose); + /*******************/ + + devicePtr d_returned_result = deviceMalloc(64*sizeof(float)); + devicePtr d_vfloat = deviceMalloc(64*sizeof(float)); + devicePtr d_vdouble = deviceMalloc(64*sizeof(double)); + devicePtr d_vint = deviceMalloc(64*sizeof(int)); + devicePtr d_vint2 = deviceMalloc(64*sizeof(int)); + + + for (int i = 0; i < 64; ++i) { + returned_result[i] = -1e20; + vfloat[i] = i+1; + vdouble[i] = i+1; + vint[i] = 2*(i+1); + vint2[i] = i+5; + } + + memcpyH2D(d_returned_result, returned_result, 64*sizeof(float)); + memcpyH2D(d_vfloat , vfloat, 64*sizeof(float)); + memcpyH2D(d_vdouble , vdouble, 64*sizeof(double)); + memcpyH2D(d_vint , vint, 64*sizeof(int)); + memcpyH2D(d_vint2 , vint2, 64*sizeof(int)); + + + float b = 5.; + + const bool print_log = false; + const int nreg = 64; +#if (TEST_SIG == 0) + void *args[] = {&d_returned_result}; + CUDALaunch(NULL, "f_v", args, print_log, nreg); +#elif (TEST_SIG == 1) + void *args[] = {&d_returned_result, &d_vfloat}; + CUDALaunch(NULL, "f_f", args, print_log, nreg); +#elif (TEST_SIG == 2) + void *args[] = {&d_returned_result, &d_vfloat, &b}; + CUDALaunch(NULL, "f_fu", args, print_log, nreg); +#elif (TEST_SIG == 3) + void *args[] = {&d_returned_result, &d_vfloat, &d_vint}; + CUDALaunch(NULL, "f_fi", args, print_log, nreg); +#elif (TEST_SIG == 4) + double num = 5.; + void *args[] = {&d_returned_result, &d_vdouble, &num}; + CUDALaunch(NULL, "f_du", args, print_log, nreg); +#elif (TEST_SIG == 5) + float num = 5.0f; + void *args[] = {&d_returned_result, &d_vdouble, &num}; + CUDALaunch(NULL, "f_duf", args, print_log, nreg); +#elif (TEST_SIG == 6) + void *args[] = {&d_returned_result, &d_vdouble, &d_vint2}; + CUDALaunch(NULL, "f_di", args, print_log, nreg); +#else +#error "Unknown or unset TEST_SIG value" +#endif + + float expected_result[64]; + + memset(expected_result, 0, 64*sizeof(float)); + devicePtr d_expected_result = deviceMalloc(64*sizeof(float)); + memcpyH2D(d_expected_result, expected_result, 64*sizeof(float)); + void *res_args[] = {&d_expected_result}; + CUDALaunch(NULL, "result", res_args, print_log, nreg); + memcpyD2H(expected_result, d_expected_result, 64*sizeof(float)); + memcpyD2H(returned_result, d_returned_result, 64*sizeof(float)); + + deviceFree(d_returned_result); + deviceFree(d_vfloat); + deviceFree(d_vdouble); + deviceFree(d_vint); + deviceFree(d_vint2); + deviceFree(d_expected_result); + + /*******************/ + destroyContext(); + /*******************/ + + int errors = 0; + for (int i = 0; i < w; ++i) { + if (returned_result[i] != expected_result[i]) { +#ifdef EXPECT_FAILURE + // bingo, failed + return 1; +#else + printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n", + argv[0], i, returned_result[i], returned_result[i], + expected_result[i], expected_result[i]); + ++errors; +#endif // EXPECT_FAILURE + } + } + +#ifdef EXPECT_FAILURE + // Don't expect to get here + return 0; +#else + return errors > 0; +#endif +} diff --git a/test_static_nvptx.cpp b/test_static_nvptx.cpp new file mode 100644 index 00000000..5a6d1399 --- /dev/null +++ b/test_static_nvptx.cpp @@ -0,0 +1,133 @@ +/* + Copyright (c) 2010-2014, Intel Corporation + All rights
reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_WINDOWS +#include +#endif // ISPC_IS_WINDOWS + +#include +#include +#include +#include +#ifdef ISPC_IS_LINUX +#include +#endif + +#include "ispc_malloc.h" + +#define N 32 +extern "C" { + int width() { return N; } + extern void f_v(float *result); + extern void f_f(float *result, float *a); + extern void f_fu(float *result, float *a, float b); + extern void f_fi(float *result, float *a, int *b); + extern void f_du(float *result, double *a, double b); + extern void f_duf(float *result, double *a, float b); + extern void f_di(float *result, double *a, int *b); + extern void result(float *val); +} + +int main(int argc, char *argv[]) { + int w = width(); + assert(w <= N); + + float *returned_result = new float[N*4]; + float *vfloat = new float[N*4]; + double *vdouble = new double[N*4]; + int *vint = new int[N*4]; + int *vint2 = new int[N*4]; + + for (int i = 0; i < N*4; ++i) { + returned_result[i] = -1e20; + vfloat[i] = i+1; + vdouble[i] = i+1; + vint[i] = 2*(i+1); + vint2[i] = i+5; + } + + float b = 5.; + +#if (TEST_SIG == 0) + f_v(returned_result); +#elif (TEST_SIG == 1) + f_f(returned_result, vfloat); +#elif (TEST_SIG == 2) + f_fu(returned_result, vfloat, b); +#elif (TEST_SIG == 3) + f_fi(returned_result, vfloat, vint); +#elif (TEST_SIG == 4) + f_du(returned_result, vdouble, 5.); +#elif (TEST_SIG == 5) + f_duf(returned_result, vdouble, 5.f); +#elif (TEST_SIG == 6) + f_di(returned_result, vdouble, vint2); +#else +#error "Unknown or unset TEST_SIG value" +#endif + + float *expected_result = new float[N]; + memset(expected_result, 0, N*sizeof(float)); + result(expected_result); + + int errors = 0; + for (int i = 0; i < w; ++i) { + if (returned_result[i] != expected_result[i]) + { +#ifdef EXPECT_FAILURE + // bingo, failed + return 1; +#else + printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n", + argv[0], i, returned_result[i], 
returned_result[i], + expected_result[i], expected_result[i]); + ++errors; +#endif // EXPECT_FAILURE + } + } + +#ifdef EXPECT_FAILURE + // Don't expect to get here + return 0; +#else + return errors > 0; +#endif +} diff --git a/tests/array-mixed-unif-vary-indexing-3.ispc b/tests/array-mixed-unif-vary-indexing-3.ispc index ab3a7a7c..c6623cf6 100644 --- a/tests/array-mixed-unif-vary-indexing-3.ispc +++ b/tests/array-mixed-unif-vary-indexing-3.ispc @@ -5,7 +5,13 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; assert(programCount <= 64); +#ifdef __NVPTX__ + uniform float * uniform xarr = uniform new uniform float[70*70]; + uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr; +#define _SHMALLOC +#else uniform float x[70][70]; +#endif for (uniform int i = 0; i < 70; ++i) for (uniform int j = 0; j < 70; ++j) x[i][j] = 2+b-5; @@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { else x[b-1][a-1] = 1; RET[programIndex] = x[4][a]; + +#ifdef _SHMALLOC + delete xarr; +#endif } export void result(uniform float RET[]) { diff --git a/tests/broadcast.ispc b/tests/broadcast.ispc index 1df835ae..6dfa1a00 100644 --- a/tests/broadcast.ispc +++ b/tests/broadcast.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = (programCount == 1) ? 3 : broadcast(a, 2); + float b = (programCount == 1) ? 4 : broadcast(a, 2); RET[programIndex] = b; } diff --git a/tests/c-test-64.ispc b/tests/c-test-64.ispc index 3429bf91..d2602bc7 100644 --- a/tests/c-test-64.ispc +++ b/tests/c-test-64.ispc @@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 2; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 5; - RET[3] = RET[7] = RET[11] = RET[15] = 6; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 2; + RET[i+1] = 3; + RET[i+2] = 5; + RET[i+3] = 6; + } } diff --git a/tests/c-test-65.ispc b/tests/c-test-65.ispc index 9a363864..15df6367 100644 --- a/tests/c-test-65.ispc +++ b/tests/c-test-65.ispc @@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) { export void result(uniform float RET[]) { RET[programIndex] = 3; - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+3] = 29; + } } diff --git a/tests/c-test-66.ispc b/tests/c-test-66.ispc index a6c35dc7..22511604 100644 --- a/tests/c-test-66.ispc +++ b/tests/c-test-66.ispc @@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { RET[programIndex] = 32; - RET[2] = RET[6] = RET[10] = RET[14] = 38; - RET[3] = RET[7] = RET[11] = RET[15] = 39; + for (int i = 0; i < programCount; i += 4) + { + RET[i+2] = 38; + RET[i+3] = 39; + } } diff --git a/tests/cfor-array-struct-gather.ispc b/tests/cfor-array-struct-gather.ispc index c320ad7c..d433b00d 100644 --- a/tests/cfor-array-struct-gather.ispc +++ b/tests/cfor-array-struct-gather.ispc @@ -4,14 +4,14 @@ export uniform int width() { return programCount; } struct Foo { - uniform float x[17]; + uniform float x[programCount+1]; }; export void f_fu(uniform float RET[], 
uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; uniform Foo foo; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo.x[i] = i; if ((int)a & 1) diff --git a/tests/cfor-gs-double-improve-multidim-1.ispc b/tests/cfor-gs-double-improve-multidim-1.ispc index ed672bd8..62124e2a 100644 --- a/tests/cfor-gs-double-improve-multidim-1.ispc +++ b/tests/cfor-gs-double-improve-multidim-1.ispc @@ -4,9 +4,9 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - uniform double udx[25][25]; - cfor (uniform int i = 0; i < 25; ++i) - cfor (uniform int j = 0; j < 25; ++j) + uniform double udx[programCount+1][programCount+1]; + cfor (uniform int i = 0; i < programCount+1; ++i) + cfor (uniform int j = 0; j < programCount+1; ++j) udx[i][j] = 10*i+j; int x = 1; diff --git a/tests/cfor-gs-improve-multidim-1.ispc b/tests/cfor-gs-improve-multidim-1.ispc index b0893617..32482ced 100644 --- a/tests/cfor-gs-improve-multidim-1.ispc +++ b/tests/cfor-gs-improve-multidim-1.ispc @@ -5,9 +5,9 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - uniform float udx[20][20]; - cfor (uniform int i = 0; i < 20; ++i) - cfor (uniform int j = 0; j < 20; ++j) + uniform float udx[programCount+1][programCount+1]; + cfor (uniform int i = 0; i < programCount+1; ++i) + cfor (uniform int j = 0; j < programCount+1; ++j) udx[i][j] = 100*i+j; int x = 1; diff --git a/tests/cfor-gs-improve-multidim-struct-1.ispc b/tests/cfor-gs-improve-multidim-struct-1.ispc index d599ceb9..0d682f9a 100644 --- a/tests/cfor-gs-improve-multidim-struct-1.ispc +++ b/tests/cfor-gs-improve-multidim-struct-1.ispc @@ -4,19 +4,27 @@ export uniform int width() { return programCount; } struct Foo { - uniform float udx[25][25]; + uniform float udx[32][32]; }; export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; +#ifndef __NVPTX__ uniform Foo f[5]; +#else /* too much shared memory allocated, nvcc fails to link */ + uniform Foo * uniform f = uniform new uniform Foo[5]; +#define _UNMALLOC +#endif cfor (uniform int i = 0; i < 5; ++i) - cfor (uniform int j = 0; j < 25; ++j) - cfor (uniform int k = 0; k < 25; ++k) + cfor (uniform int j = 0; j < 32; ++j) + cfor (uniform int k = 0; k < 32; ++k) f[i].udx[j][k] = 1000*i+100*j+k; int x = 1; RET[programIndex] = f[x+1].udx[b-4][programIndex]; +#ifdef _UNMALLOC + delete f; +#endif } export void result(uniform float RET[]) { RET[programIndex] = 2100 +programIndex; } diff --git a/tests/cfor-struct-gather-2.ispc b/tests/cfor-struct-gather-2.ispc index 7c615139..75da4a3f 100644 --- a/tests/cfor-struct-gather-2.ispc +++ b/tests/cfor-struct-gather-2.ispc @@ -13,9 +13,9 @@ float func(Foo foo[], int offset) { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = func(foo, (int)a); } diff --git a/tests/cfor-struct-gather-3.ispc b/tests/cfor-struct-gather-3.ispc index 7c615139..75da4a3f 100644 --- a/tests/cfor-struct-gather-3.ispc +++ b/tests/cfor-struct-gather-3.ispc @@ -13,9 +13,9 @@ float func(Foo foo[], int offset) { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float 
a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = func(foo, (int)a); } diff --git a/tests/cfor-struct-gather.ispc b/tests/cfor-struct-gather.ispc index 49928a6b..9265da32 100644 --- a/tests/cfor-struct-gather.ispc +++ b/tests/cfor-struct-gather.ispc @@ -9,9 +9,9 @@ struct Foo { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = foo[(int)a].f; } diff --git a/tests/cfor-struct-test-114.ispc b/tests/cfor-struct-test-114.ispc index 0ea2f65a..e7b83a79 100644 --- a/tests/cfor-struct-test-114.ispc +++ b/tests/cfor-struct-test-114.ispc @@ -10,9 +10,9 @@ struct Foo { export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) { float a = aFOO[programIndex]; int b = bFOO[programIndex]; - varying Foo myFoo[17]; + varying Foo myFoo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) { + cfor (i = 0; i < programCount+1; ++i) { myFoo[i].x = i; myFoo[i].f = 2*i; } diff --git a/tests/cfor-test-134.ispc b/tests/cfor-test-134.ispc index 96493dff..0e8af645 100644 --- a/tests/cfor-test-134.ispc +++ b/tests/cfor-test-134.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-135.ispc b/tests/cfor-test-135.ispc index 5926ba30..9f17350e 100644 --- a/tests/cfor-test-135.ispc +++ b/tests/cfor-test-135.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-136.ispc b/tests/cfor-test-136.ispc index 62834f67..e7ac9f75 100644 --- a/tests/cfor-test-136.ispc +++ b/tests/cfor-test-136.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-64.ispc b/tests/cfor-test-64.ispc index 9c51c9b0..eb2cbec0 100644 --- a/tests/cfor-test-64.ispc +++ b/tests/cfor-test-64.ispc @@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 2; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 5; - RET[3] = RET[7] = RET[11] = RET[15] = 6; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 2; + RET[i+1] = 
3;
+        RET[i+2] = 5;
+        RET[i+3] = 6;
+    }
 }
diff --git a/tests/cfor-test-65.ispc b/tests/cfor-test-65.ispc
index a3c11c6d..28f82225 100644
--- a/tests/cfor-test-65.ispc
+++ b/tests/cfor-test-65.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    for (int i = 0; i < programCount; i += 4)
+    {
+        RET[i+0] = 1;
+        RET[i+1] = 3;
+        RET[i+2] = 3;
+        RET[i+3] = 29;
+    }
 }
diff --git a/tests/cfor-test-66.ispc b/tests/cfor-test-66.ispc
index d3698ffe..e53d2b94 100644
--- a/tests/cfor-test-66.ispc
+++ b/tests/cfor-test-66.ispc
@@ -18,8 +18,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 32;
-    RET[1] = RET[5] = RET[9] = RET[13] = 32;
-    RET[2] = RET[6] = RET[10] = RET[14] = 38;
-    RET[3] = RET[7] = RET[11] = RET[15] = 39;
+    for (int i = 0; i < programCount; i += 4)
+    {
+        RET[i+0] = 32;
+        RET[i+1] = 32;
+        RET[i+2] = 38;
+        RET[i+3] = 39;
+    }
 }
diff --git a/tests/cfor-unif-struct-test-114.ispc b/tests/cfor-unif-struct-test-114.ispc
index 114e826d..59649fd0 100644
--- a/tests/cfor-unif-struct-test-114.ispc
+++ b/tests/cfor-unif-struct-test-114.ispc
@@ -8,9 +8,9 @@ struct Foo {
 };
 export void f_fi(uniform float RET[], uniform float a[], uniform int bFOO[]) {
     int b = bFOO[programIndex];
-    uniform struct Foo myFoo[17];
+    uniform struct Foo myFoo[programCount+1];
     uniform int i;
-    cfor (i = 0; i < 17; ++i) {
+    cfor (i = 0; i < programCount+1; ++i) {
         myFoo[i].x = i;
         myFoo[i].f = 2*i;
     }
diff --git a/tests/const-fold-1.ispc b/tests/const-fold-1.ispc
index fc4717ce..95b46cea 100644
--- a/tests/const-fold-1.ispc
+++ b/tests/const-fold-1.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
     uniform int x = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
-    static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
+    const static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
     RET[programIndex] = (x == y) ? 1. : 0.;
 }
diff --git a/tests/const-fold-2.ispc b/tests/const-fold-2.ispc
index 88743d2f..4e0ea5b6 100644
--- a/tests/const-fold-2.ispc
+++ b/tests/const-fold-2.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
     uniform int x = (170 >> 4) % 5;
-    static uniform int y = (170 >> 4) % 5;
+    const static uniform int y = (170 >> 4) % 5;
     RET[programIndex] = (x == y) ? 1. : 0.;
 }
diff --git a/tests/const-fold-3.ispc b/tests/const-fold-3.ispc
index cf5bc915..15c49e92 100644
--- a/tests/const-fold-3.ispc
+++ b/tests/const-fold-3.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
     uniform int x = (17 < 2) || (6 >= 5) && (20 >= 20);
-    static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
+    const static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
     RET[programIndex] = ((x!=0) == (y!=0)) ? 1. : 0.;
 }
diff --git a/tests/launch-8.ispc b/tests/launch-8.ispc
index eacba673..9855a963 100644
--- a/tests/launch-8.ispc
+++ b/tests/launch-8.ispc
@@ -2,22 +2,23 @@
 export uniform int width() { return programCount; }
 
-#define N0 10
+#define N0 12
 #define N1 20
 #define N2 50
 
 static uniform float array[N2][N1][N0];
 
-task void x(const float f) {
+task void x(const uniform float farray[]) {
+    const float f = farray[programIndex];
     uniform int j;
 
-    assert(taskCount == (int32)N0*N1*N2);
-    assert(taskCount0 == (int32)N0);
-    assert(taskCount1 == (int32)N1);
-    assert(taskCount2 == (int32)N2);
-    assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2));
-    assert(taskIndex0 < (int32)N0);
-    assert(taskIndex1 < (int32)N1);
-    assert(taskIndex2 < (int32)N2);
+    assert(taskCount == (uniform int32)N0*N1*N2);
+    assert(taskCount0 == (uniform int32)N0);
+    assert(taskCount1 == (uniform int32)N1);
+    assert(taskCount2 == (uniform int32)N2);
+    assert(taskIndex == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2));
+    assert(taskIndex0 < (uniform int32)N0);
+    assert(taskIndex1 < (uniform int32)N1);
+    assert(taskIndex2 < (uniform int32)N2);
 
     const uniform int i0 = taskIndex0;
     const uniform int i1 = taskIndex1;
@@ -30,7 +31,7 @@ task void x(const float f) {
     array[i2][i1][i0] = i;
 }
 export void f_f(uniform float RET[], uniform float fFOO[]) {
-    float f = fFOO[programIndex];
+    uniform float * uniform f = fFOO;
     launch[N2][N1][N0] x(f);
     sync;
     RET[programIndex] = array[N2-1][N1-1][N0-1];
@@ -38,5 +39,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) {
 
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 9999.000000;
+    RET[programIndex] = 11999.000000;
 }
diff --git a/tests/launch-9.ispc b/tests/launch-9.ispc
index 1952e8e7..dbbb9f80 100644
--- a/tests/launch-9.ispc
+++ b/tests/launch-9.ispc
@@ -2,12 +2,13 @@
 export uniform int width() { return programCount; }
 
-#define N0 10
+#define N0 12
 #define N1 20
 #define N2 50
 
 static uniform float array[N2][N1][N0];
 
-task void x(const float f) {
+task void x(const uniform float farray[]) {
+    const float f = farray[programIndex];
     uniform int j;
 
     assert(taskCount == (int32)N0*N1*N2);
@@ -30,13 +31,13 @@ task void x(const float f) {
     array[i2][i1][i0] = i;
 }
 export void f_f(uniform float RET[], uniform float fFOO[]) {
-    float f = fFOO[programIndex];
-    launch[N0,N1,N2] x(f);
+    uniform float * uniform f = fFOO;
+    launch[N2][N1][N0] x(f);
     sync;
     RET[programIndex] = array[N2-1][N1-1][N0-1];
 }
 
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 9999.000000;
+    RET[programIndex] = 11999.000000;
 }
diff --git a/tests/operators2.ispc b/tests/operators2.ispc
index b732b24a..daef4ec6 100644
--- a/tests/operators2.ispc
+++ b/tests/operators2.ispc
@@ -1,4 +1,9 @@
+#ifdef __NVPTX__
+uniform int _off[programCount];
+#define off _off[programIndex]
+#else /* global varying data types are not yet supported with the "nvptx" target */
 int off;
+#endif
 
 export uniform int width() { return programCount; }
 
@@ -22,11 +27,11 @@ struct S operator/(struct S rr, struct S rv) {
     return c;
 }
 
-struct S a;
-struct S b;
-struct S d;
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
+    struct S a;
+    struct S b;
+    struct S d;
     int T = programIndex;
     a.a = aFOO[programIndex];
     b.a = -aFOO[programIndex];
diff --git a/tests/soa-16.ispc b/tests/soa-16.ispc
index f23c39cb..3c6ff6c4 100644
--- a/tests/soa-16.ispc
+++ b/tests/soa-16.ispc
@@ -15,6 +15,16 @@ static void p(uniform float *uniform ptr) {
 }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+#ifdef __NVPTX__ /* soa is converted to shared memory storage for now, use a smaller amount to check the test */
+    soa<4> Point pts[10];
+    for (uniform int i = 0; i < 40; ++i) {
+        pts[i].x = b*i;
+        pts[i].y[0] = 2*b*i;
+        pts[i].y[1] = 2*b*i+1;
+        pts[i].y[2] = 2*b*i+2;
+        pts[i].z = 3*b*i;
+    }
+#else
     soa<4> Point pts[30];
     for (uniform int i = 0; i < 120; ++i) {
         pts[i].x = b*i;
@@ -23,6 +33,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].y[2] = 2*b*i+2;
         pts[i].z = 3*b*i;
     }
+#endif
 
     float a = aFOO[programIndex];
     a *= -1;
diff --git a/tests/soa-17.ispc b/tests/soa-17.ispc
index f25b85bd..5dc9ea2f 100644
--- a/tests/soa-17.ispc
+++ b/tests/soa-17.ispc
@@ -16,6 +16,16 @@ static void p(uniform float *uniform ptr) {
 }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+#ifdef __NVPTX__ /* soa is converted to shared memory storage for now, use a smaller amount to check the test */
+    soa<4> Point pts[15];
+    for (uniform int i = 0; i < 60; ++i) {
+        pts[i].x = b*i;
+        pts[i].y[0] = 2*b*i;
+        pts[i].y[1] = 2*b*i+1;
+        pts[i].y[2] = 2*b*i+2;
+        pts[i].z = 3*b*i;
+    }
+#else
     soa<4> Point pts[40];
     for (uniform int i = 0; i < 160; ++i) {
         pts[i].x = b*i;
@@ -24,6 +34,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].y[2] = 2*b*i+2;
         pts[i].z = 3*b*i;
     }
+#endif
 
     float a = aFOO[programIndex];
     a *= -1;
diff --git a/tests/soa-22.ispc b/tests/soa-22.ispc
index 60448694..ba3ffa0c 100644
--- a/tests/soa-22.ispc
+++ b/tests/soa-22.ispc
@@ -25,7 +25,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
             }
         }
     }
-
+    assert(programIndex < 80);
     RET[programIndex] = pts[programIndex].pts[programIndex % 3][programIndex % 4].z;
 }
 
diff --git a/tests/soa-3.ispc b/tests/soa-3.ispc
index 2cec07a5..86c7c57c 100644
--- a/tests/soa-3.ispc
+++ b/tests/soa-3.ispc
@@ -6,6 +6,17 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
+#ifdef __NVPTX__ /* soa is converted to shared memory storage for now, use a smaller amount to check the test */
+    soa<8> Point pts[4];
+//CO    uniform Point pts[80];
+    foreach (i = 0 ... 40) {
+        pts[i].x = b*i;
+        pts[i].y[0] = 2*b*i;
+        pts[i].y[1] = 2*b*i+1;
+        pts[i].y[2] = 2*b*i+2;
+        pts[i].z = 3*b*i;
+    }
+#else
     soa<8> Point pts[10];
 //CO    uniform Point pts[80];
     foreach (i = 0 ... 80) {
@@ -15,6 +26,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].y[2] = 2*b*i+2;
         pts[i].z = 3*b*i;
     }
+#endif
 
     assert(programCount < 80);
     RET[programIndex] = pts[programIndex].y[2];
diff --git a/tests/test-134.ispc b/tests/test-134.ispc
index baa8ec37..9d4d0e94 100644
--- a/tests/test-134.ispc
+++ b/tests/test-134.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    for (int i = 0; i < programCount; i += 4)
+    {
+        RET[i+0] = 1;
+        RET[i+1] = 3;
+        RET[i+2] = 3;
+        RET[i+3] = 29;
+    }
 }
diff --git a/tests/test-135.ispc b/tests/test-135.ispc
index c350a524..bb9881e6 100644
--- a/tests/test-135.ispc
+++ b/tests/test-135.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    for (int i = 0; i < programCount; i += 4)
+    {
+        RET[i+0] = 1;
+        RET[i+1] = 3;
+        RET[i+2] = 3;
+        RET[i+3] = 29;
+    }
 }
diff --git a/tests/test-136.ispc b/tests/test-136.ispc
index ab6c6b5b..098ac456 100644
--- a/tests/test-136.ispc
+++ b/tests/test-136.ispc
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    for (int i = 0; i < programCount; i += 4)
+    {
+        RET[i+0] = 1;
+        RET[i+1] = 3;
+        RET[i+2] = 3;
+        RET[i+3] = 29;
+    }
 }
diff --git a/tests/test-140.ispc b/tests/test-140.ispc
index a983d528..997d558e 100644
--- a/tests/test-140.ispc
+++ b/tests/test-140.ispc
@@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0;
-    RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1;
-    RET[2] = RET[6] = RET[10] = RET[14] = 0x1.193ea8p+0;
-    RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0;
+    for (int i = 0; i < programCount; i += 4)
+    {
+        RET[i+0] = 0x0.0p+0;
+        RET[i+1] = 0x1.62e43p-1;
+        RET[i+2] = 0x1.193ea8p+0;
+        RET[i+3] = 0x1.62e43p+0;
+    }
 }
diff --git a/tests/test-141.ispc b/tests/test-141.ispc
index b69be1fa..9045c081 100644
--- a/tests/test-141.ispc
+++ b/tests/test-141.ispc
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
     // calculation error 1e-6 is the same as in icc
-    RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0;
+    RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 1 : 0;
 }
 
 export void result(uniform float RET[4]) {
diff --git a/tests/test-142.ispc b/tests/test-142.ispc
index 18053402..9ab8ff9f 100644
--- a/tests/test-142.ispc
+++ b/tests/test-142.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
-    RET[programIndex] = round(a+.499999);
+    RET[programIndex] = round(a+.49999);
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/test-144.ispc b/tests/test-144.ispc
index 568bdc10..64e1817a 100644
--- a/tests/test-144.ispc
+++ b/tests/test-144.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
-    RET[programIndex] = floor(a+.999999);
+    RET[programIndex] = floor(a+.99999);
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/uniform-1.ispc b/tests/uniform-1.ispc
new file mode 100644
index 00000000..dcf4eab0
--- /dev/null
+++ b/tests/uniform-1.ispc
@@ -0,0 +1,34 @@
+
+export uniform int width() { return programCount; }
+
+
+task void f_f_task(uniform float RET[], uniform float aFOO[]) {
+    uniform float val[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        val[i] = 0;
+
+    foreach (i = 0 ... programCount)
+        val[i] += aFOO[programCount*taskIndex + i] - 1;
+
+    uniform float sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        sum += val[i];
+
+    if (programIndex < 32/4)
+        RET[programCount/4*taskIndex + programIndex] = sum;
+}
+
+export void f_f(uniform float RET[], uniform float aFOO[])
+{
+    launch[4] f_f_task(RET, aFOO);
+}
+task void result_task(uniform float RET[])
+{
+    const uniform float ret = reduce_add(programIndex + programCount*taskIndex);
+    if (programIndex < 32/4)
+        RET[programCount/4*taskIndex + programIndex] = ret;
+}
+
+export void result(uniform float RET[]) {
+    launch[4] result_task(RET);
+}
diff --git a/type.cpp b/type.cpp
index 456a9520..822f7402 100644
--- a/type.cpp
+++ b/type.cpp
@@ -3058,7 +3058,11 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
         llvmArgTypes.push_back(LLVMTypes::MaskType);
 
     std::vector<llvm::Type *> callTypes;
-    if (isTask) {
+    if (isTask
+#ifdef ISPC_NVPTX_ENABLED
+        && (g->target->getISA() != Target::NVPTX)
+#endif
+        ){
         // Tasks take three arguments: a pointer to a struct that holds the
         // actual task arguments, the thread index, and the total number of
         // threads the tasks system has running. (Task arguments are