Merge pull request #749 from egaburov/nvptx_clean

Experimental support for PTX with examples
This commit is contained in:
Dmitry Babokin
2014-10-16 15:56:02 +04:00
158 changed files with 21326 additions and 204 deletions

View File

@@ -141,3 +141,46 @@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
---------------------------------------------------------------------------
The ptxtools use parts of the PTX parser code from GPU Ocelot project
(https://code.google.com/p/gpuocelot/), which is covered by the following
license:
Copyright 2011
GEORGIA TECH RESEARCH CORPORATION
ALL RIGHTS RESERVED
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimers.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimers in the
documentation and/or other materials provided with the
distribution.
* Neither the name of GEORGIA TECH RESEARCH CORPORATION nor the
names of its contributors may be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY GEORGIA TECH RESEARCH CORPORATION ''AS IS''
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEORGIA TECH RESEARCH
CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You agree that the Software will not be shipped, transferred, exported,
or re-exported directly into any country prohibited by the United States
Export Administration Act and the regulations thereunder nor will be
used for any purpose prohibited by the Act.

View File

@@ -73,6 +73,10 @@ endif
# To enable: make ARM_ENABLED=1
ARM_ENABLED=0
# Disable NVPTX by request
# To disable: make NVPTX_ENABLED=0
NVPTX_ENABLED=1
# Add llvm bin to the path so any scripts run will go to the right llvm-config
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
export PATH:=$(LLVM_BIN):$(PATH)
@@ -89,7 +93,7 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e 's/svn//' -e 's/\./_/' -e 's/\..*//')
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
# Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step.
# We check if it's available before adding it (to not break 3.2 and earlier).
ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
@@ -98,6 +102,9 @@ endif
ifneq ($(ARM_ENABLED), 0)
LLVM_COMPONENTS+=arm
endif
ifneq ($(NVPTX_ENABLED), 0)
LLVM_COMPONENTS+=nvptx
endif
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))
CLANG=clang
@@ -160,6 +167,9 @@ endif
ifneq ($(ARM_ENABLED), 0)
CXXFLAGS+=-DISPC_ARM_ENABLED
endif
ifneq ($(NVPTX_ENABLED), 0)
CXXFLAGS+=-DISPC_NVPTX_ENABLED
endif
LDFLAGS=
ifeq ($(ARCH_OS),Linux)
@@ -184,6 +194,9 @@ TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-
ifneq ($(ARM_ENABLED), 0)
TARGETS+=neon-32 neon-16 neon-8
endif
ifneq ($(NVPTX_ENABLED), 0)
TARGETS+=nvptx
endif
# These files need to be compiled in two versions - 32 and 64 bits.
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
# These are files to be compiled in single version.
@@ -289,15 +302,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $<
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(32 bit version\)
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(64 bit version\)
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@

View File

@@ -342,11 +342,17 @@ lSetInternalFunctions(llvm::Module *module) {
"__all",
"__any",
"__aos_to_soa3_float",
//#ifdef ISPC_NVPTX_ENABLED
"__aos_to_soa3_float1",
//#endif /* ISPC_NVPTX_ENABLED */
"__aos_to_soa3_float16",
"__aos_to_soa3_float4",
"__aos_to_soa3_float8",
"__aos_to_soa3_int32",
"__aos_to_soa4_float",
//#ifdef ISPC_NVPTX_ENABLED
"__aos_to_soa4_float1",
//#endif /* ISPC_NVPTX_ENABLED */
"__aos_to_soa4_float16",
"__aos_to_soa4_float4",
"__aos_to_soa4_float8",
@@ -395,6 +401,38 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_xor_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
//#ifdef ISPC_NVPTX_ENABLED
"__atomic_add_varying_int32_global",
"__atomic_add_varying_int64_global",
"__atomic_and_varying_int32_global",
"__atomic_and_varying_int64_global",
"__atomic_compare_exchange_varying_double_global",
"__atomic_compare_exchange_varying_float_global",
"__atomic_compare_exchange_varying_int32_global",
"__atomic_compare_exchange_varying_int64_global",
"__atomic_max_varying_int32_global",
"__atomic_max_varying_int64_global",
"__atomic_min_varying_int32_global",
"__atomic_min_varying_int64_global",
"__atomic_or_varying_int32_global",
"__atomic_or_varying_int64_global",
"__atomic_sub_varying_int32_global",
"__atomic_sub_varying_int64_global",
"__atomic_swap_varying_double_global",
"__atomic_swap_varying_float_global",
"__atomic_swap_varying_int32_global",
"__atomic_swap_varying_int64_global",
"__atomic_umax_varying_uint32_global",
"__atomic_umax_varying_uint64_global",
"__atomic_umin_varying_uint32_global",
"__atomic_umin_varying_uint64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
//#endif /* ISPC_NVPTX_ENABLED */
"__broadcast_double",
"__broadcast_float",
"__broadcast_i16",
@@ -417,6 +455,9 @@ lSetInternalFunctions(llvm::Module *module) {
"__do_assert_uniform",
"__do_assert_varying",
"__do_print",
//#ifdef ISPC_NVPTX_ENABLED
"__do_print_nvptx",
//#endif /* ISPC_NVPTX_ENABLED */
"__doublebits_uniform_int64",
"__doublebits_varying_int64",
"__exclusive_scan_add_double",
@@ -431,6 +472,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__extract_int32",
"__extract_int64",
"__extract_int8",
//#ifdef ISPC_NVPTX_ENABLED
"__extract_float",
"__extract_double",
//#endif /* ISPC_NVPTX_ENABLED */
"__fastmath",
"__float_to_half_uniform",
"__float_to_half_varying",
@@ -447,6 +492,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__insert_int32",
"__insert_int64",
"__insert_int8",
//#ifdef ISPC_NVPTX_ENABLED
"__insert_float",
"__insert_double",
//#endif /* ISPC_NVPTX_ENABLED */
"__intbits_uniform_double",
"__intbits_uniform_float",
"__intbits_varying_double",
@@ -483,6 +532,9 @@ lSetInternalFunctions(llvm::Module *module) {
"__min_varying_uint32",
"__min_varying_uint64",
"__movmsk",
//#ifdef ISPC_NVPTX_ENABLED
"__movmsk_ptx",
//#endif /* ISPC_NVPTX_ENABLED */
"__new_uniform_32rt",
"__new_uniform_64rt",
"__new_varying32_32rt",
@@ -581,6 +633,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__soa_to_aos3_float8",
"__soa_to_aos3_int32",
"__soa_to_aos4_float",
//#ifdef ISPC_NVPTX_ENABLED
"__soa_to_aos3_float1",
"__soa_to_aos4_float1",
//#endif /* ISPC_NVPTX_ENABLED */
"__soa_to_aos4_float16",
"__soa_to_aos4_float4",
"__soa_to_aos4_float8",
@@ -681,6 +737,26 @@ lSetInternalFunctions(llvm::Module *module) {
"__vec4_add_float",
"__vec4_add_int32",
"__vselect_float",
//#ifdef ISPC_NVPTX_ENABLED
"__program_index",
"__program_count",
"__warp_index",
"__task_index0",
"__task_index1",
"__task_index2",
"__task_index",
"__task_count0",
"__task_count1",
"__task_count2",
"__task_count",
"__cvt_loc2gen",
"__cvt_loc2gen_var",
"__cvt_const2gen",
"__puts_nvptx",
"ISPCAlloc",
"ISPCLaunch",
"ISPCSync",
//#endif /* ISPC_NVPTX_ENABLED */
"__vselect_i32"
};
@@ -759,6 +835,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
g->target->getISA() != Target::NEON16 &&
g->target->getISA() != Target::NEON8)
#endif // !__arm__
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() != Target::NVPTX)
#endif /* ISPC_NVPTX_ENABLED */
{
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
mTriple.getArch() == bcTriple.getArch());
@@ -954,6 +1033,19 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// Next, add the target's custom implementations of the various needed
// builtin functions (e.g. __masked_store_32(), etc).
switch (g->target->getISA()) {
#ifdef ISPC_NVPTX_ENABLED
case Target::NVPTX:
{
if (runtime32) {
fprintf(stderr, "Unfortunatly 32bit targets are not supported at the moment .. \n");
assert(0);
}
else {
EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
}
break;
};
#endif /* ISPC_NVPTX_ENABLED */
#ifdef ISPC_ARM_ENABLED
case Target::NEON8: {
@@ -1224,7 +1316,18 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
// define the 'programCount' builtin variable
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
lDefineConstantInt("programCount", 32, module, symbolTable);
}
else
{
#endif /* ISPC_NVPTX_ENABLED */
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
#ifdef ISPC_NVPTX_ENABLED
}
#endif /* ISPC_NVPTX_ENABLED */
// define the 'programIndex' builtin
lDefineProgramIndex(module, symbolTable);
@@ -1256,6 +1359,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
module, symbolTable);
#ifdef ISPC_NVPTX_ENABLED
lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX),
module, symbolTable);
#else
lDefineConstantInt("__is_nvptx_target", (int)0, module, symbolTable);
#endif /* ISPC_NVPTX_ENABLED */
if (g->forceAlignment != -1) {
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
alignment->setInitializer(LLVMInt32(g->forceAlignment));

View File

@@ -0,0 +1,130 @@
#include <cstdio>
#define PRINT_BUF_SIZE 4096
#define uint64_t unsigned long long
static __device__ size_t d_strlen(const char *str)
{
    // Device-side strlen: walk to the terminating NUL and return the
    // distance covered.
    const char *end = str;
    while (*end != '\0')
        ++end;
    return (size_t)(end - str);
}
static __device__ char* d_strncat(char *dest, const char *src, size_t n)
{
    // Device-side strncat: append at most n characters of src (stopping
    // early at src's NUL) onto the end of dest, then NUL-terminate.
    // Returns dest, matching the C library contract.
    char *tail = dest + d_strlen(dest);
    size_t copied = 0;
    while (copied < n && src[copied] != '\0') {
        tail[copied] = src[copied];
        ++copied;
    }
    tail[copied] = '\0';
    return dest;
}
// APPEND(str): copy 'str' onto the end of the on-stack print buffer,
// advancing 'bufp'.  NUL-terminates at the current position first so
// d_strncat() sees a proper C string.  On (potential) overflow it jumps
// to the 'done' label, so it may only be used inside a function that
// declares that label and has printString/bufp in scope.
#define APPEND(str) \
do { \
int offset = bufp - &printString[0]; \
*bufp = '\0'; \
d_strncat(bufp, str, PRINT_BUF_SIZE-offset); \
bufp += d_strlen(str); \
if (bufp >= &printString[PRINT_BUF_SIZE]) \
goto done; \
} while (0) /* eat semicolon */
// PRINT_SCALAR(fmt, type): format the scalar argument at 'ptr' with the
// given printf-style format and append it.  Ends with 'break' so it can
// serve directly as a switch-case body.
#define PRINT_SCALAR(fmt, type) \
sprintf(tmpBuf, fmt, *((type *)ptr)); \
APPEND(tmpBuf); \
break
// PRINT_VECTOR(fmt, type): append a bracketed, comma-separated view of a
// varying value, printing "(( * )) " for lanes that are off in 'mask'.
// NOTE(review): every iteration reads *ptr rather than ((type*)ptr)[i]
// (unlike the CPU runtime's version) — presumably each PTX thread only
// holds its own lane's value; confirm against the nvptx codegen before
// reusing this macro elsewhere.
#define PRINT_VECTOR(fmt, type) \
*bufp++ = '['; \
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
for (int i = 0; i < width; ++i) { \
/* only print the value if the current lane is executing */ \
type val0 = *((type*)ptr); \
type val = val0; \
if (mask & (1ull<<i)) \
sprintf(tmpBuf, fmt, val); \
else \
sprintf(tmpBuf, "(( * )) "); \
APPEND(tmpBuf); \
*bufp++ = (i != width-1 ? ',' : ']'); \
} \
break
// Device-side implementation of ispc's print() for the NVPTX target.
// 'format' holds literal text in which each argument is marked by a bare
// '%'; 'types' encodes each argument's type as one letter (lowercase =
// uniform/scalar, uppercase = varying with 'width' lanes); 'mask' is the
// execution mask; 'args' holds one pointer per argument.  Output is
// accumulated into an on-stack buffer and newline+NUL terminated.
// NOTE(review): the finished buffer is never emitted inside this function
// — presumably the caller (or generated code) prints it; confirm.
extern "C"
__device__ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
void **args) {
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
char *bufp = &printString[0];
char tmpBuf[256];
const char trueBuf[] = "true";
const char falseBuf[] = "false";
int argCount = 0;
// Walk the format string until it is exhausted or the buffer fills up.
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
// Format strings are just single percent signs.
if (*format != '%') {
*bufp++ = *format;
}
else {
if (*types) {
void *ptr = args[argCount++];
// Based on the encoding in the types string, cast the
// value appropriately and print it with a reasonable
// printf() formatting string.
switch (*types) {
case 'b': {
const char *tmpBuf1 = *((bool *)ptr) ? trueBuf : falseBuf;
APPEND(tmpBuf1);
break;
}
// Varying bool: bracketed per-lane list with a placeholder
// for lanes that are off in 'mask'.
// NOTE(review): *ptr is read for every lane (no [i] index),
// matching PRINT_VECTOR above — verify this is intended for
// the per-thread PTX execution model.
case 'B': {
*bufp++ = '[';
if (bufp == &printString[PRINT_BUF_SIZE])
break;
for (int i = 0; i < width; ++i) {
bool val0 = *((bool*)ptr);
bool val = val0; \
if (mask & (1ull << i)) {
const char *tmpBuf1 = val ? trueBuf : falseBuf;
APPEND(tmpBuf1);
}
else
APPEND("_________");
*bufp++ = (i != width-1) ? ',' : ']';
}
break;
}
case 'i': PRINT_SCALAR("%d", int);
case 'I': PRINT_VECTOR("%d", int);
case 'u': PRINT_SCALAR("%u", unsigned int);
case 'U': PRINT_VECTOR("%u", unsigned int);
case 'f': PRINT_SCALAR("%f", float);
case 'F': PRINT_VECTOR("%f", float);
case 'l': PRINT_SCALAR("%lld", long long);
case 'L': PRINT_VECTOR("%lld", long long);
case 'v': PRINT_SCALAR("%llu", unsigned long long);
case 'V': PRINT_VECTOR("%llu", unsigned long long);
case 'd': PRINT_SCALAR("%f", double);
case 'D': PRINT_VECTOR("%f", double);
case 'p': PRINT_SCALAR("%p", void *);
case 'P': PRINT_VECTOR("%p", void *);
default:
// Unknown type code: note it and echo the offending letter.
APPEND("UNKNOWN TYPE ");
*bufp++ = *types;
}
++types;
}
}
++format;
}
// APPEND() jumps here when the buffer is (about to be) full.
done:
*bufp = '\n'; bufp++;
*bufp = '\0';
}

View File

@@ -185,6 +185,81 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
fflush(stdout);
}
/* this is print for PTX target only */
int __puts_nvptx(const char *);

/** Host-side counterpart of the NVPTX print routine, with the same
    contract as __do_print(): 'format' contains literal text with bare '%'
    argument markers, 'types' one letter per argument (lowercase = uniform,
    uppercase = varying of 'width' lanes), 'mask' the execution mask and
    'args' one pointer per argument.

    The full formatting path below is currently disabled (#if 0); until it
    is wired up this stub just reports that nvptx printing is unavailable
    via __puts_nvptx(). */
void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
                      void **args) {
#if 0
    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
    char *bufp = &printString[0];
    char tmpBuf[256];
    int argCount = 0;
    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
        // Format strings are just single percent signs.
        if (*format != '%') {
            *bufp++ = *format;
        }
        else {
            if (*types) {
                void *ptr = args[argCount++];
                // Based on the encoding in the types string, cast the
                // value appropriately and print it with a reasonable
                // printf() formatting string.
                switch (*types) {
                case 'b': {
                    sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
                    APPEND(tmpBuf);
                    break;
                }
                case 'B': {
                    *bufp++ = '[';
                    if (bufp == &printString[PRINT_BUF_SIZE])
                        break;
                    for (int i = 0; i < width; ++i) {
                        if (mask & (1ull << i)) {
                            sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
                            APPEND(tmpBuf);
                        }
                        else
                            APPEND("_________");
                        *bufp++ = (i != width-1) ? ',' : ']';
                    }
                    break;
                }
                case 'i': PRINT_SCALAR("%d", int);
                case 'I': PRINT_VECTOR("%d", int);
                case 'u': PRINT_SCALAR("%u", unsigned int);
                case 'U': PRINT_VECTOR("%u", unsigned int);
                case 'f': PRINT_SCALAR("%f", float);
                case 'F': PRINT_VECTOR("%f", float);
                case 'l': PRINT_SCALAR("%lld", long long);
                case 'L': PRINT_VECTOR("%lld", long long);
                case 'v': PRINT_SCALAR("%llu", unsigned long long);
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
                case 'p': PRINT_SCALAR("%p", void *);
                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    APPEND("UNKNOWN TYPE ");
                    *bufp++ = *types;
                }
                ++types;
            }
        }
        ++format;
    }
 done:
    *bufp = '\n'; bufp++;
    *bufp = '\0';
    __puts_nvptx(printString);
#else
    // Fix: user-visible message read "is not support"; corrected grammar.
    __puts_nvptx("---nvptx printing is not supported---\n");
#endif
}
int __num_cores() {
#if defined(_MSC_VER) || defined(__MINGW32__)

View File

@@ -289,4 +289,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
;; int8/int16 builtins
define_avgs()
declare_nvptx()

View File

@@ -42,6 +42,7 @@ packed_load_and_store()
scans()
int64minmax()
aossoa()
declare_nvptx()
saturation_arithmetic_novec()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -382,6 +382,7 @@ declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x
;; int8/int16 builtins
define_avgs()
declare_nvptx()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

View File

@@ -344,3 +344,4 @@ packed_load_and_store(4)
;; prefetch
define_prefetches()
declare_nvptx()

2340
builtins/target-nvptx.ll Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
define_avgs()
declare_nvptx()

View File

@@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
declare_nvptx()

3492
builtins/util-nvptx.m4 Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -4964,6 +4964,62 @@ declare double @__rcp_uniform_double(double)
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
')
define(`declare_nvptx',
`
declare i32 @__program_index() nounwind readnone alwaysinline
declare i32 @__program_count() nounwind readnone alwaysinline
declare i32 @__warp_index() nounwind readnone alwaysinline
declare i32 @__task_index0() nounwind readnone alwaysinline
declare i32 @__task_index1() nounwind readnone alwaysinline
declare i32 @__task_index2() nounwind readnone alwaysinline
declare i32 @__task_index() nounwind readnone alwaysinline
declare i32 @__task_count0() nounwind readnone alwaysinline
declare i32 @__task_count1() nounwind readnone alwaysinline
declare i32 @__task_count2() nounwind readnone alwaysinline
declare i32 @__task_count() nounwind readnone alwaysinline
declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline
declare i64 @__movmsk_ptx(<WIDTH x i1>) nounwind readnone alwaysinline;
')
define(`global_atomic_varying',`
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
')
define(`global_atomic_cas_varying',`
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
')
global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32)
global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64)
global_atomic_cas_varying(WIDTH, compare_exchange, float, float)
global_atomic_cas_varying(WIDTH, compare_exchange, double, double)
global_atomic_varying(WIDTH, swap, i32, int32)
global_atomic_varying(WIDTH, swap, i64, int64)
global_atomic_varying(WIDTH, swap, float, float)
global_atomic_varying(WIDTH, swap, double, double)
global_atomic_varying(WIDTH, add, i32, int32)
global_atomic_varying(WIDTH, sub, i32, int32)
global_atomic_varying(WIDTH, and, i32, int32)
global_atomic_varying(WIDTH, or, i32, int32)
global_atomic_varying(WIDTH, xor, i32, int32)
global_atomic_varying(WIDTH, min, i32, int32)
global_atomic_varying(WIDTH, max, i32, int32)
global_atomic_varying(WIDTH, umin, i32, uint32)
global_atomic_varying(WIDTH, umax, i32, uint32)
global_atomic_varying(WIDTH, add, i64, int64)
global_atomic_varying(WIDTH, sub, i64, int64)
global_atomic_varying(WIDTH, and, i64, int64)
global_atomic_varying(WIDTH, or, i64, int64)
global_atomic_varying(WIDTH, xor, i64, int64)
global_atomic_varying(WIDTH, min, i64, int64)
global_atomic_varying(WIDTH, max, i64, int64)
global_atomic_varying(WIDTH, umin, i64, uint64)
global_atomic_varying(WIDTH, umax, i64, uint64)
define(`transcendetals_decl',`
declare float @__log_uniform_float(float) nounwind readnone

233
ctx.cpp
View File

@@ -57,6 +57,10 @@
#include <llvm/IR/Instructions.h>
#include <llvm/IR/DerivedTypes.h>
#endif
#ifdef ISPC_NVPTX_ENABLED
#include <llvm/Support/raw_ostream.h>
#include <llvm/Support/FormattedStream.h>
#endif /* ISPC_NVPTX_ENABLED */
/** This is a small utility structure that records information related to one
level of nested control flow. It's mostly used in correctly restoring
@@ -1383,10 +1387,17 @@ FunctionEmitContext::None(llvm::Value *mask) {
llvm::Value *
FunctionEmitContext::LaneMask(llvm::Value *v) {
#ifdef ISPC_NVPTX_ENABLED
/* this makes mandelbrot example slower with "nvptx" target.
* Needs further investigation. */
const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
#else
const char *__movmsk = "__movmsk";
#endif
// Call the target-dependent movmsk function to turn the vector mask
// into an i64 value
std::vector<Symbol *> mm;
m->symbolTable->LookupFunction("__movmsk", &mm);
m->symbolTable->LookupFunction(__movmsk, &mm);
if (g->target->getMaskBitCount() == 1)
AssertPos(currentPos, mm.size() == 1);
else
@@ -1398,13 +1409,78 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
}
#ifdef ISPC_NVPTX_ENABLED
bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
{
    // Append the element-type suffix used by the __insert_* / __extract_*
    // builtins to funcName.  Returns false (leaving funcName untouched)
    // when the vector's type has no corresponding builtin.
    llvm::Type *vecType = vector->getType();
    const char *suffix = NULL;
    if (vecType == LLVMTypes::Int8VectorType)
        suffix = "_int8";
    else if (vecType == LLVMTypes::Int16VectorType)
        suffix = "_int16";
    else if (vecType == LLVMTypes::Int32VectorType)
        suffix = "_int32";
    else if (vecType == LLVMTypes::Int64VectorType)
        suffix = "_int64";
    else if (vecType == LLVMTypes::FloatVectorType)
        suffix = "_float";
    else if (vecType == LLVMTypes::DoubleVectorType)
        suffix = "_double";
    if (suffix == NULL)
        return false;
    funcName += suffix;
    return true;
}
llvm::Value*
FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
{
    // Emit a call to the __insert_<type> builtin, which returns 'vector'
    // with 'scalar' written into position 'lane'.
    std::string funcName = "__insert";
    // Bug fix: the suffix lookup has the side effect of building funcName,
    // so it must not live inside assert() — under NDEBUG the call would be
    // compiled away, leaving funcName as "__insert" and func as NULL.
    bool validType = lAppendInsertExtractName(vector, funcName);
    assert(validType);
    (void)validType;
    assert(lane->getType() == LLVMTypes::Int32Type);
    llvm::Function *func = m->module->getFunction(funcName.c_str());
    assert(func != NULL);
    std::vector<llvm::Value *> args;
    args.push_back(vector);
    args.push_back(lane);
    args.push_back(scalar);
    llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()),
                                              GetCurrentBasicBlock());
    return ret;
}
llvm::Value*
FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
{
    // Emit a call to the __extract_<type> builtin, which returns the
    // element of 'vector' at position 'lane'.
    std::string funcName = "__extract";
    // Bug fix: the suffix lookup has the side effect of building funcName,
    // so it must not live inside assert() — under NDEBUG the call would be
    // compiled away, leaving funcName as "__extract" and func as NULL.
    bool validType = lAppendInsertExtractName(vector, funcName);
    assert(validType);
    (void)validType;
    assert(lane->getType() == LLVMTypes::Int32Type);
    llvm::Function *func = m->module->getFunction(funcName.c_str());
    assert(func != NULL);
    std::vector<llvm::Value *> args;
    args.push_back(vector);
    args.push_back(lane);
    llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()),
                                              GetCurrentBasicBlock());
    return ret;
}
#endif /* ISPC_NVPTX_ENABLED */
llvm::Value *
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
// Compare the two masks to get a vector of i1s
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
v1, v2, "v1==v2");
return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? */
}
#endif /* ISPC_NVPTX_ENABLED */
#if 0
// Compare the two masks to get a vector of i1s
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
v1, v2, "v1==v2");
v1, v2, "v1==v2");
// Turn that into a bool vector type (often i32s)
cmp = I1VecToBoolVec(cmp);
// And see if it's all on
@@ -1413,7 +1489,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
llvm::Value *mm1 = LaneMask(v1);
llvm::Value *mm2 = LaneMask(v2);
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
LLVMGetName("equal", v1, v2));
LLVMGetName("equal", v1, v2));
#endif
}
@@ -1421,8 +1497,8 @@ llvm::Value *
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
llvm::SmallVector<llvm::Constant*, 16> array;
for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
array.push_back(C);
llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
array.push_back(C);
}
llvm::Constant* index = llvm::ConstantVector::get(array);
@@ -1430,6 +1506,20 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
return index;
}
#ifdef ISPC_NVPTX_ENABLED
llvm::Value *
FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
    // NVPTX analogue of ProgramIndexVector(): instead of a constant
    // <0,1,2,...> vector, call the __program_index builtin and insert the
    // result into element 0 of an i32 vector.
    // NOTE(review): 'is32bits' is currently unused because the widening
    // below is disabled — confirm callers only rely on the 32-bit form.
    llvm::Function *func_program_index = m->module->getFunction("__program_index");
    llvm::Value *programIndex = CallInst(func_program_index, NULL, std::vector<llvm::Value*>(),
                                         "foreach__program_indexS");
    llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType),
                                    programIndex, 0, "foreach__program_indexV");
#if 0
    // Fix latent typo so this compiles if ever re-enabled:
    // Int64VectandType -> Int64VectorType.
    if (!is32bits)
        index = ZExtInst(index, LLVMTypes::Int64VectorType);
#endif
    return index;
}
#endif /* ISPC_NVPTX_ENABLED */
llvm::Value *
FunctionEmitContext::GetStringPtr(const std::string &str) {
@@ -3555,31 +3645,117 @@ llvm::Value *
FunctionEmitContext::LaunchInst(llvm::Value *callee,
std::vector<llvm::Value *> &argVals,
llvm::Value *launchCount[3]){
if (callee == NULL) {
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
return NULL;
}
launchedTasks = true;
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
std::vector<llvm::Type*> argTypes;
llvm::Function *F = llvm::dyn_cast<llvm::Function>(callee);
const unsigned int nArgs = F->arg_size();
llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
for (; I != E; ++I)
argTypes.push_back(I->getType());
llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
if (structSize->getType() != LLVMTypes::Int64Type)
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
"struct_size_to_64");
const int align = 8;
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
AssertPos(currentPos, falloc != NULL);
std::vector<llvm::Value *> allocArgs;
allocArgs.push_back(launchGroupHandlePtr);
allocArgs.push_back(structSize);
allocArgs.push_back(LLVMInt32(align));
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
llvm::BasicBlock* if_true = CreateBasicBlock("if_true");
llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
/* check if the pointer returned by ISPCAlloc is not NULL
* --------------
* this is a workaround for not checking the value of programIndex
* because ISPCAlloc will return NULL pointer for all programIndex > 0
* of course, if ISPAlloc fails to get parameter buffer, the pointer for programIndex = 0
* will also be NULL
* This check must be added, and also rewrite the code to make it less opaque
*/
llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
BranchInst(if_true, if_false, cmp1);
/**********************/
bblock = if_true;
// label_if_then block:
llvm::Type *pt = llvm::PointerType::getUnqual(st);
llvm::Value *argmem = BitCastInst(voidmem, pt);
for (unsigned int i = 0; i < argVals.size(); ++i)
{
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
// don't need to do masked store here, I think
StoreInst(argVals[i], ptr);
}
if (nArgs == argVals.size() + 1) {
// copy in the mask
llvm::Value *mask = GetFullMask();
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
"funarg_mask");
StoreInst(mask, ptr);
}
BranchInst(if_false);
/**********************/
bblock = if_false;
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
AssertPos(currentPos, flaunch != NULL);
std::vector<llvm::Value *> args;
args.push_back(launchGroupHandlePtr);
args.push_back(fptr);
args.push_back(voidmem);
args.push_back(launchCount[0]);
args.push_back(launchCount[1]);
args.push_back(launchCount[2]);
llvm::Value *ret = CallInst(flaunch, NULL, args, "");
return ret;
}
#endif /* ISPC_NVPTX_ENABLED */
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
return NULL;
}
launchedTasks = true;
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
llvm::Type *argType =
(llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
(llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
AssertPos(currentPos, llvm::PointerType::classof(argType));
llvm::PointerType *pt =
llvm::dyn_cast<llvm::PointerType>(argType);
llvm::dyn_cast<llvm::PointerType>(argType);
AssertPos(currentPos, llvm::StructType::classof(pt->getElementType()));
llvm::StructType *argStructType =
static_cast<llvm::StructType *>(pt->getElementType());
static_cast<llvm::StructType *>(pt->getElementType());
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
AssertPos(currentPos, falloc != NULL);
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
if (structSize->getType() != LLVMTypes::Int64Type)
// ISPCAlloc expects the size as an uint64_t, but on 32-bit
// targets, SizeOf returns a 32-bit value
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
"struct_size_to_64");
// ISPCAlloc expects the size as an uint64_t, but on 32-bit
// targets, SizeOf returns a 32-bit value
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
"struct_size_to_64");
int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth());
std::vector<llvm::Value *> allocArgs;
@@ -3592,17 +3768,17 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
// Copy the values of the parameters into the appropriate place in
// the argument block
for (unsigned int i = 0; i < argVals.size(); ++i) {
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
// don't need to do masked store here, I think
StoreInst(argVals[i], ptr);
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
// don't need to do masked store here, I think
StoreInst(argVals[i], ptr);
}
if (argStructType->getNumElements() == argVals.size() + 1) {
// copy in the mask
llvm::Value *mask = GetFullMask();
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
"funarg_mask");
StoreInst(mask, ptr);
// copy in the mask
llvm::Value *mask = GetFullMask();
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
"funarg_mask");
StoreInst(mask, ptr);
}
// And emit the call to the user-supplied task launch function, passing
@@ -3624,6 +3800,21 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
void
FunctionEmitContext::SyncInst() {
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
llvm::Value *nullPtrValue =
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
llvm::Function *fsync = m->module->getFunction("ISPCSync");
if (fsync == NULL)
FATAL("Couldn't find ISPCSync declaration?!");
CallInst(fsync, NULL, launchGroupHandle, "");
StoreInst(nullPtrValue, launchGroupHandlePtr);
return;
}
#endif /* ISPC_NVPTX_ENABLED */
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
llvm::Value *nullPtrValue =
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);

10
ctx.h
View File

@@ -302,9 +302,17 @@ public:
that indicates whether the two masks are equal. */
llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
/** Generate ConstantVector, which contains ProgramIndex, i.e.
/** generate constantvector, which contains programindex, i.e.
< i32 0, i32 1, i32 2, i32 3> */
llvm::Value *ProgramIndexVector(bool is32bits = true);
#ifdef ISPC_NVPTX_ENABLED
llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);
/** Issues a call to __insert_int8/int16/int32/int64/float/double */
llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
/** Issues a call to __extract_int8/int16/int32/int64/float/double */
llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
#endif
/** Given a string, create an anonymous global variable to hold its
value and return the pointer to the string. */

View File

@@ -168,6 +168,15 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
if (soaWidth > 0) {
#ifdef ISPC_NVPTX_ENABLED
#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */
if (g->target->getISA() == Target::NVPTX)
{
Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target.");
return NULL;
}
#endif
#endif /* ISPC_NVPTX_ENABLED */
const StructType *st = CastType<StructType>(retType);
if (st == NULL) {
@@ -402,6 +411,15 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
return;
}
#ifdef ISPC_NVPTX_ENABLED
#if 0 /* NVPTX */
if (baseType->IsUniformType())
{
fprintf(stderr, " detected uniform array of size= %d array= %s\n" ,arraySize,
baseType->IsArrayType() ? " true " : " false ");
}
#endif
#endif /* ISPC_NVPTX_ENABLED */
const Type *arrayType = new ArrayType(baseType, arraySize);
if (child != NULL) {
child->InitFromType(arrayType, ds);

View File

@@ -178,6 +178,13 @@ Contents:
+ `Data Alignment and Aliasing`_
+ `Restructuring Existing Programs to Use ISPC`_
* `Experimental support for PTX`_
+ `Overview`_
+ `Compiling For The NVIDIA Kepler GPU`_
+ `Hints`_
+ `Limitations & known issues`_
* `Disclaimer and Legal Information`_
* `Optimization Notice`_
@@ -4936,6 +4943,129 @@ program instances improves performance.
.. _ispc Performance Tuning Guide: http://ispc.github.com/perfguide.html
Experimental support for PTX
============================
``ispc`` provides experimental support for PTX code generation which currently
targets NVIDIA GPUs with compute capability >=3.5 [Kepler GPUs with support for
dynamic parallelism]. Due to its nature, the PTX backend currently imposes
several restrictions on the ``ispc`` program, which are described below.
Overview
--------
SPMD programming in ``ispc`` is similar to a warp-synchronous CUDA programming.
Namely, program instances in a gang are the equivalent of CUDA threads in a
single warp. Hence, to run efficiently on a GPU an ``ispc`` program must use the
tasking functionality via the ``launch`` keyword to ensure that multiple warps
are executed concurrently on the GPU.
``export`` functions are equipped with a CUDA C wrapper which schedules a
single warp--a thread-block with a total of 32 threads. In contrast to CPU
programming, this exported function, either directly or otherwise, should
utilize ``launch`` keyword to schedule work on a GPU.
At the PTX level, ``launch`` keyword is mapped to CUDA Dynamic Parallelism and
it schedules a grid of thread-blocks each 4 warps-wide (128 threads). As a
result, ``ispc`` has a tasking-granularity of 4 tasks with PTX target; this
restriction will be eliminated in future.
When passing pointers to an ``export`` function, it is important that they
remain valid when they are accessed from the GPU. Prior to CUDA 6.0, such a
pointer had to hold an address that is only accessible from the GPU. With the release of
CUDA 6.0, it is possible to pass a pointer to a unified memory allocated with
``cudaMallocManaged``. The examples provide rudimentary wrapper functions that
call the CUDA API for managed memory allocations, allowing programmers to avoid
explicit memory copies.
Compiling For The NVIDIA Kepler GPU
-----------------------------------
Compilation for an NVIDIA Kepler GPU is a multi-step procedure.
First, we need to generate a LLVM assembly from ``ispc`` source file (``ispc``
generates LLVM assembly instead of bitcode when ``nvptx`` target is chosen):
::
$ISPC_HOME/ispc foo.ispc --emit-llvm --target=nvptx -o foo.ll
This LLVM assembly can immediately be compiled into PTX with the help of
``ptxgen`` tool; this tool uses ``libNVVM`` which is a part of a CUDA Toolkit.
::
$ISPC_HOME/ptxtools/ptxgen --use_fast_math foo.ll -o foo.ptx
.. If ``ispc`` is compiled with LLVM >3.2, the resulting bitcode must first be
.. decompiled with the ``llvm-dis`` from LLVM 3.2 distribution; this "trick" is
.. required to generate an IR compatible with libNVVM:
.. ::
..
.. $LLVM32/bin/llvm-dis foo.bc -o foo.ll
.. $ISPC_HOME/ptxtools/ptxgen --use_fast_math foo.ll -o foo.ptx
This PTX is ready for execution on a GPU, for example via CUDA
Driver API. Alternatively, we also provide a simple ``ptxcc`` tool, which
compiles the resulting PTX code into an object file:
::
$ISPC_HOME/ptxtools/ptxcc foo.ptx -o foo_cu.o -Xnvcc="--maxrregcount=64
-Xptxas=-v"
This object file can be linked with the main program via ``nvcc``:
::
nvcc foo_cu.o foo_main.o -o foo
Hints
-----
- ``uniform`` arrays in a function scope are statically allocated in
  ``__shared__`` memory, with all ensuing consequences. For example, if more
  shared memory than is available per SMX is allocated, a link- or runtime-error will occur
- If ``uniform`` arrays of large size are desired, we recommend to use
``uniform new uniform T[size]`` for their allocation, ideally outside the
tasking function (see ``deferred/kernels.ispc`` in the deferred shading example)
Examples that produce executables for CPU, Xeon Phi and Kepler GPU display
several tuning approaches that can benefit GPU performance.
``ispc`` may also generate performance warnings that, if followed, may improve
GPU application performance.
Limitations & known issues
--------------------------
Due to its experimental form, PTX code generation is known to impose several
limitations on the ``ispc`` program, which are documented in the following list:
- Must use ``ispc`` tasking functionality to run efficiently on GPU
- Must use ``new/delete`` and/or ``ispc_malloc``/``ispc_free``/``ispc_memset``/``ispc_memcpy`` to allocate/free/set/copy memory that is visible to GPU
- ``export`` functions must have ``void`` return type.
- ``task``/``export`` functions do not accept varying data-types
- ``new``/``delete`` currently only works with ``uniform`` data-types
- ``aossoa``/``soaaos`` is not yet supported
- ``sizeof(varying)`` is not yet supported
- Function pointers do not work yet (may or may not generate compilation fail)
- ``memset``/``memcpy``/``memmove`` is not yet supported
- ``uniform`` arrays in global scope are mapped to global memory
- ``varying`` arrays in global scope are not yet supported
- ``uniform`` arrays in local scope are mapped to shared memory
- ``varying`` arrays in local scope are mapped to local memory
- ``const uniform/varying`` arrays are mapped to local memory
- ``const static uniform`` arrays are mapped to constant memory
- ``const static varying`` arrays are mapped to global memory
- ``static`` data types in local scope are not allowed; compilation will fail
- Best performance is obtained with libNVVM (LLVM PTX backend can also be used but it requires libdevice.compute_35.10.bc that comes with libNVVM)
Likely there are more... which, together with some of the above-mentioned
issues, will be fixed in due time.
Disclaimer and Legal Information
================================

2
examples/portable/aobench/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
ao
*.ppm

View File

@@ -0,0 +1,8 @@
EXAMPLE=ao
CPP_SRC=ao.cpp
ISPC_SRC=ao.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=ao
CXX_SRC=ao.cpp
ISPC_SRC=ao.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,14 @@
PROG=ao
ISPC_SRC=ao.ispc
CU_SRC=ao.cu
CXX_SRC=ao.cpp
PTXCC_REGMAX=64
#ISPC_FLAGS= --opt=disable-uniform-control-flow
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,152 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cassert>
#ifdef __linux__
#include <malloc.h>
#endif
#include <math.h>
#include <map>
#include <string>
#include <algorithm>
#include <sys/types.h>
#include "ao_ispc.h"
#include "timing.h"
#include "ispc_malloc.h"
#define NSUBSAMPLES 2
static unsigned int test_iterations[] = {3, 7, 1};
static unsigned int width, height;
static unsigned char *img;
static float *fimg;
/* Quantize a float color channel (nominally in [0,1]) to an 8-bit value,
   saturating out-of-range inputs to the [0,255] range. */
static unsigned char
clamp(float f)
{
    int v = (int)(f * 255.5);
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return (unsigned char)v;
}
/* Quantize the float image (fimg) into the byte image (img) and write it to
   disk as a binary PPM (P6) file.  Exits the program if the file cannot be
   opened. */
static void
savePPM(const char *fname, int w, int h)
{
    const int nPixels = w * h;
    for (int i = 0; i < nPixels; i++) {
        img[3 * i + 0] = clamp(fimg[3 * i + 0]);
        img[3 * i + 1] = clamp(fimg[3 * i + 1]);
        img[3 * i + 2] = clamp(fimg[3 * i + 2]);
    }

    FILE *fp = fopen(fname, "wb");
    if (fp == NULL) {
        perror(fname);
        exit(1);
    }

    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", w, h);
    fprintf(fp, "255\n");
    fwrite(img, w * h * 3, 1, fp);
    fclose(fp);
    printf("Wrote image file %s\n", fname);
}
/* Entry point: parse the image size (and optional iteration counts) from the
   command line, run the ispc+tasks path test_iterations[1] times, report the
   minimum time, and save the resulting image as a PPM file. */
int main(int argc, char **argv)
{
    if (argc < 3) {
        printf ("%s\n", argv[0]);
        printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n");
        getchar();
        exit(-1);
    }
    else {
        // Optional trailing arguments override the default iteration counts.
        if (argc == 6) {
            for (int i = 0; i < 3; i++) {
                test_iterations[i] = atoi(argv[3 + i]);
            }
        }
        width = atoi (argv[1]);
        height = atoi (argv[2]);
    }

    // Allocate space for output images
    img = new unsigned char[width * height * 3];
    fimg = new float[width * height * 3];

    //
    // Run the ispc + tasks path, test_iterations times, and report the
    // minimum time for any of them.
    //
    double minTimeISPCTasks = 1e30;
    for (unsigned int i = 0; i < test_iterations[1]; i++) {
        ispc_memset(fimg, 0, sizeof(float) * width * height * 3);
        assert(NSUBSAMPLES == 2);
        reset_and_start_timer();
        ispc::ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
        double t = get_elapsed_msec();
        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", t);
        minTimeISPCTasks = std::min(minTimeISPCTasks, t);
    }

    // Report results and save image
    printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
           minTimeISPCTasks, width, height);
    savePPM("ao-ispc-tasks.ppm", width, height);

    // BUGFIX: img/fimg are allocated with new[], so they must be released
    // with delete[]; plain `delete` on an array is undefined behavior.
    delete[] img;
    delete[] fimg;

    return 0;
}

View File

@@ -0,0 +1,447 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/
#include "cuda_helpers.cuh"
#define NAO_SAMPLES 8
//#define M_PI 3.1415926535f
#define vec Float3
// Minimal three-component float vector used by the CUDA port of aobench;
// mirrors the ispc `float<3>` type (aliased to `vec` above).  All operators
// are device-side and componentwise.
struct Float3
{
    float x,y,z;
    // Componentwise vector addition.
    __device__ friend Float3 operator+(const Float3 a, const Float3 b)
    {
        Float3 c;
        c.x = a.x+b.x;
        c.y = a.y+b.y;
        c.z = a.z+b.z;
        return c;
    }
    // Componentwise vector subtraction.
    __device__ friend Float3 operator-(const Float3 a, const Float3 b)
    {
        Float3 c;
        c.x = a.x-b.x;
        c.y = a.y-b.y;
        c.z = a.z-b.z;
        return c;
    }
    // Componentwise vector division.
    __device__ friend Float3 operator/(const Float3 a, const Float3 b)
    {
        Float3 c;
        c.x = a.x/b.x;
        c.y = a.y/b.y;
        c.z = a.z/b.z;
        return c;
    }
    // Scalar divided by each component.
    __device__ friend Float3 operator/(const float a, const Float3 b)
    {
        Float3 c;
        c.x = a/b.x;
        c.y = a/b.y;
        c.z = a/b.z;
        return c;
    }
    // Componentwise (Hadamard) product.
    __device__ friend Float3 operator*(const Float3 a, const Float3 b)
    {
        Float3 c;
        c.x = a.x*b.x;
        c.y = a.y*b.y;
        c.z = a.z*b.z;
        return c;
    }
    // Vector scaled by a scalar.
    __device__ friend Float3 operator*(const Float3 a, const float b)
    {
        Float3 c;
        c.x = a.x*b;
        c.y = a.y*b;
        c.z = a.z*b;
        return c;
    }
};
///////////////////////////////////////////////////////////////////////////
// RNG stuff
struct RNGState {
unsigned int z1, z2, z3, z4;
};
// Advance the four-word RNG state and return the next 32-bit pseudo-random
// value as the xor-combination of four shift-register streams (looks like a
// combined Tausworthe-style generator -- NOTE(review): confirm the constants
// match the generator used by the ispc standard library).
__device__
static inline unsigned int random(RNGState * state)
{
    unsigned int b;

    b  = ((state->z1 << 6) ^ state->z1) >> 13;
    state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
    b  = ((state->z2 << 2) ^ state->z2) >> 27;
    state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
    b  = ((state->z3 << 13) ^ state->z3) >> 21;
    state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
    b  = ((state->z4 << 3) ^ state->z4) >> 12;
    state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
    return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
}
// Return a pseudo-random float uniformly distributed in [0, 1): the low 23
// random bits become the mantissa of a float with exponent 0 (a value in
// [1, 2)), and 1.0f is subtracted.
__device__
static inline float frandom(RNGState * state)
{
    unsigned int irand = random(state);
    irand &= (1ul<<23)-1;
    return __int_as_float(0x3F800000 | irand)-1.0f;
}
// Initialize the RNG state from a 32-bit seed.  The four stream words are
// derived by xor-ing and byte/halfword-swapping the seed so that the streams
// start out different from each other.
__device__
static inline void seed_rng(RNGState * state,
                            unsigned int seed) {
    state->z1 = seed;
    state->z2 = seed ^ 0xbeeff00d;
    state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16);
    state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) |
                 ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24);
}
// Result of a ray/primitive intersection test.
struct Isect {
    float t;   // ray parameter of the closest hit found so far
    vec p;     // hit position
    vec n;     // surface normal at the hit point
    int hit;   // nonzero once a hit has been recorded
};

struct Sphere {
    vec center;
    float radius;
};

// Infinite plane through point p with normal n.
struct Plane {
    vec p;
    vec n;
};

struct Ray {
    vec org;   // origin
    vec dir;   // direction
};
// Dot product of two 3-vectors.
__device__
static inline float dot(vec a, vec b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
// Cross product of two 3-vectors.
__device__
static inline vec vcross(vec v0, vec v1) {
    vec ret;
    ret.x = v0.y * v1.z - v0.z * v1.y;
    ret.y = v0.z * v1.x - v0.x * v1.z;
    ret.z = v0.x * v1.y - v0.y * v1.x;
    return ret;
}
// Normalize v in place using rsqrt() (CUDA reciprocal square root).
__device__
static inline void vnormalize(vec &v) {
    float len2 = dot(v, v);
    float invlen = rsqrt(len2);
    v = v*invlen;
}
// Intersect `ray` with an infinite plane.  If the hit is in front of the ray
// origin and closer than the current isect.t, record it in `isect`.
// (The disabled `#if 0` alternative -- which also contained a `1.0f-17`
// typo for `1.0e-17` -- has been removed; the live branch is unchanged.)
__device__
static inline void
ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

    // Ray is (nearly) parallel to the plane: no usable intersection.
    if (abs(v) <= 1.0e-17)
        return;

    float t = -(dot(ray.org, plane.n) + d) / v;
    if ((t > 0.0) && (t < isect.t)) {
        isect.t = t;
        isect.hit = 1;
        isect.p = ray.org + ray.dir * t;
        isect.n = plane.n;
    }
}
// Intersect `ray` with `sphere` using the standard quadratic-discriminant
// test and record the hit in `isect` if it is closer than the current
// isect.t.  (The disabled `#if 0` alternative has been removed; the live
// branch is unchanged.)
__device__
static inline void
ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) {
    vec rs = ray.org - sphere.center;
    float B = dot(rs, ray.dir);
    float C = dot(rs, rs) - sphere.radius * sphere.radius;
    float D = B * B - C;

    // Non-positive discriminant: the ray misses (or only grazes) the sphere.
    if (D <= 0.0f)
        return;

    float t = -B - sqrt(D);
    if ((t > 0.0) && (t < isect.t)) {
        isect.t = t;
        isect.hit = 1;
        isect.p = ray.org + ray.dir * t;
        isect.n = isect.p - sphere.center;
        vnormalize(isect.n);
    }
}
// Build an orthonormal basis {basis[0], basis[1], basis[2]} with
// basis[2] = n: pick a helper axis not too parallel to n, then complete the
// frame with two cross products.
__device__
static inline void
orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0f; basis[1].y = 0.0f; basis[1].z = 0.0f;

    // Choose the coordinate axis most orthogonal to n as the helper axis.
    if ((n.x < 0.6f) && (n.x > -0.6f)) {
        basis[1].x = 1.0f;
    } else if ((n.y < 0.6f) && (n.y > -0.6f)) {
        basis[1].y = 1.0f;
    } else if ((n.z < 0.6f) && (n.z > -0.6f)) {
        basis[1].z = 1.0f;
    } else {
        basis[1].x = 1.0f;
    }

    basis[0] = vcross(basis[1], basis[2]);
    vnormalize(basis[0]);

    basis[1] = vcross(basis[2], basis[0]);
    vnormalize(basis[1]);
}
// Estimate the ambient occlusion at the hit point in `isect` by tracing
// ntheta*nphi random rays over the hemisphere (built from an orthonormal
// basis around the surface normal) and counting how many are blocked by the
// scene.  Returns the unoccluded fraction in [0, 1].
__device__
static inline float
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
                  RNGState &rngstate) {
    float eps = 0.0001f;
    vec p; //, n;
    vec basis[3];
    float occlusion = 0.0f;

    // Offset the origin slightly along the normal to avoid self-intersection.
    p = isect.p + isect.n * eps;
    orthoBasis(basis, isect.n);

    const int ntheta = NAO_SAMPLES;
    const int nphi = NAO_SAMPLES;
    for ( int j = 0; j < ntheta; j++) {
        for ( int i = 0; i < nphi; i++) {
            Ray ray;
            Isect occIsect;

            // Random direction over the hemisphere, in the local basis.
            float theta = sqrt(frandom(&rngstate));
            float phi = 2.0f * M_PI * frandom(&rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrtf(1.0f - theta * theta);

            // local . global
            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;

            ray.org = p;
            ray.dir.x = rx;
            ray.dir.y = ry;
            ray.dir.z = rz;

            // BUGFIX: was `occIsect.t = 1.0f+17;`, which evaluates to 18.0f
            // and capped the occlusion ray length at 18 units; use the same
            // large sentinel as the primary rays and the ispc version.
            occIsect.t = 1.0e+17f;
            occIsect.hit = 0;

            for ( int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]);
            ray_plane_intersect (occIsect, ray, plane);

            if (occIsect.hit) occlusion += 1.0f;
        }
    }

    // Fraction of sample rays that escaped the scene.
    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
    return occlusion;
}
/* Compute the image for the scanlines from [y0,y1), for an overall image
   of width w and height h.
*/
// Renders the tile [x0,x1) x [y0,y1): each lane (programIndex) takes every
// programCount-th pixel of a row, shoots one primary ray per subsample, and
// writes the accumulated ambient-occlusion value (gray) into all three
// channels of image[].
__device__
static inline void ao_tiles(
    int x0, int x1,
    int y0, int y1,
    int w, int h,
    int nsubsamples,
    float image[])
{
    // Scene: one ground plane and three spheres.
    const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    const Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
        { { -0.5f, 0.0f, -3.0f }, 0.5f },
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;
    // Per-lane seed so each lane draws a different random sequence.
    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));

    float invSamples = 1.f / nsubsamples;

    for ( int y = y0; y < y1; y++)
        for ( int x = programIndex+x0; x < x1; x += programCount)
        {
            const int offset = 3 * (y * w + x);
            float res = 0.0f;
            for ( int u = 0; u < nsubsamples; u++)
                for ( int v = 0; v < nsubsamples; v++)
                {
                    float du = (float)u * invSamples, dv = (float)v * invSamples;

                    // Figure out x,y pixel in NDC
                    float px = (x + du - (w / 2.0f)) / (w / 2.0f);
                    float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);

                    float ret = 0.f;
                    Ray ray;
                    Isect isect;

                    ray.org.x = 0.0f;
                    ray.org.y = 0.0f;
                    ray.org.z = 0.0f;

                    // Poor man's perspective projection
                    ray.dir.x = px;
                    ray.dir.y = py;
                    ray.dir.z = -1.0;
                    vnormalize(ray.dir);

                    isect.t = 1.0e+17;
                    isect.hit = 0;

                    for ( int snum = 0; snum < 3; ++snum)
                        ray_sphere_intersect(isect, ray, spheres[snum]);
                    ray_plane_intersect(isect, ray, plane);

                    // Note use of 'coherent' if statement; the set of rays we
                    // trace will often all hit or all miss the scene
                    if (any(isect.hit)) {
                        ret = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate);
                        ret *= invSamples * invSamples;
                        res += ret;
                    }
                }
            // NOTE(review): the loop condition already guarantees x < x1 here;
            // this guard appears redundant -- confirm against the ispc original.
            if (x < x1)
            {
                image[offset  ] = res;
                image[offset+1] = res;
                image[offset+2] = res;
            }
        }
}
// Tile dimensions processed by one task.
#define TILEX 64
#define TILEY 4

// Kernel: one launched task renders one TILEX x TILEY tile of the image;
// taskIndex0/taskIndex1 select the tile, and tasks beyond the tile grid
// return immediately.
extern "C"
__global__
void ao_task( int width, int height,
              int nsubsamples, float image[])
{
    if (taskIndex0 >= taskCount0) return;
    if (taskIndex1 >= taskCount1) return;
    const int x0 = taskIndex0 * TILEX;
    const int x1 = min(x0 + TILEX, width);
    const int y0 = taskIndex1 * TILEY;
    const int y1 = min(y0 + TILEY, height);
    ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image);
}
// Device-side equivalent of the ispc `export` function: launches a 2D grid
// of ao_task tiles covering the whole image (CUDA Dynamic Parallelism) and
// waits for them to finish.
extern "C"
__global__
void ao_ispc_tasks___export(
    int w, int h, int nsubsamples,
    float image[])
{
    const int ntilex = (w+TILEX-1)/TILEX;
    const int ntiley = (h+TILEY-1)/TILEY;
    launch(ntilex,ntiley,1,ao_task)(w,h,nsubsamples,image);
    cudaDeviceSynchronize();
}
// Host entry point matching the ispc-generated interface: runs the export
// kernel as a single 32-thread block (one "gang") and blocks until the GPU
// has finished.
extern "C"
__host__ void ao_ispc_tasks(
    int w, int h, int nsubsamples,
    float image[])
{
    ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,340 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/
#define NAO_SAMPLES 8
#define M_PI 3.1415926535f
typedef float<3> vec;
#if 1
#define __inline inline
#else
#define __inline
#endif
// Result of a ray/primitive intersection test.
struct Isect {
    float t;   // ray parameter of the closest hit found so far
    vec p;     // hit position
    vec n;     // surface normal at the hit point
    int hit;   // nonzero once a hit has been recorded
};

struct Sphere {
    vec center;
    float radius;
};

// Infinite plane through point p with normal n.
struct Plane {
    vec p;
    vec n;
};

struct Ray {
    vec org;   // origin
    vec dir;   // direction
};
// Dot product of two 3-vectors.
static inline float dot(vec a, vec b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
// Cross product of two 3-vectors.
static inline vec vcross(vec v0, vec v1) {
    vec ret;
    ret.x = v0.y * v1.z - v0.z * v1.y;
    ret.y = v0.z * v1.x - v0.x * v1.z;
    ret.z = v0.x * v1.y - v0.y * v1.x;
    return ret;
}
// Normalize v in place using the reciprocal square root.
static inline void vnormalize(vec &v) {
    float len2 = dot(v, v);
    float invlen = rsqrt(len2);
    v *= invlen;
}
// Intersect `ray` with an infinite plane; record the hit in `isect` when it
// is in front of the ray origin and closer than the current isect.t.
// (The disabled `#if 0` alternative has been removed; the live branch is
// unchanged.)
__inline
static void
ray_plane_intersect(Isect &isect, Ray &ray, const Plane &plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

    // 'coherent' if: the rays in a gang usually agree on (near-)parallelism.
    cif (abs(v) <= 1.0e-17)
        return;

    float t = -(dot(ray.org, plane.n) + d) / v;
    cif ((t > 0.0) && (t < isect.t)) {
        isect.t = t;
        isect.hit = 1;
        isect.p = ray.org + ray.dir * t;
        isect.n = plane.n;
    }
}
// Intersect `ray` with `sphere` via the quadratic-discriminant test and
// record the hit in `isect` when it is closer than the current isect.t.
// (The disabled `#if 0` alternative has been removed; the live branch is
// unchanged.)
static inline void
ray_sphere_intersect(Isect &isect, Ray &ray, const Sphere &sphere) {
    vec rs = ray.org - sphere.center;
    float B = dot(rs, ray.dir);
    float C = dot(rs, rs) - sphere.radius * sphere.radius;
    float D = B * B - C;

    // Non-positive discriminant: the ray misses (or only grazes) the sphere.
    cif (D <= 0.0f)
        return;

    float t = -B - sqrt(D);
    cif ((t > 0.0) && (t < isect.t)) {
        isect.t = t;
        isect.hit = 1;
        isect.p = ray.org + t * ray.dir;
        isect.n = isect.p - sphere.center;
        vnormalize(isect.n);
    }
}
// Build an orthonormal basis {basis[0], basis[1], basis[2]} with
// basis[2] = n: pick a helper axis not too parallel to n, then complete the
// frame with two cross products.
__inline
static void
orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;

    // Choose the coordinate axis most orthogonal to n as the helper axis.
    if ((n.x < 0.6) && (n.x > -0.6)) {
        basis[1].x = 1.0;
    } else if ((n.y < 0.6) && (n.y > -0.6)) {
        basis[1].y = 1.0;
    } else if ((n.z < 0.6) && (n.z > -0.6)) {
        basis[1].z = 1.0;
    } else {
        basis[1].x = 1.0;
    }

    basis[0] = vcross(basis[1], basis[2]);
    vnormalize(basis[0]);

    basis[1] = vcross(basis[2], basis[0]);
    vnormalize(basis[1]);
}
// Estimate the ambient occlusion at the hit point in `isect` by tracing
// ntheta*nphi random rays over the hemisphere around the surface normal and
// counting how many are blocked by the scene.  Returns the unoccluded
// fraction in [0, 1].
__inline
static float
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
                  RNGState &rngstate) {
    float eps = 0.0001f;
    vec p;          // was `vec p, n;` -- the local `n` was never used
    vec basis[3];
    float occlusion = 0.0;

    // Offset the origin slightly along the normal to avoid self-intersection.
    p = isect.p + eps * isect.n;
    orthoBasis(basis, isect.n);

    static const uniform int ntheta = NAO_SAMPLES;
    static const uniform int nphi = NAO_SAMPLES;
    for (uniform int j = 0; j < ntheta; j++) {
        for (uniform int i = 0; i < nphi; i++) {
            Ray ray;
            Isect occIsect;

            // Random direction over the hemisphere, in the local basis.
            float theta = sqrt(frandom(&rngstate));
            float phi = 2.0f * M_PI * frandom(&rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrt(1.0 - theta * theta);

            // local . global
            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;

            ray.org = p;
            ray.dir.x = rx;
            ray.dir.y = ry;
            ray.dir.z = rz;

            occIsect.t = 1.0e+17;
            occIsect.hit = 0;

            for (uniform int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]);
            ray_plane_intersect (occIsect, ray, plane);

            if (occIsect.hit) occlusion += 1.0;
        }
    }

    // Fraction of sample rays that escaped the scene.
    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
    return occlusion;
}
// Render the tile [x0,x1) x [y0,y1) of a w x h image: one primary ray per
// subsample per pixel, ambient occlusion evaluated at the first hit, and the
// gray result written to all three channels of image[].
static inline void ao_tiles(
    uniform int x0, uniform int x1,
    uniform int y0, uniform int y1,
    uniform int w, uniform int h,
    uniform int nsubsamples,
    uniform float image[])
{
    // Scene: one ground plane and three spheres.
    const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    const Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
        { { -0.5f, 0.0f, -3.0f }, 0.5f },
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;
    // Per-program-instance seed so lanes draw different random sequences.
    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));

    float invSamples = 1.f / nsubsamples;

    foreach_tiled (y = y0 ... y1, x = x0 ... x1)
    {
        const int offset = 3 * (y * w + x);
        float res = 0.0f;
        for (uniform int u = 0; u < nsubsamples; u++)
            for (uniform int v = 0; v < nsubsamples; v++)
            {
                float du = (float)u * invSamples, dv = (float)v * invSamples;

                // Figure out x,y pixel in NDC
                float px = (x + du - (w / 2.0f)) / (w / 2.0f);
                float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);

                float ret = 0.f;
                Ray ray;
                Isect isect;

                ray.org = 0.f;

                // Poor man's perspective projection
                ray.dir.x = px;
                ray.dir.y = py;
                ray.dir.z = -1.0;
                vnormalize(ray.dir);

                isect.t = 1.0e+17;
                isect.hit = 0;

                for (uniform int snum = 0; snum < 3; ++snum)
                    ray_sphere_intersect(isect, ray, spheres[snum]);
                ray_plane_intersect(isect, ray, plane);

                // Note use of 'coherent' if statement; the set of rays we
                // trace will often all hit or all miss the scene
#if 0
                cif (isect.hit) {
                    ret = ambient_occlusion(isect, plane, spheres, rngstate);
                    ret *= invSamples * invSamples;
                    res += ret;
                }
#else
                // any() + multiplying by isect.hit (0/1) keeps the whole gang
                // on the same path while zeroing out the misses.
                if(any(isect.hit))
                {
                    ret = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate);
                    ret *= invSamples * invSamples;
                    res += ret;
                }
#endif
            }
        image[offset  ] = res;
        image[offset+1] = res;
        image[offset+2] = res;
    }
}
// Tile dimensions processed by one task.
#define TILEX max(64,programCount*2)
#define TILEY 4

// Serial (non-tasking) entry point: render the whole image as one tile.
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    const uniform int x0 = 0;
    const uniform int x1 = w;
    const uniform int y0 = 0;
    const uniform int y1 = h;
    ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image);
}
// One task renders one TILEX x TILEY tile of the image; tasks outside the
// tile grid (taskIndex >= taskCount) return immediately.
void task ao_task(uniform int width, uniform int height,
                  uniform int nsubsamples, uniform float image[])
{
    if (taskIndex0 >= taskCount0) return;
    if (taskIndex1 >= taskCount1) return;
    const uniform int x0 = taskIndex0 * TILEX;
    const uniform int x1 = min(x0 + TILEX, width);
    const uniform int y0 = taskIndex1 * TILEY;
    const uniform int y1 = min(y0 + TILEY, height);
    ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image);
}
// Tasking entry point: launch a 2D grid of ao_task tiles covering the image
// and wait for all of them to complete.
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
                          uniform float image[])
{
    const uniform int ntilex = (w+TILEX-1)/TILEX;
    const uniform int ntiley = (h+TILEY-1)/TILEY;
    launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image);
    sync;
}

View File

@@ -0,0 +1,122 @@
TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp
TASK_LIB=-lpthread
TASK_OBJ=objs/omp_tasksys.o objs/ispc_malloc.o
CXX=clang++
CXX=icc -openmp
CXXFLAGS+=-Iobjs/ -O2 -I../../ -I../../util
CXXFLAGS+=-DISPC_USE_OMP
CC=clang
CC=icc -openmp
CCFLAGS+=-Iobjs/ -O2 -I../../ -I../../util
CCFLAGS+=-DISPC_USE_OMP
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc
ISPC_FLAGS+=-O2
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)
ifeq ($(ARCH),x86)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o)
COMMA=,
ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS)))
#$(info multi-target detected: $(ISPC_IA_TARGETS))
ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o)
endif
ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o)
endif
ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o)
endif
ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o)
endif
ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o)
endif
endif
ISPC_TARGETS=$(ISPC_IA_TARGETS)
ARCH_BIT:=$(shell getconf LONG_BIT)
ifeq ($(ARCH_BIT),32)
ISPC_FLAGS += --arch=x86
CXXFLAGS += -m32
CCFLAGS += -m32
else
ISPC_FLAGS += --arch=x86-64
CXXFLAGS += -m64
CCFLAGS += -m64
endif
else ifeq ($(ARCH),arm)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o))
ISPC_TARGETS=$(ISPC_ARM_TARGETS)
else
$(error Unknown architecture $(ARCH) from uname -m)
endif
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
default: $(EXAMPLE)
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
objs/%.cpp objs/%.o objs/%.h: dirs
clean:
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test
$(EXAMPLE): $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/%.o: %.cpp dirs $(ISPC_HEADER)
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: %.c dirs $(ISPC_HEADER)
$(CC) $< $(CCFLAGS) -c -o $@
objs/%.o: ../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../../util/%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc dirs
$(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h
objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
$(CXX) -I../../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@
$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h
objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
$(CXX) -I../../intrinsics $< $(CXXFLAGS) -c -o $@
$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1
$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

View File

@@ -0,0 +1,52 @@
# Shared build rules for the Intel Xeon Phi (KNC) examples, built with
# icc -mmic and an OpenMP-based task system.
TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp
TASK_OBJ=objs_knc/omp_tasksys.o objs_knc/ispc_malloc.o
TASK_LIB=-openmp
CXX=icc -openmp -mmic
CXXFLAGS+=-Iobjs_knc/ -O2 -I../../ -I../../util -I./
CXXFLAGS+= -DISPC_USE_OMP
CC=icc -openmp -mmic
CCFLAGS+= -Iobjs_knc/ -O2 -I../../ -I../../util -I./
CCFLAGS+=-DISPC_USE_OMP
LD=icc -mmic -openmp
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc
ISPC_FLAGS+=-O2
ISPC_FLAGS+= --target=$(ISPC_TARGET) --c++-include-file=$(ISPC_INTRINSICS)
ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_knc/%_ispc.h)
ISPC_OBJ=$(ISPC_SRC:%.ispc=objs_knc/%_ispc.o)
CXX_OBJ=$(CXX_SRC:%.cpp=objs_knc/%.o)
CXX_OBJ+=$(TASK_OBJ)
PROG=$(EXAMPLE)_knc
all: dirs $(PROG)
dirs:
	/bin/mkdir -p objs_knc/
objs_knc/%.cpp objs_knc/%.o objs_knc/%.h: dirs
clean:
	/bin/rm -rf $(PROG) objs_knc
# Link rule: pass $(LIBS) as well -- it was defined above but never
# referenced, so -lm / $(TASK_LIB) / -lstdc++ never reached the link line.
$(PROG): $(ISPC_OBJ) $(CXX_OBJ)
	$(LD) -o $@ $^ $(LDFLAGS) $(LIBS)
objs_knc/%.o: %.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
objs_knc/%.o: ../%.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
objs_knc/%.o: ../../%.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
objs_knc/%.o: ../../util/%.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
# ispc -> C++ (zmm intrinsics) -> object, via the icc cross compiler.
objs_knc/%_ispc.o: %.ispc
	$(ISPC) $(ISPC_FLAGS) --emit-c++ -o objs_knc/$*_ispc_zmm.cpp -h objs_knc/$*_ispc.h $<
	$(CXX) $(CXXFLAGS) -o $@ objs_knc/$*_ispc_zmm.cpp -c

View File

@@ -0,0 +1,136 @@
# CUDA-side helper sources compiled with nvcc.
NVCC_SRC=../../util/nvcc_helpers.cu
NVCC_OBJS=objs_ptx/nvcc_helpers_nvcc.o
#
# Host compiler for the C++ driver code.
CXX=g++ -ffast-math
CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_ptx/ -D_CUDA_ -I../../util -I../../
#
# Device compiler; sm_35 is required for the dynamic-parallelism runtime.
NVCC=nvcc
NVCC_FLAGS+=-O3 -arch=sm_35 -D_CUDA_ -I../../util -Xptxas=-v -Iobjs_ptx/
ifdef PTXCC_REGMAX
NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX)
endif
NVCC_FLAGS+=--use_fast_math
#
LD=nvcc
LDFLAGS=-lcudart -lcudadevrt -arch=sm_35
#
# ptxcc wraps nvcc to turn a .ptx file into a linkable object.
PTXCC=$(ISPC_HOME)/ptxtools/ptxcc
PTXCC_FLAGS+= -Xptxas=-v
ifdef PTXCC_REGMAX
PTXCC_FLAGS += -maxrregcount=$(PTXCC_REGMAX)
endif
#
ISPC=$(ISPC_HOME)/ispc
ISPC_FLAGS+=-O3 --math-lib=fast --target=nvptx --opt=fast-math
#
#
#
# Derived file lists for the two PTX generation paths (llc vs. NVVM).
ISPC_LLVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.o)
ISPC_NVVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.o)
#ISPC_BCS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.bc)
ISPC_LLS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.ll)
ISPC_LLVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.ptx)
ISPC_NVVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.ptx)
ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.h)
CXX_OBJS=$(CXX_SRC:%.cpp=objs_ptx/%_gcc.o)
CU_OBJS=$(CU_SRC:%.cu=objs_ptx/%_cu.o)
#NVCC_OBJS=$(NVCC_SRC:%.cu=objs_ptx/%_nvcc.o)
# ispc_malloc is always compiled into the host side.
CXX_SRC+=ispc_malloc.cpp
CXX_OBJS+=objs_ptx/ispc_malloc_gcc.o
PTXGEN = $(ISPC_HOME)/ptxtools/ptxgen
PTXGEN += --use_fast_math
#LLVM32=$(HOME)/usr/local/llvm/bin-3.2
#LLVM32DIS=$(LLVM32)/bin/llvm-dis
# llc from the LLVM used to build ispc; emits PTX from LLVM IR.
LLC=$(LLVM_ROOT)/bin/llc
LLC_FLAGS=-march=nvptx64 -mcpu=sm_35
# .SUFFIXES: .bc .o .cu .ll
# Select which GPU code paths get built: LLVM_GPU generates PTX with llc
# from ISPC's LLVM IR; NVVM_GPU uses the ptxgen (libNVVM) path.
ifdef LLVM_GPU
OBJSptx_llvm=$(ISPC_LLVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
PROGptx_llvm=$(PROG)_llvm_ptx
else
ISPC_LLVM_PTX=
endif
ifdef NVVM_GPU
# NOTE: dropped a reference to the misspelled $(ISPC_LVVM_PTX) (undefined,
# hence always empty); the .ptx files are produced as prerequisites of the
# %_ispc.o pattern rule, so they do not belong in the link object list.
OBJSptx_nvvm=$(ISPC_NVVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
PROGptx_nvvm=$(PROG)_nvvm_ptx
else
ISPC_NVVM_PTX=
endif
ifdef CU_SRC
OBJScu=$(CU_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
PROGcu=$(PROG)_cu
endif
all: dirs \
	$(PROGptx_nvvm) \
	$(PROGptx_llvm) \
	$(PROGcu) $(ISPC_BCS) $(ISPC_LLS) $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX)
dirs:
	/bin/mkdir -p objs_ptx/
objs_ptx/%.cpp objs_ptx/%.o objs_ptx/%.h: dirs
clean:
	/bin/rm -rf $(PROGptx_nvvm) $(PROGptx_llvm) $(PROGcu) objs_ptx
# generate binaries
$(PROGptx_llvm): $(OBJSptx_llvm)
	$(LD) -o $@ $^ $(LDFLAGS)
$(PROGptx_nvvm): $(OBJSptx_nvvm)
	$(LD) -o $@ $^ $(LDFLAGS)
$(PROGcu): $(OBJScu)
	$(LD) -o $@ $^ $(LDFLAGS)
# compile C++ code
objs_ptx/%_gcc.o: %.cpp $(ISPC_HEADERS)
	$(CXX) $(CXXFLAGS) -o $@ -c $<
objs_ptx/%_gcc.o: ../../util/%.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
# CUDA helpers (-dc: relocatable device code for device-side linking)
objs_ptx/%_cu.o: %.cu $(ISPC_HEADERS)
	$(NVCC) $(NVCC_FLAGS) -o $@ -dc $<
# compile CUDA code
objs_ptx/%_nvcc.o: ../../util/%.cu
	$(NVCC) $(NVCC_FLAGS) -o $@ -c $<
objs_ptx/%_nvcc.o: %.cu
	$(NVCC) $(NVCC_FLAGS) -o $@ -c $<
# compile ISPC to LLVM BC
#objs_ptx/%_ispc.h objs_ptx/%_ispc.bc: %.ispc
#	$(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.bc $<
objs_ptx/%_ispc.h objs_ptx/%_ispc.ll: %.ispc
	$(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.ll $<
# generate PTX from LLVM BC
#objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.bc
#	$(LLC) $(LLC_FLAGS) -o $@ $<
objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.ll
	$(LLC) $(LLC_FLAGS) -o $@ $<
#objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.bc
#	$(LLVM32DIS) $< -o objs_ptx/$*_ispc-ll32.ll
#	$(PTXGEN) objs_ptx/$*_ispc-ll32.ll -o $@
objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.ll
	$(PTXGEN) $< -o $@
# generate an object file from PTX
objs_ptx/%_ispc.o: objs_ptx/%_ispc.ptx
	$(PTXCC) $< -Xnvcc="$(PTXCC_FLAGS)" -o $@

View File

@@ -0,0 +1,10 @@
# Per-example configuration for the CPU build of deferred_shading;
# shared rules live in ../common_cpu.mk.
EXAMPLE=deferred_shading
CPP_SRC=common.cpp main.cpp dynamic_c.cpp
# CPP_SRC+=dynamic_cilk.cpp
ISPC_SRC=kernels.ispc
# Target ISAs for x86 and ARM builds respectively.
ISPC_IA_TARGETS=avx1-i32x16
ISPC_ARM_TARGETS=neon
ISPC_FLAGS=--opt=fast-math
include ../common_cpu.mk

View File

@@ -0,0 +1,8 @@
# Per-example configuration for the KNC (Xeon Phi) build of deferred_shading;
# shared rules live in ../common_knc.mk.
EXAMPLE=deferred_shading
CXX_SRC=common.cpp main.cpp dynamic_c.cpp
ISPC_SRC=kernels.ispc
# generic-16 C++ backend with KNC intrinsics include file.
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
ISPC_FLAGS=--opt=fast-math
include ../common_knc.mk

View File

@@ -0,0 +1,13 @@
# Per-example configuration for the PTX/CUDA build of deferred_shading;
# shared rules live in ../common_ptx.mk.
PROG=deferred_shading
ISPC_SRC=kernels.ispc
CU_SRC=kernels.cu
CXX_SRC=common.cpp main.cpp
# Cap register usage per thread; forwarded to ptxas.
PTXCC_REGMAX=64
# Build the NVVM (ptxgen) path; the llc path is disabled by default.
NVVM_GPU=1
#LLVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,222 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#include <fcntl.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <stdint.h>
#include <algorithm>
#include <assert.h>
#include <vector>
#ifdef ISPC_IS_WINDOWS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif
#include "deferred.h"
#include "timing.h"
#include "ispc_malloc.h"
///////////////////////////////////////////////////////////////////////////
// Allocate 'size' bytes aligned to 'alignment' bytes using the appropriate
// platform facility.  In CUDA builds (_CUDA_) the request is forwarded to
// ispc_malloc() and 'alignment' is ignored.  Release with lAlignedFree().
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#ifndef _CUDA_
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // No memalign on OS X: over-allocate, round up to the alignment, and
    // stash the original base pointer just before the returned address so
    // lAlignedFree() can recover it.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    char *amem = ((char*)mem) + sizeof(void*);
    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
                                        (alignment - 1)));
    ((void**)amem)[-1] = mem;
    return amem;
#endif
#else
    void *ptr;
    ispc_malloc(&ptr, size);
    return ptr;
#endif
}
// Release memory obtained from lAlignedMalloc(), dispatching to the
// deallocator that matches the platform-specific allocation above.
static void
lAlignedFree(void *ptr) {
#ifndef _CUDA_
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    // The original malloc() base pointer was stored just before 'ptr'.
    free(((void**)ptr)[-1]);
#endif
#else
    ispc_free(ptr);
#endif
}
// Allocate one aligned 8-bit plane per color channel (planar/SoA layout).
Framebuffer::Framebuffer(int width, int height) {
    const int pixelCount = width * height;
    nPixels = pixelCount;
    r = (uint8_t *)lAlignedMalloc(pixelCount, ALIGNMENT_BYTES);
    g = (uint8_t *)lAlignedMalloc(pixelCount, ALIGNMENT_BYTES);
    b = (uint8_t *)lAlignedMalloc(pixelCount, ALIGNMENT_BYTES);
}
// Release the three channel planes allocated in the constructor.
Framebuffer::~Framebuffer() {
    uint8_t *planes[] = { r, g, b };
    for (int i = 0; i < 3; ++i)
        lAlignedFree(planes[i]);
}
// Zero all three channel planes.
void
Framebuffer::clear() {
    uint8_t *planes[] = { r, g, b };
    for (int i = 0; i < 3; ++i)
        memset(planes[i], 0, nPixels);
}
// Load a serialized InputData (fixed-size header followed by one data
// chunk) from 'path' and fix up the SoA array pointers to point into the
// chunk.  Returns NULL on open/read failure.  On success the caller owns
// the result and must release it with DeleteInputData().
InputData *
CreateInputDataFromFile(const char *path) {
    FILE *in = fopen(path, "rb");
    if (!in) return 0;
    InputData *input = new InputData;
    input->chunk = NULL;
    // Load header
    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        // Original code leaked both the stream and the InputData here.
        fclose(in);
        delete input;
        return NULL;
    }
    // Load data chunk and update pointers
    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
                                             ALIGNMENT_BYTES);
    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        fclose(in);
        lAlignedFree(input->chunk);
        delete input;
        return NULL;
    }
    input->arrays.zBuffer =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
    input->arrays.normalEncoded_x =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
    input->arrays.normalEncoded_y =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
    input->arrays.specularAmount =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
    input->arrays.specularPower =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
    input->arrays.albedo_x =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
    input->arrays.albedo_y =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
    input->arrays.albedo_z =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
    input->arrays.lightPositionView_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
    input->arrays.lightPositionView_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
    input->arrays.lightPositionView_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
    input->arrays.lightAttenuationBegin =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
    input->arrays.lightColor_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
    input->arrays.lightColor_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
    input->arrays.lightColor_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
    input->arrays.lightAttenuationEnd =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];
    fclose(in);
    return input;
}
// Release an InputData previously returned by CreateInputDataFromFile().
// Frees the data chunk AND the InputData object itself (the original code
// leaked the object allocated with 'new').  Safe to call with NULL.
void DeleteInputData(InputData *input) {
    if (!input)
        return;
    lAlignedFree(input->chunk);
    delete input;
}
// Deswizzle the planar framebuffer into interleaved RGB and write it out
// as a binary PPM (P6) image.  Doesn't need to be fast -- only happens once.
void WriteFrame(const char *filename, const InputData *input,
                const Framebuffer &framebuffer) {
    size_t imageBytes = 3 * input->header.framebufferWidth *
        input->header.framebufferHeight;
    uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
    memset(framebufferAOS, 0, imageBytes);
    for (int i = 0; i < input->header.framebufferWidth *
             input->header.framebufferHeight; ++i) {
        framebufferAOS[3 * i + 0] = framebuffer.r[i];
        framebufferAOS[3 * i + 1] = framebuffer.g[i];
        framebufferAOS[3 * i + 2] = framebuffer.b[i];
    }
    // Write out simple PPM file
    FILE *out = fopen(filename, "wb");
    if (!out) {
        // Original code dereferenced a NULL FILE* if the open failed.
        fprintf(stderr, "Failed to open \"%s\" for writing\n", filename);
        lAlignedFree(framebufferAOS);
        return;
    }
    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
            input->header.framebufferHeight);
    fwrite(framebufferAOS, imageBytes, 1, out);
    fclose(out);
    lAlignedFree(framebufferAOS);
}

View File

@@ -0,0 +1 @@
../../deferred/data

View File

@@ -0,0 +1,108 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DEFERRED_H
#define DEFERRED_H
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
#define MIN_TILE_WIDTH 64
#define MIN_TILE_HEIGHT 16
#define MAX_LIGHTS 1024
// Indices into InputHeader::inputDataArrayOffsets identifying where each
// SoA input array begins inside the serialized data chunk (see
// CreateInputDataFromFile for the corresponding pointer types).
enum InputDataArraysEnum {
    idaZBuffer = 0,             // float, per pixel
    idaNormalEncoded_x,         // uint16 (half float), per pixel
    idaNormalEncoded_y,         // uint16 (half float), per pixel
    idaSpecularAmount,          // uint16 (half float), per pixel
    idaSpecularPower,           // uint16 (half float), per pixel
    idaAlbedo_x,                // uint8 (unorm), per pixel
    idaAlbedo_y,                // uint8 (unorm), per pixel
    idaAlbedo_z,                // uint8 (unorm), per pixel
    idaLightPositionView_x,     // float, per light
    idaLightPositionView_y,     // float, per light
    idaLightPositionView_z,     // float, per light
    idaLightAttenuationBegin,   // float, per light
    idaLightColor_x,            // float, per light
    idaLightColor_y,            // float, per light
    idaLightColor_z,            // float, per light
    idaLightAttenuationEnd,     // float, per light
    idaNum                      // count of array ids
};
#ifndef ISPC
#include <stdint.h>
#include "kernels_ispc.h"
#define ALIGNMENT_BYTES 64
#define MAX_LIGHTS 1024
#define VISUALIZE_LIGHT_COUNT 0
// Deserialized scene input: fixed-size header, a table of pointers into
// the raw data chunk, and the chunk itself (one aligned allocation).
struct InputData
{
    ispc::InputHeader header;
    ispc::InputDataArrays arrays;  // pointers into 'chunk'; set by CreateInputDataFromFile
    uint8_t *chunk;                // owned; released by DeleteInputData
};
// Planar (SoA) RGB framebuffer; each channel is a separately allocated
// 8-bit plane of nPixels entries.
struct Framebuffer {
    Framebuffer(int width, int height);
    ~Framebuffer();

    // Zero all three channel planes.
    void clear();

    uint8_t *r, *g, *b;

private:
    int nPixels;
    // Non-copyable: each instance owns its channel allocations.  The
    // original copy-assignment declaration took 'const Framebuffer *',
    // which did NOT suppress the compiler-generated copy assignment
    // operator (risking double frees on assignment); declare the proper
    // reference overload instead.
    Framebuffer(const Framebuffer &);
    Framebuffer &operator=(const Framebuffer &);
};
// Load a serialized scene from disk; returns NULL on failure.
InputData *CreateInputDataFromFile(const char *path);
// Release an InputData returned by CreateInputDataFromFile.
void DeleteInputData(InputData *input);
// Write the shaded framebuffer out as a binary PPM (P6) image.
void WriteFrame(const char *filename, const InputData *input,
                const Framebuffer &framebuffer);
// One-time setup for the dynamic tile-subdivision shading paths.
void InitDynamicC(InputData *input);
void InitDynamicCilk(InputData *input);
// Shade a full frame using dynamic tile subdivision (serial C / Cilk).
void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
#endif // !ISPC
#endif // DEFERRED_H

View File

@@ -0,0 +1,874 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "deferred.h"
#include "kernels_ispc.h"
#include <algorithm>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#ifdef _MSC_VER
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif // ISPC_IS_LINUX
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
#ifndef MIN_TILE_WIDTH
#define MIN_TILE_WIDTH 16
#endif
#ifndef MIN_TILE_HEIGHT
#define MIN_TILE_HEIGHT 16
#endif
#define DYNAMIC_TREE_LEVELS 5
// If this is set to 1 then the result will be identical to the static version
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
// Allocate 'size' bytes aligned to 'alignment' bytes using the platform's
// aligned allocator (duplicate of the helper in common.cpp, minus the
// CUDA branch).  Release with lAlignedFree().
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // No memalign on OS X: over-allocate, round up to the alignment, and
    // stash the original base pointer just before the returned address.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    char *amem = ((char*)mem) + sizeof(void*);
    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
                                        (alignment - 1)));
    ((void**)amem)[-1] = mem;
    return amem;
#endif
}
// Release memory obtained from lAlignedMalloc() above.
static void
lAlignedFree(void *ptr) {
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    // Recover the original malloc() base pointer stored just before 'ptr'.
    free(((void**)ptr)[-1]);
#endif
}
// Scan one screen tile of the depth buffer and return the min/max
// view-space Z over its pixels, ignoring samples that unproject outside
// [cameraNear, cameraFar) (skybox/background or otherwise invalid).
static void
ComputeZBounds(int tileStartX, int tileEndX,
               int tileStartY, int tileEndY,
               // G-buffer data
               float zBuffer[],
               int gBufferWidth,
               // Camera data
               float cameraProj_33, float cameraProj_43,
               float cameraNear, float cameraFar,
               // Output
               float *minZ, float *maxZ)
{
    float zLo = cameraFar;   // running minimum
    float zHi = cameraNear;  // running maximum
    for (int py = tileStartY; py < tileEndY; ++py) {
        const float *row = &zBuffer[py * gBufferWidth];
        for (int px = tileStartX; px < tileEndX; ++px) {
            // Unproject the depth-buffer sample into view space.
            float viewZ = cameraProj_43 / (row[px] - cameraProj_33);
            // Only pixels inside the valid depth range contribute.
            if (viewZ >= cameraNear && viewZ < cameraFar) {
                if (viewZ < zLo) zLo = viewZ;
                if (viewZ > zHi) zHi = viewZ;
            }
        }
    }
    *minZ = zLo;
    *maxZ = zHi;
}
// Compute per-tile Z bounds for one full row of tiles, writing one
// (min, max) pair per tile into minZArray/maxZArray.
static void
ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
                  int numTilesX, int numTilesY,
                  // G-buffer data
                  float zBuffer[],
                  int gBufferWidth,
                  // Camera data
                  float cameraProj_33, float cameraProj_43,
                  float cameraNear, float cameraFar,
                  // Output
                  float minZArray[],
                  float maxZArray[])
{
    const int y0 = tileY * tileHeight;
    const int y1 = y0 + tileHeight;
    for (int tx = 0; tx < numTilesX; ++tx) {
        const int x0 = tx * tileWidth;
        ComputeZBounds(x0, x0 + tileWidth, y0, y1,
                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
                       cameraNear, cameraFar,
                       &minZArray[tx], &maxZArray[tx]);
    }
}
// Hierarchical min/max view-space-Z pyramid over screen tiles.  Level 0
// holds per-tile Z bounds at (mTileWidth x mTileHeight) pixel granularity;
// each higher level halves the tile count in both dimensions, combining
// the bounds of up to 2x2 children.  Used to cull lights against
// progressively larger screen regions.
class MinMaxZTree
{
public:
    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
    // Levels must be small enough that neither dimension goes below one tile
    MinMaxZTree(
        int tileWidth, int tileHeight, int levels,
        int gBufferWidth, int gBufferHeight)
        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
    {
        mNumTilesX = gBufferWidth / mTileWidth;
        mNumTilesY = gBufferHeight / mTileHeight;
        // Allocate arrays
        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        for (int i = 0; i < mLevels; ++i) {
            int x = NumTilesX(i);
            int y = NumTilesY(i);
            assert(x > 0);
            assert(y > 0);
            // NOTE: If the following two asserts fire it probably means that
            // the base tile dimensions do not evenly divide the G-buffer dimensions
            assert(x * (mTileWidth << i) >= gBufferWidth);
            assert(y * (mTileHeight << i) >= gBufferHeight);
            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
        }
    }

    // Recompute the whole pyramid from the current depth buffer: level 0
    // directly from pixels, then each level from the one below it.
    void Update(float *zBuffer, int gBufferPitchInElements,
                float cameraProj_33, float cameraProj_43,
                float cameraNear, float cameraFar)
    {
        for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
            ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
                              zBuffer, gBufferPitchInElements,
                              cameraProj_33, cameraProj_43, cameraNear, cameraFar,
                              mMinZArrays[0] + (tileY * mNumTilesX),
                              mMaxZArrays[0] + (tileY * mNumTilesX));
        }
        // Generate other levels
        for (int level = 1; level < mLevels; ++level) {
            int destTilesX = NumTilesX(level);
            int destTilesY = NumTilesY(level);
            int srcLevel = level - 1;
            int srcTilesX = NumTilesX(srcLevel);
            int srcTilesY = NumTilesY(srcLevel);
            for (int y = 0; y < destTilesY; ++y) {
                for (int x = 0; x < destTilesX; ++x) {
                    // Each destination tile covers the 2x2 block of source
                    // tiles starting at (srcX, srcY).
                    int srcX = x << 1;
                    int srcY = y << 1;
                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
                    // TODO: SSE branchless min/max is probably better...
                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    if (srcX + 1 < srcTilesX) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        if (srcY + 1 < srcTilesY) {
                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                        }
                    }
                    if (srcY + 1 < srcTilesY) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX )]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX )]);
                    }
                    mMinZArrays[level][y * destTilesX + x] = minZ;
                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
                }
            }
        }
    }

    ~MinMaxZTree() {
        for (int i = 0; i < mLevels; ++i) {
            lAlignedFree(mMinZArrays[i]);
            lAlignedFree(mMaxZArrays[i]);
        }
        lAlignedFree(mMinZArrays);
        lAlignedFree(mMaxZArrays);
    }

    int Levels() const { return mLevels; }
    // These round UP, so beware that the last tile for a given level may not be completely full
    // TODO: Verify this...
    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
    int TileWidth(int level = 0) const { return (mTileWidth << level); }
    int TileHeight(int level = 0) const { return (mTileHeight << level); }
    // Z bounds for tile (tileX, tileY) at the given pyramid level.
    float MinZ(int level, int tileX, int tileY) const {
        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
    }
    float MaxZ(int level, int tileX, int tileY) const {
        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
    }

private:
    int mTileWidth;      // base (level 0) tile width in pixels
    int mTileHeight;     // base (level 0) tile height in pixels
    int mLevels;
    int mNumTilesX;      // level-0 tile counts
    int mNumTilesY;
    // One array for each "level" in the tree
    float **mMinZArrays;
    float **mMaxZArrays;
};
// Singleton min/max-Z pyramid used by the dynamic C dispatch path.
static MinMaxZTree *gMinMaxZTree = 0;

// (Re)build the global MinMaxZTree for this input's framebuffer dimensions.
void InitDynamicC(InputData *input) {
    // Release any tree from a previous call (the original code leaked it).
    delete gMinMaxZTree;
    gMinMaxZTree =
        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                        input->header.framebufferWidth,
                        input->header.framebufferHeight);
}
/* We're going to split a tile into 4 sub-tiles. This function
   reclassifies the tile's lights with respect to the sub-tiles. */
// For each light in lightIndices, test it against the Z bounds and the two
// splitting frustum planes of the four sub-tiles (ordered 00, 10, 01, 11),
// appending surviving light indices into the per-subtile slices of
// subtileIndices (each slice subtileIndicesPitch entries long) and writing
// the per-subtile counts into subtileNumLights.
static void
SplitTileMinMax(
    int tileMidX, int tileMidY,
    // Subtile data (00, 10, 01, 11)
    float subtileMinZ[],
    float subtileMaxZ[],
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int lightIndices[],
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Outputs
    int subtileIndices[],
    int subtileIndicesPitch,
    int subtileNumLights[]
    )
{
    // Build the two view-space planes that pass through the tile's split
    // point (tileMidX, tileMidY): one vertical (x) and one horizontal (y).
    float gBufferScale_x = 0.5f * (float)gBufferWidth;
    float gBufferScale_y = 0.5f * (float)gBufferHeight;
    float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
                                  (cameraProj_22 * gBufferScale_y) };
    float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
                                 tileMidY - gBufferScale_y };
    for (int i = 0; i < 2; ++i) {
        // Normalize
        float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                 frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }
    // Initialize write cursors: one slice of subtileIndices per subtile.
    int subtileLightOffset[4];
    subtileLightOffset[0] = 0 * subtileIndicesPitch;
    subtileLightOffset[1] = 1 * subtileIndicesPitch;
    subtileLightOffset[2] = 2 * subtileIndicesPitch;
    subtileLightOffset[3] = 3 * subtileIndicesPitch;
    for (int i = 0; i < numLights; ++i) {
        int lightIndex = lightIndices[i];
        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Test lights again against subtile z bounds
        bool inFrustum[4];
        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
        // Signed distances from the light center to the two split planes.
        float dx = light_positionView_z * frustumPlanes_z[0] +
            light_positionView_x * frustumPlanes_xy[0];
        float dy = light_positionView_z * frustumPlanes_z[1] +
            light_positionView_y * frustumPlanes_xy[1];
        // If the light's sphere is entirely on one side of a split plane,
        // cull it from the subtiles on the other side.
        if (fabsf(dx) > light_attenuationEnd) {
            bool positiveX = dx > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveX;  // 00 subtile
            inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile
            inFrustum[2] = inFrustum[2] && positiveX;  // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile
        }
        if (fabsf(dy) > light_attenuationEnd) {
            bool positiveY = dy > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveY;  // 00 subtile
            inFrustum[1] = inFrustum[1] && positiveY;  // 10 subtile
            inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile
        }
        // Append the light to every subtile list it survived in.
        if (inFrustum[0])
            subtileIndices[subtileLightOffset[0]++] = lightIndex;
        if (inFrustum[1])
            subtileIndices[subtileLightOffset[1]++] = lightIndex;
        if (inFrustum[2])
            subtileIndices[subtileLightOffset[2]++] = lightIndex;
        if (inFrustum[3])
            subtileIndices[subtileLightOffset[3]++] = lightIndex;
    }
    // Counts are the final cursor positions minus each slice's start.
    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
}
// 3-component dot product of (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    float sum = x * a;
    sum += y * b;
    sum += z * c;
    return sum;
}
// Normalize the vector (x, y, z), writing the unit vector to (ox, oy, oz).
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float invLen = 1.f / sqrtf(x * x + y * y + z * z);
    ox = invLen * x;
    oy = invLen * y;
    oz = invLen * z;
}
static inline float
Unorm8ToFloat32(uint8_t u) {
return (float)u * (1.0f / 255.0f);
}
// Map a float in [0, 1] to an 8-bit unorm value (truncating conversion).
static inline uint8_t
Float32ToUnorm8(float f) {
    const float scaled = f * 255.0f;
    return (uint8_t)scaled;
}
// Convert an IEEE 754 half (binary16) bit pattern to a float.  "Fast"
// means no special handling of denormals, infinities, or NaNs -- their
// exponents are rebiased like those of normal values.
static inline float
half_to_float_fast(uint16_t h) {
    uint32_t hs = h & (int32_t)0x8000u;  // Pick off sign bit
    uint32_t he = h & (int32_t)0x7C00u;  // Pick off exponent bits
    uint32_t hm = h & (int32_t)0x03FFu;  // Pick off mantissa bits
    // sign
    uint32_t xs = ((uint32_t) hs) << 16;
    // Exponent: unbias the halfp, then bias the single
    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
    // Exponent
    uint32_t xe = (uint32_t) (xes << 23);
    // Mantissa
    uint32_t xm = ((uint32_t) hm) << 13;
    uint32_t bits = (xs | xe | xm);
    // Reinterpret the assembled bit pattern as a float.  The original code
    // used reinterpret_cast<float *> on the uint32_t's address, which
    // violates strict aliasing (undefined behavior); memcpy is the
    // portable, well-defined way to type-pun and compiles to the same code.
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}
static void
ShadeTileC(
int32_t tileStartX, int32_t tileEndX,
int32_t tileStartY, int32_t tileEndY,
int32_t gBufferWidth, int32_t gBufferHeight,
const ispc::InputDataArrays &inputData,
// Camera data
float cameraProj_11, float cameraProj_22,
float cameraProj_33, float cameraProj_43,
// Light list
int32_t tileLightIndices[],
int32_t tileNumLights,
// UI
bool visualizeLightCount,
// Output
uint8_t framebuffer_r[],
uint8_t framebuffer_g[],
uint8_t framebuffer_b[]
)
{
if (tileNumLights == 0 || visualizeLightCount) {
uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
for (int32_t y = tileStartY; y < tileEndY; ++y) {
for (int32_t x = tileStartX; x < tileEndX; ++x) {
int32_t framebufferIndex = (y * gBufferWidth + x);
framebuffer_r[framebufferIndex] = c;
framebuffer_g[framebufferIndex] = c;
framebuffer_b[framebufferIndex] = c;
}
}
} else {
float twoOverGBufferWidth = 2.0f / gBufferWidth;
float twoOverGBufferHeight = 2.0f / gBufferHeight;
for (int32_t y = tileStartY; y < tileEndY; ++y) {
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
for (int32_t x = tileStartX; x < tileEndX; ++x) {
int32_t gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z;
float z = inputData.zBuffer[gBufferOffset];
// Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22;
// We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrtf(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters
float surface_specularAmount =
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
half_to_float_fast(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f;
float lit_y = 0.0f;
float lit_z = 0.0f;
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) {
int32_t lightIndex = tileLightIndices[tileLightIndex];
// Gather light data relevant to initial culling
float light_positionView_x =
inputData.lightPositionView_x[lightIndex];
float light_positionView_y =
inputData.lightPositionView_y[lightIndex];
float light_positionView_z =
inputData.lightPositionView_z[lightIndex];
float light_attenuationEnd =
inputData.lightAttenuationEnd[lightIndex];
// Compute light vector
float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
if (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrtf(distanceToLight2);
float distanceToLightRcp = 1.f / distanceToLight;
L_x *= distanceToLightRcp;
L_y *= distanceToLightRcp;
L_z *= distanceToLightRcp;
// Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z);
// Clip back facing
if (NdotL > 0.0f) {
float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep)
float lightRange = (light_attenuationEnd - light_attenuationBegin);
float falloffPosition = (light_attenuationEnd - distanceToLight);
float attenuation = std::min(falloffPosition / lightRange, 1.0f);
float H_x = (L_x - Vneg_x);
float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z);
NdotH = std::max(NdotH, 0.0f);
float specular = powf(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f);
float specularContrib = surface_specularAmount *
specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib);
float light_color_x = inputData.lightColor_x[lightIndex];
float light_color_y = inputData.lightColor_y[lightIndex];
float light_color_z = inputData.lightColor_z[lightIndex];
float lightContrib_x = surface_albedo_x * light_color_x;
float lightContrib_y = surface_albedo_y * light_color_y;
float lightContrib_z = surface_albedo_z * light_color_z;
lit_x += lightContrib_x * k;
lit_y += lightContrib_y * k;
lit_z += lightContrib_z * k;
}
}
}
// Gamma correct
float gamma = 1.0 / 2.2f;
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
}
}
}
}
// Recursively shade a screen tile.  If the light list is already small or
// we are at the finest (leaf) level, shade the whole tile directly;
// otherwise split it 2x2, re-cull the light list against each subtile's
// depth bounds, and recurse into each quadrant.
void
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
                        int *lightIndices, int numLights,
                        Framebuffer *framebuffer) {
    const MinMaxZTree *minMaxZTree = gMinMaxZTree;
    // If we have few enough lights or this is the base case (last level),
    // shade this full tile directly
    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        int startX = tileX * width;
        int startY = tileY * height;
        // Clamp the tile rectangle to the framebuffer edges
        int endX = std::min(input->header.framebufferWidth, startX + width);
        int endY = std::min(input->header.framebufferHeight, startY + height);
        // Skip entirely offscreen tiles
        if (endX > startX && endY > startY) {
            ShadeTileC(startX, endX, startY, endY,
                input->header.framebufferWidth, input->header.framebufferHeight,
                input->arrays,
                input->header.cameraProj[0][0], input->header.cameraProj[1][1],
                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
                framebuffer->r, framebuffer->g, framebuffer->b);
        }
    }
    else {
        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
        // Move down a level in the tree
        --level;
        tileX <<= 1;
        tileY <<= 1;
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        // Work out splitting coords (pixel position of the 2x2 split)
        int midX = (tileX + 1) * width;
        int midY = (tileY + 1) * height;
        // Read subtile min/max data
        // NOTE: We must be sure to handle out-of-bounds access here since
        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
        // framebuffer sizes.
        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
        // NOTE: Order is 00, 10, 01, 11
        // Set defaults up to cull all lights if the tile doesn't exist
        // (offscreen): an empty [far, near] range rejects every light
        float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
                         input->header.cameraFar, input->header.cameraFar};
        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
                         input->header.cameraNear, input->header.cameraNear};
        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
        if (rightTileExists) {
            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
            if (bottomTileExists) {
                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
            }
        }
        if (bottomTileExists) {
            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
        }
        // Cull lights into subtile lists
        // (platform-specific syntax for an ALIGNMENT_BYTES-aligned stack array)
#ifdef ISPC_IS_WINDOWS
        __declspec(align(ALIGNMENT_BYTES))
#endif
        int subtileLightIndices[4][MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
        __attribute__ ((aligned(ALIGNMENT_BYTES)))
#endif
            ;
        int subtileNumLights[4];
        // Partition the parent's light list into the four quadrant lists
        SplitTileMinMax(midX, midY, minZ, maxZ,
            input->header.framebufferWidth, input->header.framebufferHeight,
            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
            lightIndices, numLights, input->arrays.lightPositionView_x,
            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
            input->arrays.lightAttenuationEnd,
            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
        // Recurse into subtiles
        ShadeDynamicTileRecurse(input, level, tileX    , tileY,
                                subtileLightIndices[0], subtileNumLights[0],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
                                subtileLightIndices[1], subtileNumLights[1],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
                                subtileLightIndices[2], subtileNumLights[2],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
                                subtileLightIndices[3], subtileNumLights[3],
                                framebuffer);
    }
}
// Cull `numLights` point lights against a screen-space tile.  The tile is
// treated as a frustum: four side planes derived from the pixel rectangle
// [tileStartX, tileEndX) x [tileStartY, tileEndY) plus the [minZ, maxZ]
// depth slab.  A light passes if its bounding sphere (view-space center,
// radius = attenuation end) touches the frustum.  Indices of passing
// lights are packed into tileLightIndices; returns how many passed.
static int
IntersectLightsWithTileMinMax(
    int tileStartX, int tileEndX,
    int tileStartY, int tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    int tileLightIndices[]
    )
{
    float gBufferScale_x = 0.5f * (float)gBufferWidth;
    float gBufferScale_y = 0.5f * (float)gBufferHeight;
    // Side planes of the tile frustum.  Each plane passes through the
    // origin, so it is fully described by an (xy, z) normal pair; the xy
    // component depends only on the projection and framebuffer size, the z
    // component encodes the tile bounds.
    // (The original kept a second `_v` copy of these arrays and copied the
    // normalized values across; the duplicates were redundant and have
    // been removed.)
    float frustumPlanes_xy[4] = { -(cameraProj_11 * gBufferScale_x),
                                   (cameraProj_11 * gBufferScale_x),
                                   (cameraProj_22 * gBufferScale_y),
                                  -(cameraProj_22 * gBufferScale_y) };
    float frustumPlanes_z[4] = {  tileEndX - gBufferScale_x,
                                 -tileStartX + gBufferScale_x,
                                  tileEndY - gBufferScale_y,
                                 -tileStartY + gBufferScale_y };
    for (int i = 0; i < 4; ++i) {
        // Normalize so signed distances compare directly against the
        // light's radius
        float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                 frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }
    int tileNumLights = 0;
    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Depth slab test first: it is cheap and rejects most lights
        float d = light_positionView_z - minZ;
        bool inFrustum = (d >= light_attenuationEndNeg);
        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        if (!inFrustum)
            continue;
        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];
        // Signed distance to each side plane must be >= -radius for the
        // bounding sphere to touch the frustum
        d = light_positionView_z * frustumPlanes_z[0] +
            light_positionView_x * frustumPlanes_xy[0];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[1] +
            light_positionView_x * frustumPlanes_xy[1];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[2] +
            light_positionView_y * frustumPlanes_xy[2];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[3] +
            light_positionView_y * frustumPlanes_xy[3];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        // Pack and store intersecting lights
        if (inFrustum)
            tileLightIndices[tileNumLights++] = lightIndex;
    }
    return tileNumLights;
}
// Shade one root tile: read its depth bounds from the min/max Z pyramid,
// run the full 6-plane light cull once, then hand the surviving lights to
// the recursive subdivision pass.
void
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
                 Framebuffer *framebuffer) {
    const MinMaxZTree *zTree = gMinMaxZTree;
    // Tile rectangle in pixels, clamped to the framebuffer edges
    const int tileW = zTree->TileWidth(level);
    const int tileH = zTree->TileHeight(level);
    const int x0 = tileX * tileW;
    const int y0 = tileY * tileH;
    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
    // Depth bounds for this tile
    const float tileMinZ = zTree->MinZ(level, tileX, tileY);
    const float tileMaxZ = zTree->MaxZ(level, tileX, tileY);
    // This is a root tile, so first do a full 6-plane cull
    // (platform-specific syntax for an ALIGNMENT_BYTES-aligned stack array)
#ifdef ISPC_IS_WINDOWS
    __declspec(align(ALIGNMENT_BYTES))
#endif
    int lightIndices[MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
    __attribute__ ((aligned(ALIGNMENT_BYTES)))
#endif
        ;
    const int numLights = IntersectLightsWithTileMinMax(
        x0, x1, y0, y1, tileMinZ, tileMaxZ,
        input->header.framebufferWidth, input->header.framebufferHeight,
        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
        MAX_LIGHTS, input->arrays.lightPositionView_x,
        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
        input->arrays.lightAttenuationEnd, lightIndices);
    // Now kick off the recursive process for this tile
    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
                            numLights, framebuffer);
}
// Serial top-level dispatch: rebuild the min/max Z pyramid for the frame,
// then shade every tile at the coarsest pyramid level.
void
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
{
    MinMaxZTree *zTree = gMinMaxZTree;
    // Refresh the depth pyramid from this frame's Z buffer
    zTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
                  input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                  input->header.cameraNear, input->header.cameraFar);
    // Walk the coarsest level row by row and shade each root tile
    const int topLevel = zTree->Levels() - 1;
    const int tilesX = zTree->NumTilesX(topLevel);
    const int tilesY = zTree->NumTilesY(topLevel);
    for (int ty = 0; ty < tilesY; ++ty)
        for (int tx = 0; tx < tilesX; ++tx)
            ShadeDynamicTile(input, topLevel, tx, ty, framebuffer);
}

View File

@@ -0,0 +1,398 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __cilk
#include "deferred.h"
#include "kernels_ispc.h"
#include <algorithm>
#include <assert.h>
#ifdef _MSC_VER
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif // ISPC_IS_LINUX
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
#define MIN_TILE_WIDTH 16
#define MIN_TILE_HEIGHT 16
#define DYNAMIC_TREE_LEVELS 5
// If this is set to 1 then the result will be identical to the static version
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
// Allocate `size` bytes aligned to `alignment` (a power of two), using
// the appropriate platform facility.  Pair with lAlignedFree.
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // Over-allocate and stash the raw malloc() pointer in the slot just
    // below the aligned address so lAlignedFree can recover it.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    char *amem = ((char*)mem) + sizeof(void*);
    // Round up to the next multiple of alignment.  The final
    // "& (alignment - 1)" keeps the adjustment at 0 when amem is already
    // aligned; without it the adjustment was a full `alignment`, which
    // pushed the payload one byte past the end of the allocation.
    amem = amem + uint32_t((alignment - (reinterpret_cast<uint64_t>(amem) &
                                         (alignment - 1))) & (alignment - 1));
    ((void**)amem)[-1] = mem;
    return amem;
#endif
}
// Release memory obtained from lAlignedMalloc, dispatching to the
// platform-specific counterpart of the allocation path.
static void
lAlignedFree(void *ptr) {
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    // lAlignedMalloc stored the original malloc() pointer one slot below
    // the aligned address it returned.
    free(((void**)ptr)[-1]);
#endif
}
// Pyramid of conservative per-tile [min, max] view-space Z bounds over the
// depth buffer.  Level 0 stores one entry per base tile; each higher level
// halves the tile counts in X and Y (rounding up) and reduces over its (up
// to four) child tiles.  Level 0 is built in parallel with Cilk; higher
// levels are reduced level by level.
class MinMaxZTreeCilk
{
public:
    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
    // Levels must be small enough that neither dimension goes below one tile
    MinMaxZTreeCilk(
        int tileWidth, int tileHeight, int levels,
        int gBufferWidth, int gBufferHeight)
        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
    {
        mNumTilesX = gBufferWidth / mTileWidth;
        mNumTilesY = gBufferHeight / mTileHeight;
        // Allocate one min and one max array per pyramid level
        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        for (int i = 0; i < mLevels; ++i) {
            int x = NumTilesX(i);
            int y = NumTilesY(i);
            assert(x > 0);
            assert(y > 0);
            // NOTE: If the following two asserts fire it probably means that
            // the base tile dimensions do not evenly divide the G-buffer dimensions
            assert(x * (mTileWidth << i) >= gBufferWidth);
            assert(y * (mTileHeight << i) >= gBufferHeight);
            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
        }
    }
    // Rebuild the whole pyramid from the current depth buffer.
    // cameraProj_33/cameraProj_43 are the projection terms used to
    // unproject depth values into view space; cameraNear/cameraFar bound
    // the valid range.
    void Update(float *zBuffer, int gBufferPitchInElements,
                float cameraProj_33, float cameraProj_43,
                float cameraNear, float cameraFar)
    {
        // Compute level 0 in parallel. Outer loops is here since we use Cilk
        _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
            ispc::ComputeZBoundsRow(tileY,
                mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
                zBuffer, gBufferPitchInElements,
                cameraProj_33, cameraProj_43, cameraNear, cameraFar,
                mMinZArrays[0] + (tileY * mNumTilesX),
                mMaxZArrays[0] + (tileY * mNumTilesX));
        }
        // Generate other levels
        // NOTE: We currently don't use ispc here since it's sort of an
        // awkward gather-based reduction Using SSE odd pack/unpack
        // instructions might actually work here when we need to optimize
        for (int level = 1; level < mLevels; ++level) {
            int destTilesX = NumTilesX(level);
            int destTilesY = NumTilesY(level);
            int srcLevel = level - 1;
            int srcTilesX = NumTilesX(srcLevel);
            int srcTilesY = NumTilesY(srcLevel);
            // Rows of the destination level are independent, so parallelize
            // over them
            _Cilk_for (int y = 0; y < destTilesY; ++y) {
                for (int x = 0; x < destTilesX; ++x) {
                    int srcX = x << 1;
                    int srcY = y << 1;
                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
                    // TODO: SSE branchless min/max is probably better...
                    // Start from the top-left child, which always exists...
                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    // ...then fold in the right, diagonal, and bottom
                    // children when they exist (edge tiles may have fewer)
                    if (srcX + 1 < srcTilesX) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        if (srcY + 1 < srcTilesY) {
                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                        }
                    }
                    if (srcY + 1 < srcTilesY) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX    )]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX    )]);
                    }
                    mMinZArrays[level][y * destTilesX + x] = minZ;
                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
                }
            }
        }
    }
    ~MinMaxZTreeCilk() {
        for (int i = 0; i < mLevels; ++i) {
            lAlignedFree(mMinZArrays[i]);
            lAlignedFree(mMaxZArrays[i]);
        }
        lAlignedFree(mMinZArrays);
        lAlignedFree(mMaxZArrays);
    }
    int Levels() const { return mLevels; }
    // These round UP, so beware that the last tile for a given level may not be completely full
    // TODO: Verify this...
    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
    int TileWidth(int level = 0) const { return (mTileWidth << level); }
    int TileHeight(int level = 0) const { return (mTileHeight << level); }
    // Per-tile depth bounds at the given level (no bounds checking)
    float MinZ(int level, int tileX, int tileY) const {
        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
    }
    float MaxZ(int level, int tileX, int tileY) const {
        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
    }
private:
    int mTileWidth;   // base (level 0) tile width in pixels
    int mTileHeight;  // base (level 0) tile height in pixels
    int mLevels;      // number of pyramid levels
    int mNumTilesX;   // level-0 tile count in X
    int mNumTilesY;   // level-0 tile count in Y
    // One array for each "level" in the tree
    float **mMinZArrays;
    float **mMaxZArrays;
};
// Global min/max Z pyramid shared by the Cilk dispatch path below.
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;

// Allocate the global Z pyramid sized for the input framebuffer.  Must run
// before DispatchDynamicCilk, which dereferences gMinMaxZTreeCilk.
void InitDynamicCilk(InputData *input) {
    gMinMaxZTreeCilk =
        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                            input->header.framebufferWidth,
                            input->header.framebufferHeight);
}
// Cilk variant of the recursive dynamic tile shader: shade the tile
// directly when the light list is small or we are at the leaf level;
// otherwise split 2x2, re-cull the lights per subtile, and recurse,
// spawning three quadrants as Cilk tasks and running the fourth inline.
// The implicit sync at function exit keeps the stack-allocated subtile
// light lists alive until the spawned children finish.
static void
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
                        int *lightIndices, int numLights,
                        Framebuffer *framebuffer) {
    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
    // If we have few enough lights or this is the base case (last level),
    // shade this full tile directly
    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        int startX = tileX * width;
        int startY = tileY * height;
        // Clamp the tile rectangle to the framebuffer edges
        int endX = std::min(input->header.framebufferWidth, startX + width);
        int endY = std::min(input->header.framebufferHeight, startY + height);
        // Skip entirely offscreen tiles
        if (endX > startX && endY > startY) {
            ispc::ShadeTile(
                startX, endX, startY, endY,
                input->header.framebufferWidth, input->header.framebufferHeight,
                &input->arrays,
                input->header.cameraProj[0][0], input->header.cameraProj[1][1],
                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
                framebuffer->r, framebuffer->g, framebuffer->b);
        }
    }
    else {
        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
        // Move down a level in the tree
        --level;
        tileX <<= 1;
        tileY <<= 1;
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        // Work out splitting coords (pixel position of the 2x2 split)
        int midX = (tileX + 1) * width;
        int midY = (tileY + 1) * height;
        // Read subtile min/max data
        // NOTE: We must be sure to handle out-of-bounds access here since
        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
        // framebuffer sizes.
        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
        // NOTE: Order is 00, 10, 01, 11
        // Set defaults up to cull all lights if the tile doesn't exist
        // (offscreen): an empty [far, near] range rejects every light
        float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
                         input->header.cameraFar, input->header.cameraFar};
        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
                         input->header.cameraNear, input->header.cameraNear};
        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
        if (rightTileExists) {
            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
            if (bottomTileExists) {
                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
            }
        }
        if (bottomTileExists) {
            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
        }
        // Cull lights into subtile lists
        // (platform-specific syntax for an ALIGNMENT_BYTES-aligned stack array)
#ifdef ISPC_IS_WINDOWS
        __declspec(align(ALIGNMENT_BYTES))
#endif
        int subtileLightIndices[4][MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
        __attribute__ ((aligned(ALIGNMENT_BYTES)))
#endif
            ;
        int subtileNumLights[4];
        // Partition the parent's light list into the four quadrant lists
        ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
            input->header.framebufferWidth, input->header.framebufferHeight,
            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
            lightIndices, numLights, input->arrays.lightPositionView_x,
            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
            input->arrays.lightAttenuationEnd,
            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
        // Recurse into subtiles; three spawned, one executed inline
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY,
                                            subtileLightIndices[0], subtileNumLights[0],
                                            framebuffer);
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
                                            subtileLightIndices[1], subtileNumLights[1],
                                            framebuffer);
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
                                            subtileLightIndices[2], subtileNumLights[2],
                                            framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
                                subtileLightIndices[3], subtileNumLights[3],
                                framebuffer);
    }
}
// Shade one root tile with the ispc kernels: read the tile's depth bounds
// from the pyramid, run the full 6-plane light cull once, then hand off to
// the recursive subdivision pass.
static void
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
                 Framebuffer *framebuffer) {
    const MinMaxZTreeCilk *zTree = gMinMaxZTreeCilk;
    // Tile rectangle in pixels, clamped to the framebuffer edges
    const int tileW = zTree->TileWidth(level);
    const int tileH = zTree->TileHeight(level);
    const int x0 = tileX * tileW;
    const int y0 = tileY * tileH;
    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
    // Depth bounds for this tile
    const float tileMinZ = zTree->MinZ(level, tileX, tileY);
    const float tileMaxZ = zTree->MaxZ(level, tileX, tileY);
    // This is a root tile, so first do a full 6-plane cull
    // (platform-specific syntax for an ALIGNMENT_BYTES-aligned stack array)
#ifdef ISPC_IS_WINDOWS
    __declspec(align(ALIGNMENT_BYTES))
#endif
    int lightIndices[MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
    __attribute__ ((aligned(ALIGNMENT_BYTES)))
#endif
        ;
    const int numLights = ispc::IntersectLightsWithTileMinMax(
        x0, x1, y0, y1, tileMinZ, tileMaxZ,
        input->header.framebufferWidth, input->header.framebufferHeight,
        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
        MAX_LIGHTS, input->arrays.lightPositionView_x,
        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
        input->arrays.lightAttenuationEnd, lightIndices);
    // Now kick off the recursive process for this tile
    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
                            numLights, framebuffer);
}
// Parallel top-level dispatch: rebuild the min/max Z pyramid for the
// frame, then shade every coarsest-level tile, one Cilk task per tile.
void
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
{
    MinMaxZTreeCilk *zTree = gMinMaxZTreeCilk;
    // Refresh the depth pyramid from this frame's Z buffer
    zTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
                  input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                  input->header.cameraNear, input->header.cameraFar);
    // Launch the "root" tiles. Ideally these should at least fill the
    // machine... at the moment we have a static number of "levels" to the
    // mip tree but it might make sense to compute it based on the width of
    // the machine.
    const int topLevel = zTree->Levels() - 1;
    const int tilesX = zTree->NumTilesX(topLevel);
    const int tilesY = zTree->NumTilesY(topLevel);
    const int tileCount = tilesX * tilesY;
    _Cilk_for (int tile = 0; tile < tileCount; ++tile) {
        const int ty = tile / tilesX;
        const int tx = tile % tilesX;
        ShadeDynamicTile(input, topLevel, tx, ty, framebuffer);
    }
}
#endif // __cilk

View File

@@ -0,0 +1,778 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "deferred.h"
#include <stdio.h>
#include <assert.h>
#define programCount 32
#define programIndex (threadIdx.x & 31)
#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskCount (gridDim.x*4)
#define warpIdx (threadIdx.x >> 5)
#define int32 int
#define int16 short
#define int8 char
// Clamp v into the range [low, high].
__device__ static inline float clamp(float v, float low, float high)
{
    const float atLeastLow = max(v, low);
    return min(atLeastLow, high);
}
// SOA pointers to the per-pixel G-buffer channels and the per-light
// attribute arrays for one frame.
// NOTE(review): these presumably point into the single input chunk whose
// layout InputHeader describes -- confirm against the loader.
struct InputDataArrays
{
    float *zBuffer;                  // depth buffer, one float per pixel
    unsigned int16 *normalEncoded_x; // half float
    unsigned int16 *normalEncoded_y; // half float
    unsigned int16 *specularAmount;  // half float
    unsigned int16 *specularPower;   // half float
    unsigned int8 *albedo_x;         // unorm8
    unsigned int8 *albedo_y;         // unorm8
    unsigned int8 *albedo_z;         // unorm8
    // Per-light attributes (view-space position, attenuation, color)
    float *lightPositionView_x;
    float *lightPositionView_y;
    float *lightPositionView_z;
    float *lightAttenuationBegin;
    float *lightColor_x;
    float *lightColor_y;
    float *lightColor_z;
    float *lightAttenuationEnd;
};
// Fixed-size header describing one frame's input data chunk.
struct InputHeader
{
    float cameraProj[4][4];  // camera projection matrix
    float cameraNear;        // near clip distance
    float cameraFar;         // far clip distance
    int32 framebufferWidth;
    int32 framebufferHeight;
    int32 numLights;
    int32 inputDataChunkSize;  // size of the data blob following the header
    // Offset of each array within the chunk, one entry per array id
    // (idaNum is declared elsewhere -- presumably deferred.h)
    int32 inputDataArrayOffsets[idaNum];
};
///////////////////////////////////////////////////////////////////////////
// Common utility routines

// Dot product of the 3-vectors (x, y, z) and (a, b, c).
__device__
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    const float xyPart = x*a + y*b;
    return xyPart + z*c;
}
// Uniform<T, N>: an N-element array of T whose contents are shared by all
// 32 lanes of a warp ("uniform" in the ispc sense).  Three alternative
// implementations follow, selected by the preprocessor; only the
// "#elif 1" variant is compiled in.
#if 0
// Variant 1 (disabled): elements are spread across per-lane storage, with
// a per-warp staging area in shared memory used to broadcast indices and
// exchange values via __shfl.
static __shared__ int shdata_full[128];
template<typename T, int N>
struct Uniform
{
    T data[(N+programCount-1)/programCount];
    volatile T *shdata;
    __device__ inline Uniform()
    {
        shdata = ((T*)shdata_full) + warpIdx*32;
    }
    // Map flat index i to a (chunk, lane) pair, broadcasting the chunk
    // index through shared memory so every lane agrees on it.
    __device__ inline int2 get_chunk(const int i) const
    {
        const int elem = i & (programCount - 1);
        const int chunk = i >> 5;
        shdata[programIndex] = chunk;
        shdata[ elem] = chunk;
        return make_int2(shdata[programIndex], elem);
    }
    __device__ inline const T get(const int i) const
    {
        const int2 idx = get_chunk(i);
        return __shfl(data[idx.x], idx.y);
    }
    __device__ inline void set(const bool active, const int i, T value)
    {
        const int2 idx = get_chunk(i);
        const int chunkIdx = idx.x;
        const int elemIdx = idx.y;
        shdata[programIndex] = data[chunkIdx];
        if (active) shdata[elemIdx] = value;
        data[chunkIdx] = shdata[programIndex];
    }
};
#elif 1
// Variant 2 (active): the array lives on the device heap.  Lane 0
// allocates it and the pointer is broadcast to the other lanes as two
// 32-bit __shfl transfers through the union.
// NOTE(review): the int32_t ptr[2] overlay assumes pointers fit in two
// 32-bit words (64-bit ABI) -- confirm for the target.
template<typename T, int N>
struct Uniform
{
    union
    {
        T *data;
        int32_t ptr[2];
    };
    __device__ inline Uniform()
    {
        if (programIndex == 0)
            data = (T*)malloc(N*sizeof(T));
        ptr[0] = __shfl(ptr[0], 0);
        ptr[1] = __shfl(ptr[1], 0);
    }
    __device__ inline ~Uniform()
    {
        // Only the allocating lane frees
        if (programIndex == 0)
            free(data);
    }
    __device__ inline const T get(const int i) const
    {
        return data[i];
    }
    __device__ inline T* get_ptr(const int i) {return &data[i]; }
    __device__ inline void set(const bool active, const int i, T value)
    {
        if (active)
            data[i] = value;
    }
};
#else
// Variant 3 (disabled): the array lives in statically-sized shared
// memory, one MAX_LIGHTS-sized slice per warp.
__shared__ int shdata_full[4*MAX_LIGHTS];
template<typename T, int N>
struct Uniform
{
    /* volatile */ T *shdata;
    __device__ Uniform()
    {
        shdata = (T*)&shdata_full[warpIdx*MAX_LIGHTS];
    }
    __device__ inline const T get(const int i) const
    {
        return shdata[i];
    }
    __device__ inline void set(const bool active, const int i, T value)
    {
        if (active)
            shdata[i] = value;
    }
};
#endif
// Normalize the vector (x, y, z) into (ox, oy, oz) using the hardware
// reciprocal square root.  No guard for zero-length input.
__device__
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float invLen = rsqrt(x*x + y*y + z*z);
    ox = x * invLen;
    oy = y * invLen;
    oz = z * invLen;
}
// Warp-wide min reduction: every lane returns the minimum of `value`
// across all 32 lanes (butterfly exchange via __shfl_xor with offsets
// 16, 8, 4, 2, 1).
__device__ inline
static float reduce_min(float value)
{
#pragma unroll
    for (int lanes = 16; lanes > 0; lanes >>= 1)
        value = fminf(value, __shfl_xor(value, lanes, 32));
    return value;
}
// Warp-wide max reduction: every lane returns the maximum of `value`
// across all 32 lanes (butterfly exchange via __shfl_xor with offsets
// 16, 8, 4, 2, 1).
__device__ inline
static float reduce_max(float value)
{
#pragma unroll
    for (int lanes = 16; lanes > 0; lanes >>= 1)
        value = fmaxf(value, __shfl_xor(value, lanes, 32));
    return value;
}
#if 0
// NOTE: the warp scan/reduction helpers below are currently compiled out;
// the ballot-based helpers further down are used instead.

// Warp-wide sum reduction (butterfly exchange via __shfl_xor).
__device__ inline
static int reduce_sum(int value)
{
#pragma unroll
    for (int i = 4; i >=0; i--)
        value += __shfl_xor(value, 1<<i, 32);
    return value;
}
// One step of a shuffle-up scan: add the partial from `up_offset` lanes
// below, where such a lane exists (guarded by the shfl predicate).
static __device__ __forceinline__ uint shfl_scan_add_step(uint partial, uint up_offset)
{
    uint result;
    asm(
        "{.reg .u32 r0;"
        ".reg .pred p;"
        "shfl.up.b32 r0|p, %1, %2, 0;"
        "@p add.u32 r0, r0, %3;"
        "mov.u32 %0, r0;}"
        : "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
    return result;
}
// Despite the name, this returns sum - value, i.e. an EXCLUSIVE prefix
// sum of `value` across the warp.
static __device__ __forceinline__ int inclusive_scan_warp(const int value)
{
    uint sum = value;
#pragma unroll
    for(int i = 0; i < 5; ++i)
        sum = shfl_scan_add_step(sum, 1 << i);
    return sum - value;
}
#endif
// Bitmask with one bit set for every lane whose index is lower than the
// calling lane's (reads the PTX %lanemask_lt special register).
static __device__ __forceinline__ int lanemask_lt()
{
    int mask;
    asm("mov.u32 %0, %lanemask_lt;" : "=r" (mask));
    return mask;
}
// Warp-level binary exclusive scan of the predicate p.  Returns
// (x = total lanes with p set, y = this lane's rank among them).
static __device__ __forceinline__ int2 warpBinExclusiveScan(const bool p)
{
    const int ballot = __ballot(p);
    const int total  = __popc(ballot);
    const int rank   = __popc(ballot & lanemask_lt());
    return make_int2(total, rank);
}
// Compacting store: lanes with `active` set write `value` into
// consecutive slots of ptr, in lane order.  Every lane returns the total
// number of active lanes in the warp.
__device__ static inline
int packed_store_active(bool active, int* ptr, int value)
{
    const int2 scan = warpBinExclusiveScan(active);
    if (active)
        ptr[scan.y] = value;  // scan.y is this lane's rank among active lanes
    return scan.x;
}
// Decode an 8-bit unorm value to a float in [0, 1].
__device__
static inline float
Unorm8ToFloat32(unsigned int8 u) {
    const float kScale = 1.0f / 255.0f;
    return (float)u * kScale;
}
// Encode a float (expected in [0, 1]) as an 8-bit unorm.  Truncating
// conversion; no clamping or rounding.
__device__
static inline unsigned int8
Float32ToUnorm8(float f) {
    const float scaled = f * 255.0f;
    return (unsigned int8)scaled;
}
// Compute conservative view-space Z bounds over the pixel rectangle
// [tileStartX, tileEndX) x [tileStartY, tileEndY).  Each lane scans a
// strided subset of pixels; per-lane partial bounds are then combined
// with warp-wide reductions, so all 32 lanes receive the same minZ/maxZ.
__device__
static inline void
ComputeZBounds(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    // G-buffer data
    float zBuffer[],
    int32 gBufferWidth,
    // Camera data
    float cameraProj_33, float cameraProj_43,
    float cameraNear, float cameraFar,
    // Output
    float &minZ,
    float &maxZ
    )
{
    // Find Z bounds
    // Start from the opposite extremes so any valid sample tightens them
    float laneMinZ = cameraFar;
    float laneMaxZ = cameraNear;
    for ( int32 y = tileStartY; y < tileEndY; ++y) {
        // Lanes cooperatively cover a row, programCount pixels at a time
        for ( int xb = tileStartX; xb < tileEndX; xb += programCount)
        {
            const int x = xb + programIndex;
            if (x >= tileEndX) break;  // lanes past the row's end drop out
            // Unproject depth buffer Z value into view space
            float z = zBuffer[y * gBufferWidth + x];
            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
            // Work out Z bounds for our samples
            // Avoid considering skybox/background or otherwise invalid pixels
            if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
                laneMinZ = min(laneMinZ, viewSpaceZ);
                laneMaxZ = max(laneMaxZ, viewSpaceZ);
            }
        }
    }
    // Combine the per-lane partials across the warp
    minZ = reduce_min(laneMinZ);
    maxZ = reduce_max(laneMaxZ);
}
// Cull the global light list against one tile's frustum, given the
// tile's precomputed view-space depth range [minZ, maxZ].  Surviving
// light indices are packed into tileLightIndices; returns their count.
__device__
static inline int32
IntersectLightsWithTileMinMax(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int32 gBufferWidth, int32 gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int32 numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    Uniform<int,MAX_LIGHTS> &tileLightIndices
    )
{
    // Derive the four side planes of the tile frustum in view space;
    // each plane is stored as an (xy, z) normal pair (x-planes use the
    // xy slot for x, y-planes use it for y).
    float gBufferScale_x = 0.5f * (float)gBufferWidth;
    float gBufferScale_y = 0.5f * (float)gBufferHeight;
    float frustumPlanes_xy[4] = {
        -(cameraProj_11 * gBufferScale_x),
        (cameraProj_11 * gBufferScale_x),
        (cameraProj_22 * gBufferScale_y),
        -(cameraProj_22 * gBufferScale_y) };
    float frustumPlanes_z[4] = {
        tileEndX - gBufferScale_x,
        -tileStartX + gBufferScale_x,
        tileEndY - gBufferScale_y,
        -tileStartY + gBufferScale_y };
    // Normalize the plane normals so the dot products below are
    // true signed distances.
    for ( int i = 0; i < 4; ++i) {
        float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
            frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }
    int32 tileNumLights = 0;
    // Test the lights a gang-width at a time, one light per lane.
    for ( int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount)
    {
        const int lightIndex = lightIndexB + programIndex;
        if (lightIndex >= numLights) break;
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Depth-range test first: cheap, and rejects most lights.
        float d = light_positionView_z - minZ;
        bool inFrustum = (d >= light_attenuationEndNeg);
        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        // This seems better than cif (!inFrustum) ccontinue; here since we
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out. Could also structure all of this as
        // nested if() statements, but this is a bit easier to read
        if (__ballot(inFrustum) > 0)
        {
            // Signed distance of the light center to each of the four
            // side planes, compared against its attenuation radius.
            float light_positionView_x = light_positionView_x_array[lightIndex];
            float light_positionView_y = light_positionView_y_array[lightIndex];
            d = light_positionView_z * frustumPlanes_z[0] +
                light_positionView_x * frustumPlanes_xy[0];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[1] +
                light_positionView_x * frustumPlanes_xy[1];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[2] +
                light_positionView_y * frustumPlanes_xy[2];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[3] +
                light_positionView_y * frustumPlanes_xy[3];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            // Pack and store intersecting lights
#if 0
            if (__ballot(active) > 0)
                tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex);
#else
            const bool active = inFrustum && lightIndex < numLights;
            if (__ballot(active) > 0)
            {
                // NOTE(review): assumes warpBinExclusiveScan returns
                // x = number of active lanes and y = this lane's
                // exclusive rank among them — confirm against its
                // definition earlier in this file.
                const int2 res = warpBinExclusiveScan(active);
                const int idx = tileNumLights + res.y;
                const int nactive = res.x;
                tileLightIndices.set(active, idx, lightIndex);
                tileNumLights += nactive;
            }
#endif
        }
    }
    return tileNumLights;
}
// Cull the light list against one screen tile: first compute the tile's
// view-space depth range from the Z buffer, then run the frustum/depth
// intersection over the lights.  Returns the number of lights written
// to tileLightIndices.
__device__
static inline int32
IntersectLightsWithTile(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    int32 gBufferWidth, int32 gBufferHeight,
    // G-buffer data
    float zBuffer[],
    // Camera data
    float cameraProj_11, float cameraProj_22,
    float cameraProj_33, float cameraProj_43,
    float cameraNear, float cameraFar,
    // Light Data
    int32 numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    Uniform<int,MAX_LIGHTS> &tileLightIndices
    )
{
    float minZ, maxZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
        minZ, maxZ);
    // BUGFIX: forward the caller-supplied numLights instead of the
    // MAX_LIGHTS compile-time bound, so a partially-filled light array
    // is not scanned past its valid entries.  (Existing callers pass
    // numLights == MAX_LIGHTS, so their behavior is unchanged.)
    int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        numLights, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);
    return tileNumLights;
}
// Shade every pixel of one screen tile using only the lights that were
// found to intersect it (tileLightIndices / tileNumLights).  When the
// tile has no lights, or light-count visualization is requested, a flat
// gray proportional to the light count is written instead.
__device__
static inline void
ShadeTile(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    int32 gBufferWidth, int32 gBufferHeight,
    const InputDataArrays &inputData,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    float cameraProj_33, float cameraProj_43,
    // Light list
    Uniform<int,MAX_LIGHTS> &tileLightIndices,
    int32 tileNumLights,
    // UI
    bool visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]
    )
{
    if (tileNumLights == 0 || visualizeLightCount) {
        // Flat fill: brightness scales with light count (saturating at 255).
        unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
        for ( int32 y = tileStartY; y < tileEndY; ++y) {
            for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
            {
                const int x = xb + programIndex;
                if (x >= tileEndX) continue;
                int32 framebufferIndex = (y * gBufferWidth + x);
                framebuffer_r[framebufferIndex] = c;
                framebuffer_g[framebufferIndex] = c;
                framebuffer_b[framebufferIndex] = c;
            }
        }
    } else {
        float twoOverGBufferWidth = 2.0f / gBufferWidth;
        float twoOverGBufferHeight = 2.0f / gBufferHeight;
        for ( int32 y = tileStartY; y < tileEndY; ++y) {
            float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
            for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
            {
                const int x = xb + programIndex;
                // if (x >= tileEndX) break;
                // NOTE(review): the lane bounds check above is commented
                // out, so lanes with x >= tileEndX read/write past the
                // tile's right edge — presumably safe only when the tile
                // width is a multiple of the gang width; confirm.
                int32 gBufferOffset = y * gBufferWidth + x;
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;
                float z = inputData.zBuffer[gBufferOffset];
                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
                float positionScreen_x = (0.5f + (float)(x)) *
                    twoOverGBufferWidth - 1.0f;
                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
                surface_positionView_x = positionScreen_x * surface_positionView_z /
                    cameraProj_11;
                surface_positionView_y = positionScreen_y * surface_positionView_z /
                    cameraProj_22;
                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
                normalize3(surface_positionView_x, surface_positionView_y,
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
                // Reconstruct normal from G-buffer (stored as two
                // half-float encoded components; z sign recovered below)
                float surface_normal_x, surface_normal_y, surface_normal_z;
                asm("// half2float //");
                float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]);
                asm("// half2float //");
                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrt(4.0f * f - 1.0f);
                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;
                // Load other G-buffer parameters
                float surface_specularAmount =
                    __half2float(inputData.specularAmount[gBufferOffset]);
                float surface_specularPower =
                    __half2float(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
                // Accumulated lit color for this pixel.
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
                for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
                      ++tileLightIndex) {
                    int32 lightIndex = tileLightIndices.get(tileLightIndex);
                    // Gather light data relevant to initial culling
                    float light_positionView_x =
                        __ldg(&inputData.lightPositionView_x[lightIndex]);
                    float light_positionView_y =
                        __ldg(&inputData.lightPositionView_y[lightIndex]);
                    float light_positionView_z =
                        __ldg(&inputData.lightPositionView_z[lightIndex]);
                    float light_attenuationEnd =
                        __ldg(&inputData.lightAttenuationEnd[lightIndex]);
                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;
                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
                    // Clip at end of attenuation
                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
                    if (distanceToLight2 < light_attenutaionEnd2) {
                        float distanceToLight = sqrt(distanceToLight2);
                        // HLSL "rcp" is allowed to be fairly inaccurate
                        float distanceToLightRcp = 1.0f/distanceToLight;
                        L_x *= distanceToLightRcp;
                        L_y *= distanceToLightRcp;
                        L_z *= distanceToLightRcp;
                        // Start computing brdf
                        float NdotL = dot3(surface_normal_x, surface_normal_y,
                                           surface_normal_z, L_x, L_y, L_z);
                        // Clip back facing
                        if (NdotL > 0.0f) {
                            float light_attenuationBegin =
                                inputData.lightAttenuationBegin[lightIndex];
                            // Light distance attenuation (linstep)
                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
                            float falloffPosition = (light_attenuationEnd - distanceToLight);
                            float attenuation = min(falloffPosition / lightRange, 1.0f);
                            // Blinn-Phong half vector (V = -Vneg, so H ~ L - Vneg)
                            float H_x = (L_x - Vneg_x);
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
                            float NdotH = dot3(surface_normal_x, surface_normal_y,
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = max(NdotH, 0.0f);
                            float specular = pow(NdotH, surface_specularPower);
                            float specularNorm = (surface_specularPower + 2.0f) *
                                (1.0f / 8.0f);
                            float specularContrib = surface_specularAmount *
                                specularNorm * specular;
                            float k = attenuation * NdotL * (1.0f + specularContrib);
                            float light_color_x = inputData.lightColor_x[lightIndex];
                            float light_color_y = inputData.lightColor_y[lightIndex];
                            float light_color_z = inputData.lightColor_z[lightIndex];
                            float lightContrib_x = surface_albedo_x * light_color_x;
                            float lightContrib_y = surface_albedo_y * light_color_y;
                            float lightContrib_z = surface_albedo_z * light_color_z;
                            lit_x += lightContrib_x * k;
                            lit_y += lightContrib_y * k;
                            lit_z += lightContrib_z * k;
                        }
                    }
                }
                // Gamma correct
                // These pows are pretty slow right now, but we can do
                // something faster if really necessary to squeeze every
                // last bit of performance out of it
                float gamma = 1.0 / 2.2f;
                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
            }
        }
    }
}
///////////////////////////////////////////////////////////////////////////
// Static decomposition
// One "task" per screen tile: map taskIndex onto a tile of the static
// MIN_TILE_WIDTH x MIN_TILE_HEIGHT decomposition, cull the global light
// list against it, then shade its pixels.
__global__ void
RenderTile( int num_groups_x, int num_groups_y,
    const InputHeader *inputHeaderPtr,
    const InputDataArrays *inputDataPtr,
    int visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]) {
    // The launch may be padded to a whole number of blocks; extra
    // tasks simply return.
    if (taskIndex >= taskCount) return;
    const InputHeader inputHeader = *inputHeaderPtr;
    const InputDataArrays inputData = *inputDataPtr;
    // Linear task index -> 2D tile coordinates.
    int32 group_y = taskIndex / num_groups_x;
    int32 group_x = taskIndex % num_groups_x;
    int32 tile_start_x = group_x * MIN_TILE_WIDTH;
    int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
    int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
    int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
    int framebufferWidth = inputHeader.framebufferWidth;
    int framebufferHeight = inputHeader.framebufferHeight;
    // Projection matrix terms used for (un)projection.
    // NOTE(review): locals named _22/_32 are passed as the _33/_43
    // parameters below — row-major vs column-major naming; confirm.
    float cameraProj_00 = inputHeader.cameraProj[0][0];
    float cameraProj_11 = inputHeader.cameraProj[1][1];
    float cameraProj_22 = inputHeader.cameraProj[2][2];
    float cameraProj_32 = inputHeader.cameraProj[3][2];
    // Light intersection: figure out which lights illuminate this tile.
    Uniform<int,MAX_LIGHTS> tileLightIndices; // Light list for the tile
#if 1
    int numTileLights =
        IntersectLightsWithTile(tile_start_x, tile_end_x,
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
                                cameraProj_00, cameraProj_11,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
                                inputData.lightPositionView_x,
                                inputData.lightPositionView_y,
                                inputData.lightPositionView_z,
                                inputData.lightAttenuationEnd,
                                tileLightIndices);
    // And now shade the tile, using the lights in tileLightIndices
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
              tileLightIndices, numTileLights, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);
#endif
}
// Device-side entry point mirroring the ispc export function: computes
// the tile grid and launches one RenderTile task per tile via a nested
// kernel launch (CUDA dynamic parallelism), then waits for completion.
extern "C" __global__ void
RenderStatic___export( InputHeader inputHeaderPtr[],
    InputDataArrays inputDataPtr[],
    int visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]) {
    const InputHeader inputHeader = *inputHeaderPtr;
    const InputDataArrays inputData = *inputDataPtr;  // NOTE(review): unused here
    // Number of tiles in each dimension, rounded up so partial tiles
    // at the right/bottom edges are covered.
    int num_groups_x = (inputHeader.framebufferWidth +
                        MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    int num_groups_y = (inputHeader.framebufferHeight +
                        MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    int num_groups = num_groups_x * num_groups_y;
    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
    // by MIN_TILE_HEIGHT pixels.
    // 128 threads per block packs tasks 4 per block (hence the
    // round-up by 4); only lane 0 issues the launch.
    if (programIndex == 0)
        RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y,
                                               inputHeaderPtr, inputDataPtr, visualizeLightCount,
                                               framebuffer_r, framebuffer_g, framebuffer_b);
    cudaDeviceSynchronize();
}
// Host wrapper: launches the export kernel with a single gang (1 block
// of 32 threads) and blocks until all device work — including the
// nested RenderTile launches — completes.
// NOTE(review): all pointer arguments are presumably device pointers;
// confirm against the caller.
extern "C" __host__ void
RenderStatic( InputHeader inputHeaderPtr[],
    InputDataArrays inputDataPtr[],
    int visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]) {
    RenderStatic___export<<<1,32>>>( inputHeaderPtr,
                                     inputDataPtr,
                                     visualizeLightCount,
                                     // Output
                                     framebuffer_r,
                                     framebuffer_g,
                                     framebuffer_b);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,717 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "deferred.h"
#ifdef __NVPTX__
#define uniform_t varying
#else
#define uniform_t uniform
#endif
// Structure-of-arrays view of the deferred-shading inputs: per-pixel
// G-buffer channels followed by per-light attribute arrays.
struct InputDataArrays
{
    float *zBuffer;                    // per-pixel depth-buffer Z
    unsigned int16 *normalEncoded_x;   // half float
    unsigned int16 *normalEncoded_y;   // half float
    unsigned int16 *specularAmount;    // half float
    unsigned int16 *specularPower;     // half float
    unsigned int8 *albedo_x;           // unorm8
    unsigned int8 *albedo_y;           // unorm8
    unsigned int8 *albedo_z;           // unorm8
    // Per-light attributes (view-space position, attenuation, color).
    float *lightPositionView_x;
    float *lightPositionView_y;
    float *lightPositionView_z;
    float *lightAttenuationBegin;
    float *lightColor_x;
    float *lightColor_y;
    float *lightColor_z;
    float *lightAttenuationEnd;
};
// Fixed-size header preceding the input data: camera parameters,
// framebuffer dimensions, and layout of the serialized data chunk.
struct InputHeader
{
    float cameraProj[4][4];            // camera projection matrix
    float cameraNear;                  // near clip plane distance
    float cameraFar;                   // far clip plane distance
    int32 framebufferWidth;
    int32 framebufferHeight;
    int32 numLights;
    int32 inputDataChunkSize;          // total serialized payload size
    int32 inputDataArrayOffsets[idaNum]; // byte offsets of each array in the chunk
};
///////////////////////////////////////////////////////////////////////////
// Common utility routines
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    // 3-component dot product of (x, y, z) and (a, b, c).
    float sum = x * a;
    sum = sum + y * b;
    sum = sum + z * c;
    return sum;
}
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    // Scale (x, y, z) to unit length using a reciprocal square root;
    // the normalized components are written through the output refs.
    float invLen = rsqrt(x*x + y*y + z*z);
    ox = x * invLen;
    oy = y * invLen;
    oz = z * invLen;
}
static inline float
Unorm8ToFloat32(unsigned int8 u) {
    // Expand an 8-bit unorm value in [0, 255] to a float in [0, 1].
    const float scale = 1.0f / 255.0f;
    return (float)u * scale;
}
static inline unsigned int8
Float32ToUnorm8(float f) {
    // Quantize f to an 8-bit unorm; f is expected in [0, 1] (callers
    // clamp before converting).
    float scaled = f * 255.0f;
    return (unsigned int8)scaled;
}
#if 1
inline
#endif
// Compute the conservative view-space depth range [minZ, maxZ] of one
// screen tile by scanning its region of the depth buffer.
static void
ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output (single reduced value per tile)
    uniform float &minZ,
    uniform float &maxZ
    )
{
    // Find Z bounds
    // Per-lane accumulators; starting at the camera extremes means an
    // all-invalid tile yields [far, near].
    float laneMinZ = cameraFar;
    float laneMaxZ = cameraNear;
    for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
        foreach (x = tileStartX ... tileEndX) {
            // Unproject depth buffer Z value into view space
            float z = zBuffer[y * gBufferWidth + x];
            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
            // Work out Z bounds for our samples
            // Avoid considering skybox/background or otherwise invalid pixels
            if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
                laneMinZ = min(laneMinZ, viewSpaceZ);
                laneMaxZ = max(laneMaxZ, viewSpaceZ);
            }
        }
    }
    // Cross-program-instance reduction to one min/max for the tile.
    minZ = reduce_min(laneMinZ);
    maxZ = reduce_max(laneMaxZ);
}
#if 1
inline
#endif
#ifndef __NVPTX__
export
#endif
// Cull the global light list against one tile's frustum, given the
// tile's precomputed view-space depth range [minZ, maxZ].  Surviving
// light indices are packed into tileLightIndices; returns their count.
uniform int32
IntersectLightsWithTileMinMax(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // Tile data
    uniform float minZ,
    uniform float maxZ,
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    uniform int32 tileLightIndices[]
    )
{
    // Derive the four side planes of the tile frustum in view space;
    // each plane is an (xy, z) normal pair (x-planes use the xy slot
    // for x, y-planes for y).
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
    uniform_t float frustumPlanes_xy[4] = {
        -(cameraProj_11 * gBufferScale_x),
        (cameraProj_11 * gBufferScale_x),
        (cameraProj_22 * gBufferScale_y),
        -(cameraProj_22 * gBufferScale_y) };
    uniform_t float frustumPlanes_z[4] = {
        tileEndX - gBufferScale_x,
        -tileStartX + gBufferScale_x,
        tileEndY - gBufferScale_y,
        -tileStartY + gBufferScale_y };
    // Normalize the plane normals so the dot products below are true
    // signed distances.
    for (uniform int i = 0; i < 4; ++i) {
        uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                     frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }
    uniform int32 tileNumLights = 0;
    // One light per program instance per iteration.
    foreach (lightIndex = 0 ... numLights) {
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Depth-range test first: cheap, and rejects most lights.
        float d = light_positionView_z - minZ;
        bool inFrustum = (d >= light_attenuationEndNeg);
        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        // This seems better than cif (!inFrustum) ccontinue; here since we
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out. Could also structure all of this as
        // nested if() statements, but this is a bit easier to read
        if (any(inFrustum)) {
            // Signed distance of the light center to each of the four
            // side planes, compared against its attenuation radius.
            float light_positionView_x = light_positionView_x_array[lightIndex];
            float light_positionView_y = light_positionView_y_array[lightIndex];
            d = light_positionView_z * frustumPlanes_z[0] +
                light_positionView_x * frustumPlanes_xy[0];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[1] +
                light_positionView_x * frustumPlanes_xy[1];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[2] +
                light_positionView_y * frustumPlanes_xy[2];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[3] +
                light_positionView_y * frustumPlanes_xy[3];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
#if 0
            // Pack and store intersecting lights
            cif (inFrustum) {
                tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
                                                     lightIndex);
            }
#else
            // Compact the surviving lane indices contiguously into the
            // output list.
            const bool active = inFrustum && lightIndex < numLights;
            if(any(active))
                tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
#endif
        }
    }
    return tileNumLights;
}
#if 1
inline
#endif
// Cull the light list against one screen tile: first compute the tile's
// view-space depth range from the Z buffer, then run the frustum/depth
// intersection over the lights.  Returns the number of lights written
// to tileLightIndices.
static uniform int32
IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // G-buffer data
    uniform float zBuffer[],
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    uniform int32 tileLightIndices[]
    )
{
    uniform float minZ, maxZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
        minZ, maxZ);
    // BUGFIX: forward the caller-supplied numLights instead of the
    // MAX_LIGHTS compile-time bound, so a partially-filled light array
    // is not scanned past its valid entries.  (Existing callers pass
    // numLights == MAX_LIGHTS, so their behavior is unchanged.)
    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        numLights, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);
    return tileNumLights;
}
#if 1
inline
#endif
#ifndef __NVPTX__
export
#endif
// Shade every pixel of one screen tile using only the lights that were
// found to intersect it (tileLightIndices / tileNumLights).  When the
// tile has no lights, or light-count visualization is requested, a flat
// gray proportional to the light count is written instead.
void
ShadeTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    uniform InputDataArrays &inputData,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    // Light list
    uniform int32 tileLightIndices[],
    uniform int32 tileNumLights,
    // UI
    uniform bool visualizeLightCount,
    // Output
    uniform unsigned int8 framebuffer_r[],
    uniform unsigned int8 framebuffer_g[],
    uniform unsigned int8 framebuffer_b[]
    )
{
    if (tileNumLights == 0 || visualizeLightCount) {
        // Flat fill: brightness scales with light count (saturating at 255).
        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            foreach (x = tileStartX ... tileEndX) {
                int32 framebufferIndex = (y * gBufferWidth + x);
                framebuffer_r[framebufferIndex] = c;
                framebuffer_g[framebufferIndex] = c;
                framebuffer_b[framebufferIndex] = c;
            }
        }
    } else {
        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
            foreach (x = tileStartX ... tileEndX) {
                int32 gBufferOffset = y * gBufferWidth + x;
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;
                float z = inputData.zBuffer[gBufferOffset];
                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
                float positionScreen_x = (0.5f + (float)(x)) *
                    twoOverGBufferWidth - 1.0f;
                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
                surface_positionView_x = positionScreen_x * surface_positionView_z /
                    cameraProj_11;
                surface_positionView_y = positionScreen_y * surface_positionView_z /
                    cameraProj_22;
                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
                normalize3(surface_positionView_x, surface_positionView_y,
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
                // Reconstruct normal from G-buffer (two half-float
                // encoded components; z recovered from the decode below)
                float surface_normal_x, surface_normal_y, surface_normal_z;
                float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrt(4.0f * f - 1.0f);
                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;
                // Load other G-buffer parameters
                float surface_specularAmount =
                    half_to_float(inputData.specularAmount[gBufferOffset]);
                float surface_specularPower =
                    half_to_float(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
                // Accumulated lit color for this pixel.
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
                     ++tileLightIndex) {
                    uniform int32 lightIndex = tileLightIndices[tileLightIndex];
                    // Gather light data relevant to initial culling
                    uniform float light_positionView_x =
                        inputData.lightPositionView_x[lightIndex];
                    uniform float light_positionView_y =
                        inputData.lightPositionView_y[lightIndex];
                    uniform float light_positionView_z =
                        inputData.lightPositionView_z[lightIndex];
                    uniform float light_attenuationEnd =
                        inputData.lightAttenuationEnd[lightIndex];
                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;
                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
                    // Clip at end of attenuation
                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
                    cif (distanceToLight2 < light_attenutaionEnd2) {
                        float distanceToLight = sqrt(distanceToLight2);
                        // HLSL "rcp" is allowed to be fairly inaccurate
                        float distanceToLightRcp = rcp(distanceToLight);
                        L_x *= distanceToLightRcp;
                        L_y *= distanceToLightRcp;
                        L_z *= distanceToLightRcp;
                        // Start computing brdf
                        float NdotL = dot3(surface_normal_x, surface_normal_y,
                                           surface_normal_z, L_x, L_y, L_z);
                        // Clip back facing
                        cif (NdotL > 0.0f) {
                            uniform float light_attenuationBegin =
                                inputData.lightAttenuationBegin[lightIndex];
                            // Light distance attenuation (linstep)
                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
                            float falloffPosition = (light_attenuationEnd - distanceToLight);
                            float attenuation = min(falloffPosition / lightRange, 1.0f);
                            // Blinn-Phong half vector (V = -Vneg, so H ~ L - Vneg)
                            float H_x = (L_x - Vneg_x);
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
                            float NdotH = dot3(surface_normal_x, surface_normal_y,
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = max(NdotH, 0.0f);
                            float specular = pow(NdotH, surface_specularPower);
                            float specularNorm = (surface_specularPower + 2.0f) *
                                (1.0f / 8.0f);
                            float specularContrib = surface_specularAmount *
                                specularNorm * specular;
                            float k = attenuation * NdotL * (1.0f + specularContrib);
                            uniform float light_color_x = inputData.lightColor_x[lightIndex];
                            uniform float light_color_y = inputData.lightColor_y[lightIndex];
                            uniform float light_color_z = inputData.lightColor_z[lightIndex];
                            float lightContrib_x = surface_albedo_x * light_color_x;
                            float lightContrib_y = surface_albedo_y * light_color_y;
                            float lightContrib_z = surface_albedo_z * light_color_z;
                            lit_x += lightContrib_x * k;
                            lit_y += lightContrib_y * k;
                            lit_z += lightContrib_z * k;
                        }
                    }
                }
                // Gamma correct
                // These pows are pretty slow right now, but we can do
                // something faster if really necessary to squeeze every
                // last bit of performance out of it
                float gamma = 1.0 / 2.2f;
                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
            }
        }
    }
}
///////////////////////////////////////////////////////////////////////////
// Static decomposition
// One task per screen tile: map taskIndex onto a tile of the static
// MIN_TILE_WIDTH x MIN_TILE_HEIGHT decomposition, cull the global light
// list against it, then shade its pixels.
task void
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
           uniform InputHeader inputHeaderPtr[],
           uniform InputDataArrays inputDataPtr[],
           uniform int visualizeLightCount,
           // Output
           uniform unsigned int8 framebuffer_r[],
           uniform unsigned int8 framebuffer_g[],
           uniform unsigned int8 framebuffer_b[]) {
    uniform InputHeader inputHeader = *inputHeaderPtr;
    uniform InputDataArrays inputData = *inputDataPtr;
    // Linear task index -> 2D tile coordinates.
    uniform int32 group_y = taskIndex / num_groups_x;
    uniform int32 group_x = taskIndex % num_groups_x;
    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
    uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
    uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
    uniform int framebufferWidth = inputHeader.framebufferWidth;
    uniform int framebufferHeight = inputHeader.framebufferHeight;
    // Projection matrix terms used for (un)projection.
    // NOTE(review): locals named _22/_32 are passed as the _33/_43
    // parameters below — row-major vs column-major naming; confirm.
    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
    // Light intersection: figure out which lights illuminate this tile.
#if 1
    // Heap-allocate the per-tile light list; freed at the end of the task.
    uniform int * uniform tileLightIndices = uniform new uniform int [MAX_LIGHTS];
#define MALLOC
#else /* shared memory doesn't fully work... why? */
    uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
#endif
    uniform int numTileLights =
        IntersectLightsWithTile(tile_start_x, tile_end_x,
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
                                cameraProj_00, cameraProj_11,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
                                inputData.lightPositionView_x,
                                inputData.lightPositionView_y,
                                inputData.lightPositionView_z,
                                inputData.lightAttenuationEnd,
                                tileLightIndices);
    // And now shade the tile, using the lights in tileLightIndices
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
              tileLightIndices, numTileLights, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);
#ifdef MALLOC
    delete tileLightIndices;
#endif
}
// Application entry point for the static decomposition: computes the
// tile grid and launches one RenderTile task per tile.
export void
RenderStatic(uniform InputHeader inputHeaderPtr[],
             uniform InputDataArrays inputDataPtr[],
             uniform int visualizeLightCount,
             // Output
             uniform unsigned int8 framebuffer_r[],
             uniform unsigned int8 framebuffer_g[],
             uniform unsigned int8 framebuffer_b[]) {
    uniform InputHeader inputHeader = *inputHeaderPtr;
    uniform InputDataArrays inputData = *inputDataPtr;
    // Number of tiles in each dimension, rounded up so partial tiles
    // at the right/bottom edges are covered.
    uniform int num_groups_x = (inputHeader.framebufferWidth +
                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int num_groups_y = (inputHeader.framebufferHeight +
                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    uniform int num_groups = num_groups_x * num_groups_y;
    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
    // by MIN_TILE_HEIGHT pixels.
    launch[num_groups] RenderTile(num_groups_x, num_groups_y,
                                  inputHeaderPtr, inputDataPtr, visualizeLightCount,
                                  framebuffer_r, framebuffer_g, framebuffer_b);
}
///////////////////////////////////////////////////////////////////////////
// Routines for dynamic decomposition path
// This computes the z min/max range for a whole row worth of tiles.
// Compute the view-space min/max Z for every tile in one row of the
// tile grid; results go to minZArray/maxZArray, indexed by tile column.
export void
ComputeZBoundsRow(
    uniform int32 tileY,
    uniform int32 tileWidth, uniform int32 tileHeight,
    uniform int32 numTilesX, uniform int32 numTilesY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    uniform float minZArray[],
    uniform float maxZArray[]
    )
{
    // Every tile in this row covers the same vertical span; hoist it.
    uniform int32 rowStartY = tileY * tileHeight;
    uniform int32 rowEndY = rowStartY + tileHeight;
    // Walk the row left to right, one tile per iteration.
    for (uniform int32 tx = 0; tx < numTilesX; ++tx) {
        uniform int32 colStartX = tx * tileWidth;
        uniform float tileMinZ, tileMaxZ;
        ComputeZBounds(
            colStartX, colStartX + tileWidth,
            rowStartY, rowEndY,
            zBuffer, gBufferWidth,
            cameraProj_33, cameraProj_43, cameraNear, cameraFar,
            tileMinZ, tileMaxZ);
        minZArray[tx] = tileMinZ;
        maxZArray[tx] = tileMaxZ;
    }
}
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
// numLights need not be a multiple of programCount here, but the input and output arrays
// should be able to handle programCount-sized load/stores.
// Reclassifies the lights of a tile with respect to its four subtiles
// (00, 10, 01, 11) when the tile is refined.  Each light is tested against
// the subtiles' z ranges and against the two frustum split planes through
// (tileMidX, tileMidY); surviving light indices are appended to each
// subtile's list with packed_store_active.
//
// Fix: the frustum-plane locals were declared with "uniform_t", which is
// not an ispc type qualifier and does not compile; they are plain
// "uniform" declarations.
export void
SplitTileMinMax(
    uniform int32 tileMidX, uniform int32 tileMidY,
    // Subtile data (00, 10, 01, 11)
    uniform float subtileMinZ[],
    uniform float subtileMaxZ[],
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Outputs
    uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
    uniform int32 subtileNumLights[]
    )
{
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
    // Unnormalized split planes: [0] splits left/right, [1] splits top/bottom.
    uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
                                          (cameraProj_22 * gBufferScale_y) };
    uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
                                         tileMidY - gBufferScale_y };
    // Normalize
    uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
    frustumPlanes_xy[0] *= norm[0];
    frustumPlanes_xy[1] *= norm[1];
    frustumPlanes_z[0] *= norm[0];
    frustumPlanes_z[1] *= norm[1];
    // Initialize: each subtile writes into its own slice of subtileIndices.
    uniform int32 subtileLightOffset[4];
    subtileLightOffset[0] = 0 * subtileIndicesPitch;
    subtileLightOffset[1] = 1 * subtileIndicesPitch;
    subtileLightOffset[2] = 2 * subtileIndicesPitch;
    subtileLightOffset[3] = 3 * subtileIndicesPitch;
    foreach (i = 0 ... numLights) {
        int32 lightIndex = lightIndices[i];
        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Test lights against subtile z bounds
        bool inFrustum[4];
        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
                       (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
                       (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
                       (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
                       (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
        // Signed distances to the vertical (dx) and horizontal (dy) split planes.
        float dx = light_positionView_z * frustumPlanes_z[0] +
                   light_positionView_x * frustumPlanes_xy[0];
        float dy = light_positionView_z * frustumPlanes_z[1] +
                   light_positionView_y * frustumPlanes_xy[1];
        // A light farther from a split plane than its attenuation radius can
        // only affect the subtiles on one side of that plane.
        cif (abs(dx) > light_attenuationEnd) {
            bool positiveX = dx > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveX;  // 00 subtile
            inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile
            inFrustum[2] = inFrustum[2] && positiveX;  // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile
        }
        cif (abs(dy) > light_attenuationEnd) {
            bool positiveY = dy > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveY;  // 00 subtile
            inFrustum[1] = inFrustum[1] && positiveY;  // 10 subtile
            inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile
        }
        // Pack and store intersecting lights
        // TODO: Experiment with a loop here instead
        cif (inFrustum[0])
            subtileLightOffset[0] +=
                packed_store_active(&subtileIndices[subtileLightOffset[0]],
                                    lightIndex);
        cif (inFrustum[1])
            subtileLightOffset[1] +=
                packed_store_active(&subtileIndices[subtileLightOffset[1]],
                                    lightIndex);
        cif (inFrustum[2])
            subtileLightOffset[2] +=
                packed_store_active(&subtileIndices[subtileLightOffset[2]],
                                    lightIndex);
        cif (inFrustum[3])
            subtileLightOffset[3] +=
                packed_store_active(&subtileIndices[subtileLightOffset[3]],
                                    lightIndex);
    }
    // Final per-subtile light counts = end offset minus slice base.
    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
}

View File

@@ -0,0 +1,107 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define ISPC_IS_WINDOWS
#define NOMINMAX
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#include <fcntl.h>
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <sys/types.h>
#include <stdint.h>
#include <algorithm>
#include <cassert>
#include <vector>
#ifdef ISPC_IS_WINDOWS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#include "deferred.h"
#include "kernels_ispc.h"
#include "timing.h"
#include "ispc_malloc.h"
///////////////////////////////////////////////////////////////////////////
// Benchmarks the static-decomposition deferred-shading path: loads a
// captured G-buffer, renders it nframes times per repetition with
// ispc::RenderStatic, reports the best per-frame time, and writes the last
// frame out as a PPM image.
int main(int argc, char** argv) {
    if (argc < 2) {
        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)> [tasks iterations] [serial iterations]\n");
        return 1;
    }
    // Defaults: 5 ispc+tasks repetitions, 3 serial repetitions (not used in
    // this build), 500 frames per repetition (a scale factor for timing).
    static unsigned int test_iterations[] = {5, 3, 500}; //last value is for nframes, it is scale.
    if (argc == 5) {
        // All three counts supplied on the command line.
        for (int i = 0; i < 3; i++) {
            test_iterations[i] = atoi(argv[2 + i]);
        }
    }
    InputData *input = CreateInputDataFromFile(argv[1]);
    if (!input) {
        printf("Failed to load input file \"%s\"!\n", argv[1]);
        return 1;
    }
    Framebuffer framebuffer(input->header.framebufferWidth,
                            input->header.framebufferHeight);
    int nframes = test_iterations[2];
    // Track the best (minimum) average per-frame time over all repetitions.
    double ispcCycles = 1e30;
    for (int i = 0; i < test_iterations[0]; ++i) {
        framebuffer.clear();
        reset_and_start_timer();
        for (int j = 0; j < nframes; ++j)
            ispc::RenderStatic(&input->header, &input->arrays,
                               VISUALIZE_LIGHT_COUNT,
                               framebuffer.r, framebuffer.g, framebuffer.b);
        // Average one repetition over its nframes renders.
        double msec = get_elapsed_msec() / nframes;
        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec [%.3f fps]\n", msec, 1.0e3/msec);
        ispcCycles = std::min(ispcCycles, msec);
    }
    printf("[ispc static + tasks]:\t\t[%.3f] msec to render "
           "%d x %d image\n", ispcCycles,
           input->header.framebufferWidth, input->header.framebufferHeight);
    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
    DeleteInputData(input);
    return 0;
}

View File

@@ -0,0 +1,12 @@
# Build configuration for the CPU build of the mergeSort example; the shared
# rules live in ../common_cpu.mk.
EXAMPLE=mergeSort
CPP_SRC=mergeSort.cpp
ISPC_SRC=mergeSort.ispc
# ISPC code generation targets for x86 and ARM.
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
#ISPC_FLAGS=-DDEBUG -g
# Host compilers build with debug info.
CXXFLAGS=-g
CCFLAGS=-g
#NVCC_FLAGS=-Xptxas=-O0
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
# Build configuration for the KNC (Xeon Phi) build of the mergeSort example;
# compiles the ISPC source against the generic-16 target with the KNC
# intrinsics header. Shared rules live in ../common_knc.mk.
EXAMPLE=mergeSort
CXX_SRC=mergeSort.cpp
ISPC_SRC=mergeSort.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,15 @@
# Build configuration for the PTX (GPU) build of the mergeSort example;
# shared rules live in ../common_ptx.mk.
PROG=mergeSort
ISPC_SRC=mergeSort.ispc
CU_SRC=mergeSort.cu
# Fix: the host source was listed twice ("mergeSort.cpp mergeSort.cpp"),
# which compiles/links the same translation unit twice.
CXX_SRC=mergeSort.cpp
# Cap register usage per thread for the PTX backend.
PTXCC_REGMAX=64
#PTXCC_FLAGS= -Xptxas=-O3
#NVCC_FLAGS=-Xptxas=-O0
LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,3 @@
// Shared key/value element types for the mergeSort example, included by the
// host (.cpp), ISPC (.ispc), and CUDA (.cu) sources so all three agree.
#pragma once
typedef float Key_t; // sort key
typedef int Val_t;   // payload carried alongside each key

View File

@@ -0,0 +1,171 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <random>
#include "timing.h"
#include "ispc_malloc.h"
#include "mergeSort_ispc.h"
// Renders a one-line text progress bar such as "[=====  42 %      ]" to
// stdout, overwriting itself with '\r' until the final iteration, which
// ends with '\n'.
//   x     - current iteration, in [0, n)
//   n     - total number of iterations (must be > 1)
//   width - bar width in characters (must be > 10 so the label fits)
static void progressBar(const int x, const int n, const int width = 50)
{
    assert(n > 1);
    assert(x >= 0 && x < n);
    assert(width > 10);
    const float f = static_cast<float>(x)/(n-1);
    const int w = static_cast<int>(f * width);
    // print bar
    std::string bstr("[");
    for (int i = 0; i < width; i++)
        bstr += i < w ? '=' : ' ';
    bstr += "]";
    // Format the percentage label; snprintf bounds the write to the buffer
    // (the original sprintf had no bound).
    char pstr0[32];
    snprintf(pstr0, sizeof(pstr0), " %2d %c ", static_cast<int>(f*100.0), '%');
    const std::string pstr(pstr0);
    // Splice the label into the middle of the bar (fits because width > 10).
    std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2));
    std::cout << bstr;
    std::cout << (x == n-1 ? "\n" : "\r") << std::flush;
}
#include "keyType.h"
// One sortable element: a key plus the payload value that must travel with
// it through the sort.
struct Key
{
    Key_t key; // sort key
    Val_t val; // associated payload
};
// Driver for the mergeSort example: builds a random permutation of the keys
// 0..n-1, sorts it with the ISPC task-parallel merge sort, times the best of
// m repetitions, and verifies the result against std::sort.
//
//   argv[1] (optional): element count n (default 1M)
//   argc == 3 suppresses the progress bar.
//
// Fixes versus the original: every array allocated with new[] is released
// with delete[] (plain delete on a new[] pointer is undefined behavior),
// std::random_shuffle (removed in C++17) is replaced by std::shuffle, and
// the unused locals j, l, tISPC1 and tSerial are dropped.
int main (int argc, char *argv[])
{
    // n: problem size; m: timing repetitions (single run for tiny inputs).
    int n = argc == 1 ? 1024*1024 : atoi(argv[1]), m = n < 100 ? 1 : 50;
    Key *keys = new Key[n];
#pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        keys[i].key = i; //((int)(drand48() * (1<<30)));
        keys[i].val = i;
    }
    // Shuffle into a random permutation, seeded from the cycle counter.
    std::mt19937 rng(static_cast<unsigned int>(rtc()*65536));
    std::shuffle(keys, keys + n, rng);
    // Split into parallel key/value arrays: Src is the working input, Buf is
    // scratch, Dst receives the sorted output, Gld keeps a pristine copy.
    Key_t *keysSrc = new Key_t[n];
    Val_t *valsSrc = new Val_t[n];
    Key_t *keysBuf = new Key_t[n];
    Val_t *valsBuf = new Val_t[n];
    Key_t *keysDst = new Key_t[n];
    Val_t *valsDst = new Val_t[n];
    Key_t *keysGld = new Key_t[n];
    Val_t *valsGld = new Val_t[n];
#pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        keysSrc[i] = keys[i].key;
        valsSrc[i] = keys[i].val;
        keysGld[i] = keysSrc[i];
        valsGld[i] = valsSrc[i];
    }
    delete [] keys;
    ispcSetMallocHeapLimit(1024*1024*1024);
    ispc::openMergeSort();
    double tISPC2 = 1e30; // best (minimum) time over m runs, msec
    for (int i = 0; i < m; i ++)
    {
        // Restore the unsorted input before each timed run.
        ispcMemcpy(keysSrc, keysGld, n*sizeof(Key_t));
        ispcMemcpy(valsSrc, valsGld, n*sizeof(Val_t));
        reset_and_start_timer();
        ispc::mergeSort(keysDst, valsDst, keysBuf, valsBuf, keysSrc, valsSrc, n);
        tISPC2 = std::min(tISPC2, get_elapsed_msec());
        if (argc != 3)
            progressBar (i, m);
    }
    ispc::closeMergeSort();
    printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);
#if 0
    // Debug dump of the first 128 entries of each array.
    printf("\n---\n");
    for (int i = 0; i < 128; i++)
    {
        if ((i%32) == 0) printf("\n");
        printf("%d ", (int)keysSrc[i]);
    }
    printf("\n---\n");
    for (int i = 0; i < 128; i++)
    {
        if ((i%32) == 0) printf("\n");
        printf("%d ", (int)keysBuf[i]);
    }
    printf("\n---\n");
    for (int i = 0; i < 128; i++)
    {
        if ((i%32) == 0) printf("\n");
        printf("%d ", (int)keysDst[i]);
    }
    printf("\n---\n");
#endif
    // Verify against the reference: sort the gold copy and compare keys.
    std::sort(keysGld, keysGld + n);
    for (int i = 0; i < n; i++)
        assert(keysDst[i] == keysGld[i]);
    delete [] keysSrc;
    delete [] valsSrc;
    delete [] keysDst;
    delete [] valsDst;
    delete [] keysBuf;
    delete [] valsBuf;
    delete [] keysGld;
    delete [] valsGld;
    return 0;
}

View File

@@ -0,0 +1,694 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on mergeSort from CUDA SDK
*/
#include "keyType.h"
#include "cuda_helpers.cuh"
#include <cassert>
#define uniform
#define SAMPLE_STRIDE programCount
#define iDivUp(a,b) (((a) + (b) - 1)/(b))
#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
#define W (/*sizeof(int)=*/4 * 8)
// Rounds x up to the next power of two; returns x unchanged when x is
// already a power of two (assumes x >= 1).
__device__ static inline
int nextPowerOfTwo(int x)
{
#if 0
    // Portable bit-smearing variant, kept for reference.
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
#else
    // W is the int bit width (32); __clz counts leading zero bits of x - 1.
    return 1U << (W - __clz(x - 1));
#endif
}
// Inclusive rank: number of entries in the ascending array data[0..L-1]
// that are <= val.  stride is the initial probe step; callers pass
// nextPowerOfTwo(L).  ("uniform" expands to nothing in this .cu file.)
__device__ static inline
int binarySearchInclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    // Binary search by halving the probe stride; advance pos while the
    // probed element is still <= val.
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] <= val)
            pos = newPos;
    }
    return pos;
}
// Exclusive rank: number of entries in the ascending array data[0..L-1]
// that are strictly < val.  Same search scheme as the inclusive variant,
// with a strict comparison.
__device__ static inline
int binarySearchExclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] < val)
            pos = newPos;
    }
    return pos;
}
// Inclusive rank over keys: number of entries in the ascending array
// data[0..L-1] that are <= val.  Identical scheme to
// binarySearchInclusiveRanks but over Key_t values.
__device__ static inline
int binarySearchInclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] <= val)
            pos = newPos;
    }
    return pos;
}
// Exclusive rank over keys: number of entries in the ascending array
// data[0..L-1] that are strictly < val.
__device__ static inline
int binarySearchExclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] < val)
            pos = newPos;
    }
    return pos;
}
// Inclusive rank where the "array" is a per-lane register value: lane i of
// the warp holds element i, and shuffle(data, i) reads lane i's copy
// (shuffle is presumably a __shfl wrapper from cuda_helpers.cuh -- confirm
// there).  L is at most the warp width.
__device__ static inline
int binarySearchInclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (shuffle(data,newPos - 1) <= val)
            pos = newPos;
    }
    return pos;
}
// Exclusive-rank counterpart of binarySearchInclusive1: strict comparison
// against values distributed one-per-lane and read via shuffle.
__device__ static inline
int binarySearchExclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (shuffle(data,newPos - 1) < val)
            pos = newPos;
    }
    return pos;
}
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
// Bottom-level sort: sorts each 2*programCount-element segment of the input
// in shared memory using an iterative binary-search merge (doubling stride
// each pass).  batchSize is the number of segments; taskIndex/taskCount/
// programCount/programIndex/warpIdx come from cuda_helpers.cuh and mirror
// ISPC's task/program-index model -- NOTE(review): confirm their exact
// definitions there.
__global__
void mergeSortGangKernel(
    uniform int batchSize,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[])
{
    // Evenly partition the segments across tasks.
    const uniform int blkIdx = taskIndex;
    const uniform int blkDim = (batchSize + taskCount - 1)/taskCount;
    const uniform int blkBeg = blkIdx * blkDim;
    const uniform int blkEnd = min(blkBeg + blkDim, batchSize);
    // Shared staging sized for 4 warps per block; each warp uses its own
    // 2*programCount-element slice selected by warpIdx.
    __shared__ Key_t s_key_tmp[2*programCount*4];
    __shared__ Val_t s_val_tmp[2*programCount*4];
    Key_t *s_key = s_key_tmp + warpIdx*(2*programCount);
    Val_t *s_val = s_val_tmp + warpIdx*(2*programCount);
    for (uniform int blk = blkBeg; blk < blkEnd; blk++)
    {
        const uniform int base = blk * (programCount*2);
        // Load the segment: each lane loads two elements.
        s_key[programIndex + 0] = srcKey[base + programIndex + 0];
        s_val[programIndex + 0] = srcVal[base + programIndex + 0];
        s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
        s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
        // Merge sorted runs of length `stride` pairwise until the whole
        // segment is sorted.
        for (uniform int stride = 1; stride < 2*programCount; stride <<= 1)
        {
            const int lPos = programIndex & (stride - 1);
            uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
            uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
            Key_t keyA = baseKey[lPos + 0];
            Val_t valA = baseVal[lPos + 0];
            Key_t keyB = baseKey[lPos + stride];
            Val_t valB = baseVal[lPos + stride];
            // Each element's final position is its own index plus its rank
            // in the opposite run (exclusive/inclusive keeps stability).
            int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
            int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos;
            baseKey[posA] = keyA;
            baseVal[posA] = valA;
            baseKey[posB] = keyB;
            baseVal[posB] = valB;
        }
        // Write the sorted segment back out.
        dstKey[base + programIndex + 0] = s_key[programIndex + 0];
        dstVal[base + programIndex + 0] = s_val[programIndex + 0];
        dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
        dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
    }
}
// Launches mergeSortGangKernel over all batchSize segments and waits for
// completion (launch/sync are macros from cuda_helpers.cuh).
__device__ static inline
void mergeSortGang(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int batchSize)
{
    // One task per segment.
    uniform int nTasks = batchSize;
    launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal);
    sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: for every SAMPLE_STRIDE-th element of each pair of adjacent
// sorted runs (A = first `stride` elements, B = the rest of the segment),
// record its rank in its own run and its binary-searched rank in the
// opposite run.  totalProgramCount bounds the number of active work items.
__global__
void generateSampleRanksKernel(
    uniform int nBlocks,
    uniform int in_ranksA[],
    uniform int in_ranksB[],
    uniform Key_t in_srcKey[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
    // Evenly partition the blocks across tasks.
    const uniform int blkIdx = taskIndex;
    const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
    const uniform int blkBeg = blkIdx * blkDim;
    const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
    for (uniform int blk = blkBeg; blk < blkEnd; blk++)
    {
        const int pos = blk * programCount + programIndex;
        cif (pos >= totalProgramCount)
            return;
        // i = sample index within this segment; segmentBase = segment start.
        const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        uniform Key_t * srcKey = in_srcKey + segmentBase;
        uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
        uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
        // Run B may be short (or empty) in the last segment.
        const int segmentElementsA = stride;
        const int segmentElementsB = min(stride, N - segmentBase - stride);
        const int segmentSamplesA = getSampleCount(segmentElementsA);
        const int segmentSamplesB = getSampleCount(segmentElementsB);
        if (i < segmentSamplesA)
        {
            // Rank of A's i-th sample: trivial in A, searched in B.
            ranksA[i] = i * SAMPLE_STRIDE;
            ranksB[i] = binarySearchExclusive(
                srcKey[i * SAMPLE_STRIDE], srcKey + stride,
                segmentElementsB, nextPowerOfTwo(segmentElementsB));
        }
        if (i < segmentSamplesB)
        {
            // Rank of B's i-th sample: trivial in B, searched in A
            // (inclusive, so equal keys keep A before B).
            ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
            ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
                srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
                segmentElementsA, nextPowerOfTwo(segmentElementsA));
        }
    }
}
// Host-side (device-launched) wrapper for merge step 1: computes how many
// sample work items exist for the current pass and launches the kernel.
__device__ static inline
void generateSampleRanks(
    uniform int ranksA[],
    uniform int ranksB[],
    uniform Key_t srcKey[],
    uniform int stride,
    uniform int N)
{
    // A trailing partial segment only participates if it is longer than one
    // full run (otherwise it is already sorted and is copied through).
    uniform int lastSegmentElements = N % (2 * stride);
    uniform int threadCount = (lastSegmentElements > stride) ?
        (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
        (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
    uniform int nTasks = nBlocks;
    launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
    sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merges each segment's A-ranks and B-ranks into a single
// sorted `limits` array, which delimits the elementary intervals merged in
// step 3.
__global__
void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
    uniform int in_Ranks[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
    // Evenly partition the blocks across tasks.
    const uniform int blkIdx = taskIndex;
    const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
    const uniform int blkBeg = blkIdx * blkDim;
    const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
    for (uniform int blk = blkBeg; blk < blkEnd; blk++)
    {
        int pos = blk * programCount + programIndex;
        cif (pos >= totalProgramCount)
            return;
        // i = sample index within the segment; ranks/limits hold two runs
        // of samples (A then B) per segment.
        const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        uniform int * ranks = in_Ranks + (pos - i) * 2;
        uniform int * limits = in_Limits + (pos - i) * 2;
        const int segmentElementsA = stride;
        const int segmentElementsB = min(stride, N - segmentBase - stride);
        const int segmentSamplesA = getSampleCount(segmentElementsA);
        const int segmentSamplesB = getSampleCount(segmentElementsB);
        if (i < segmentSamplesA)
        {
            // Destination of A-rank i = i plus its rank among the B-ranks.
            int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
            limits[dstPos] = ranks[i];
        }
        if (i < segmentSamplesB)
        {
            // Destination of B-rank i = i plus its rank among the A-ranks.
            int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
            limits[dstPos] = ranks[segmentSamplesA + i];
        }
    }
}
// Wrapper for merge step 2: runs the rank-merging kernel once for the
// A-side limits and once for the B-side limits, then waits for both.
__device__ static inline
void mergeRanksAndIndices(
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int ranksA[],
    uniform int ranksB[],
    uniform int stride,
    uniform int N)
{
    // Same work-item count rule as generateSampleRanks.
    const uniform int lastSegmentElements = N % (2 * stride);
    const uniform int threadCount = (lastSegmentElements > stride) ?
        (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
        (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
    uniform int nTasks = nBlocks;
    launch (nTasks,1,1,mergeRanksAndIndicesKernel)(
        nBlocks,
        limitsA,
        ranksA,
        stride,
        N,
        threadCount);
    launch (nTasks,1,1, mergeRanksAndIndicesKernel)(
        nBlocks,
        limitsB,
        ranksB,
        stride,
        N,
        threadCount);
    sync;
}
// Merge step 3: merges each elementary interval (at most SAMPLE_STRIDE
// elements from run A and from run B, delimited by limitsA/limitsB) into
// its final position.  Each warp/gang merges one interval per iteration,
// using warp-shuffle binary searches so neither side touches memory during
// the rank computation.
__global__
void mergeElementaryIntervalsKernel(
    uniform int mergePairs,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
    // Evenly partition the intervals across tasks.
    const uniform int blkIdx = taskIndex;
    const uniform int blkDim = (mergePairs + taskCount - 1)/taskCount;
    const uniform int blkBeg = blkIdx * blkDim;
    const uniform int blkEnd = min(blkBeg + blkDim, mergePairs);
    for (uniform int blk = blkBeg; blk < blkEnd; blk++)
    {
        // intervalI = interval index within its segment; segmentBase = the
        // segment's first element.
        const int uniform intervalI = blk & ((2 * stride) / SAMPLE_STRIDE - 1);
        const int uniform segmentBase = (blk - intervalI) * SAMPLE_STRIDE;
        //Set up threadblk-wide parameters
        const uniform int segmentElementsA = stride;
        const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
        const uniform int segmentSamplesA = getSampleCount(segmentElementsA);
        const uniform int segmentSamplesB = getSampleCount(segmentElementsB);
        const uniform int segmentSamples = segmentSamplesA + segmentSamplesB;
        // Interval bounds in A and B; the last interval of a segment runs to
        // the end of each run.
        const uniform int startSrcA = limitsA[blk];
        const uniform int startSrcB = limitsB[blk];
        const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[blk + 1] : segmentElementsA;
        const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[blk + 1] : segmentElementsB;
        const uniform int lenSrcA = endSrcA - startSrcA;
        const uniform int lenSrcB = endSrcB - startSrcB;
        const uniform int startDstA = startSrcA + startSrcB;
        const uniform int startDstB = startDstA + lenSrcA;
        //Load main input data
        Key_t keyA, keyB;
        Val_t valA, valB;
        if (programIndex < lenSrcA)
        {
            keyA = srcKey[segmentBase + startSrcA + programIndex];
            valA = srcVal[segmentBase + startSrcA + programIndex];
        }
        if (programIndex < lenSrcB)
        {
            keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
            valB = srcVal[segmentBase + stride + startSrcB + programIndex];
        }
        // Compute destination addresses for merge data
        // (rank of each element in the opposite side, searched lane-to-lane
        // via shuffle; dstA/dstB stay -1 for inactive lanes).
        int dstPosA, dstPosB, dstA = -1, dstB = -1;
        if (any(programIndex < lenSrcA))
            dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
        if (any(programIndex < lenSrcB))
            dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
        // Map merged positions back to absolute destinations: positions
        // below lenSrcA land in the A slot, the rest in the B slot.
        if (programIndex < lenSrcA && dstPosA < lenSrcA)
            dstA = segmentBase + startDstA + dstPosA;
        dstPosA -= lenSrcA;
        if (programIndex < lenSrcA && dstPosA < lenSrcB)
            dstA = segmentBase + startDstB + dstPosA;
        if (programIndex < lenSrcB && dstPosB < lenSrcA)
            dstB = segmentBase + startDstA + dstPosB;
        dstPosB -= lenSrcA;
        if (programIndex < lenSrcB && dstPosB < lenSrcB)
            dstB = segmentBase + startDstB + dstPosB;
        // store merge data
        if (dstA >= 0)
        {
            // int dstA = segmentBase + startSrcA + programIndex;
            dstKey[dstA] = keyA;
            dstVal[dstA] = valA;
        }
        if (dstB >= 0)
        {
            // int dstB = segmentBase + stride + startSrcB + programIndex;
            dstKey[dstB] = keyB;
            dstVal[dstB] = valB;
        }
    }
}
// Wrapper for merge step 3: derives the number of elementary intervals for
// this pass and launches the merging kernel.
// NOTE(review): the incoming nTasks value is overwritten below and never
// read -- the parameter is effectively unused; confirm before removing.
__device__ static inline
void mergeElementaryIntervals(
    uniform int nTasks,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
    const uniform int lastSegmentElements = N % (2 * stride);
    const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
    nTasks = mergePairs/(programCount);
    launch (nTasks,1,1, mergeElementaryIntervalsKernel)(
        mergePairs,
        dstKey,
        dstVal,
        srcKey,
        srcVal,
        limitsA,
        limitsB,
        stride,
        N);
    sync;
}
// Device-global scratch state, set up by openMergeSort___export and torn
// down by closeMergeSort___export: memPool backs the four rank/limit
// arrays, each MAX_SAMPLE_COUNT ints long.
__device__ static uniform int * uniform memPool = NULL;
__device__ static uniform int * uniform ranksA;
__device__ static uniform int * uniform ranksB;
__device__ static uniform int * uniform limitsA;
__device__ static uniform int * uniform limitsB;
__device__ static uniform int nTasks;          // launch width hint for the merge passes
__device__ static uniform int MAX_SAMPLE_COUNT = 0;
// One-time device-side setup: sizes and allocates the scratch pool backing
// ranksA/ranksB/limitsA/limitsB.  The constants appear tuned for a
// specific GPU (the k20m mentioned in comments below) -- NOTE(review):
// confirm before reusing on other hardware.
__global__
void openMergeSort___export()
{
    nTasks = 13*32*13;
    MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
    assert(memPool == NULL);
    // One contiguous allocation, carved into four equal arrays.
    const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
    memPool = uniform new uniform int[nalloc];
    ranksA = memPool;
    ranksB = ranksA + MAX_SAMPLE_COUNT;
    limitsA = ranksB + MAX_SAMPLE_COUNT;
    limitsB = limitsA + MAX_SAMPLE_COUNT;
}
// Host-callable entry point: runs the device-side setup on a single thread
// and waits for it to finish.  Must be called before mergeSort().
extern "C"
void openMergeSort()
{
    openMergeSort___export<<<1,1>>>();
    sync;
}
// Device-side teardown: releases the scratch pool allocated by
// openMergeSort___export.
// Fix: memPool was allocated with new[], so it must be released with
// delete[] -- plain `delete` on a new[] pointer is undefined behavior.
__global__
void closeMergeSort___export()
{
    assert(memPool != NULL);
    delete [] memPool;
    memPool = NULL;
}
// Host-callable entry point: runs the device-side teardown on a single
// thread and waits for it.  Call after the last mergeSort().
extern "C"
void closeMergeSort()
{
    closeMergeSort___export<<<1,1>>>();
    sync;
}
// Device-side merge sort driver: bottom-level gang sort, then log2 merge
// passes that ping-pong between the dst and buf arrays.  The initial
// buffer assignment is chosen from the pass-count parity so the final
// result always lands in dstKey/dstVal.  N must be a multiple of
// 2*programCount and fit the preallocated sample arrays.
// (The "k20m: ... M/s" comments are throughput measurements of the
// enclosed region on a Tesla K20m.)
__global__
void mergeSort___export(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
    // Count the merge passes needed above the bottom-level sort.
    uniform int stageCount = 0;
    for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);
    // iKey/iVal: current input of a pass; oKey/oVal: its output.
    uniform Key_t * uniform iKey, * uniform oKey;
    uniform Val_t * uniform iVal, * uniform oVal;
    if (stageCount & 1)
    {
        iKey = bufKey;
        iVal = bufVal;
        oKey = dstKey;
        oVal = dstVal;
    }
    else
    {
        iKey = dstKey;
        iVal = dstVal;
        oKey = bufKey;
        oVal = bufVal;
    }
    assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
    assert(N % (programCount*2) == 0);
    // k20m: 140 M/s
    {
        // k20m: 2367 M/s
        // Sort every 2*programCount-element segment.
        mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
#if 1
        // Double the sorted-run length each pass.
        for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
        {
            const uniform int lastSegmentElements = N % (2 * stride);
            // k20m: 271 M/s
            {
#if 1
                // k20m: 944 M/s
                {
                    // k20m: 1396 M/s
                    //Find sample ranks and prepare for limiters merge
                    generateSampleRanks(ranksA, ranksB, iKey, stride, N);
                    // k20m: 2379 M/s
                    //Merge ranks and indices
                    mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
                }
#endif
                // k20m: 371 M/s
                //Merge elementary intervals
                mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
            }
            // A short trailing segment (already sorted) is copied through
            // unchanged.
            if (lastSegmentElements <= stride)
                for (int i = programIndex; i < lastSegmentElements; i += programCount)
                    if (i < lastSegmentElements)
                    {
                        oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
                        oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
                    }
            // Swap input/output roles for the next pass.
            {
                uniform Key_t * uniform tmpKey = iKey;
                iKey = oKey;
                oKey = tmpKey;
            }
            {
                uniform Val_t * uniform tmpVal = iVal;
                iVal = oVal;
                oVal = tmpVal;
            }
        }
#endif
    }
}
// Host-callable entry point: sorts srcKey/srcVal into dstKey/dstVal using
// bufKey/bufVal as scratch.  Runs the driver on a single warp (1 block of
// 32 threads); the driver launches the parallel child kernels itself.
// Requires openMergeSort() to have been called first.
extern "C"
void mergeSort(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
    mergeSort___export<<<1,32>>>(
        dstKey,
        dstVal,
        bufKey,
        bufVal,
        srcKey,
        srcVal,
        N);
    sync;
}

View File

@@ -0,0 +1,658 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on mergeSort from CUDA SDK
*/
#include "keyType.h"
#define SAMPLE_STRIDE programCount
#define iDivUp(a,b) (((a) + (b) - 1)/(b))
#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
#define W (/*sizeof(int)=*/4 * 8)
// Rounds x up to the next power of two; returns x unchanged when x is
// already a power of two (assumes x >= 1).
static inline
int nextPowerOfTwo(int x)
{
#if 0
    // Portable bit-smearing variant, kept for reference.
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
#else
    // W is the int bit width (32); count_leading_zeros is the ispc stdlib
    // leading-zero count of x - 1.
    return 1U << (W - count_leading_zeros(x - 1));
#endif
}
// Inclusive rank: number of entries in the ascending array data[0..L-1]
// that are <= val.  stride is the initial probe step; callers pass
// nextPowerOfTwo(L).  cif/cfor are ispc "coherent" control-flow hints.
static inline
int binarySearchInclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
    cif (L == 0)
        return 0;
    int pos = 0;
    // Binary search by halving the probe stride; advance pos while the
    // probed element is still <= val.
    cfor (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        cif (data[newPos - 1] <= val)
            pos = newPos;
    }
    return pos;
}
// Binary search in the rank array 'data' (length L, sorted ascending):
// returns the number of elements strictly < val.  'stride' must be a power
// of two >= L; it is halved each iteration.
static inline
int binarySearchExclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
    cif (L == 0)
        return 0;
    int pos = 0;
    cfor (; stride > 0; stride >>= 1)
    {
        // Clamp the probe so we never index past the end of the array.
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] < val)      // exclusive: stop at equal keys
            pos = newPos;
    }
    return pos;
}
// Binary search over sorted keys 'data' (length L): returns the number of
// keys <= val, i.e. the inclusive insertion rank of val.  'stride' must be
// a power of two >= L (callers pass nextPowerOfTwo(L)).
static inline
int binarySearchInclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
    cif (L == 0)
        return 0;
    int pos = 0;
    cfor (; stride > 0; stride >>= 1)
    {
        // Clamp the probe so we never index past the end of the array.
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] <= val)
            pos = newPos;
    }
    return pos;
}
// Binary search over sorted keys 'data' (length L): returns the number of
// keys strictly < val, i.e. the exclusive insertion rank of val.
// Inclusive/exclusive variants together make the merge stable for ties.
static inline
int binarySearchExclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
    cif (L == 0)
        return 0;
    int pos = 0;
    cfor (; stride > 0; stride >>= 1)
    {
        // Clamp the probe so we never index past the end of the array.
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] < val)
            pos = newPos;
    }
    return pos;
}
// Gang-register variant of binarySearchInclusive: 'data' is a varying value
// whose per-lane elements form the sorted array; shuffle(data, i) reads the
// value held by lane i.  L and stride are uniform here (same search shape
// across the gang), so plain if/for suffice.
static inline
int binarySearchInclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (shuffle(data,newPos - 1) <= val)
            pos = newPos;
    }
    return pos;
}
// Gang-register variant of binarySearchExclusive: 'data' is a varying value
// whose per-lane elements form the sorted array; shuffle(data, i) reads the
// value held by lane i.  Returns the count of elements strictly < val.
static inline
int binarySearchExclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (shuffle(data,newPos - 1) < val)
            pos = newPos;
    }
    return pos;
}
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
// Sorts every 2*programCount-element segment of srcKey/srcVal into
// dstKey/dstVal.  Each task handles a contiguous range of segments; within
// a segment, sorted run length doubles each pass (1, 2, 4, ...) until the
// whole segment (arrayLength == 2*programCount) is sorted, using
// binary-search ranks to place each element directly in its merged slot.
// Fix vs. original: removed the unused local 'offset' (its value was
// recomputed inline for baseKey/baseVal and never read).
task
void mergeSortGangKernel(
    uniform int batchSize,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int arrayLength)
{
    // Static partition of the batch of segments across tasks.
    const uniform int blockIdx = taskIndex;
    const uniform int blockDim = (batchSize + taskCount - 1)/taskCount;
    const uniform int blockBeg = blockIdx * blockDim;
    const uniform int blockEnd = min(blockBeg + blockDim, batchSize);
    // Gang-local staging buffers holding one full segment.
    uniform Key_t s_key[2*programCount];
    uniform Val_t s_val[2*programCount];
    for (uniform int block = blockBeg; block < blockEnd; block++)
    {
        const uniform int base = block * (programCount*2);
        // Stage the segment: each program instance loads two elements.
        s_key[programIndex + 0] = srcKey[base + programIndex + 0];
        s_val[programIndex + 0] = srcVal[base + programIndex + 0];
        s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
        s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
        for (uniform int stride = 1; stride < arrayLength; stride <<= 1)
        {
            // lPos: this lane's position inside its 'stride'-long run;
            // baseKey/baseVal point at the pair of runs this lane merges.
            const int lPos = programIndex & (stride - 1);
            uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
            uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
            Key_t keyA = baseKey[lPos + 0];
            Val_t valA = baseVal[lPos + 0];
            Key_t keyB = baseKey[lPos + stride];
            Val_t valB = baseVal[lPos + stride];
            // Final slot = rank in own run (lPos) + rank in sibling run;
            // exclusive/inclusive split keeps the merge stable for ties.
            int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
            int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos;
            baseKey[posA] = keyA;
            baseVal[posA] = valA;
            baseKey[posB] = keyB;
            baseVal[posB] = valB;
        }
        // Write the sorted segment back out.
        dstKey[base + programIndex + 0] = s_key[programIndex + 0];
        dstVal[base + programIndex + 0] = s_val[programIndex + 0];
        dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
        dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
    }
}
// Launches mergeSortGangKernel over the whole batch and waits for it.
// batchSize is the number of 2*programCount-element segments.
static inline
void mergeSortGang(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int batchSize)
{
    uniform int nTasks = num_cores()*4;    // CPU: ~4 tasks per core
#ifdef __NVPTX__
    nTasks = iDivUp(batchSize,1);          // GPU: one task per segment
#endif
    launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount);
    sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: for every pair of adjacent sorted 'stride'-long runs (A,B)
// in in_srcKey, take every SAMPLE_STRIDE-th element of A and record its
// rank in B (and vice versa) into in_ranksA/in_ranksB.
// totalProgramCount bounds the flattened work-item index.
task
void generateSampleRanksKernel(
    uniform int nBlocks,
    uniform int in_ranksA[],
    uniform int in_ranksB[],
    uniform Key_t in_srcKey[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
    // Static partition of nBlocks across tasks; programCount items per block.
    const uniform int blockIdx = taskIndex;
    const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
    const uniform int blockBeg = blockIdx * blockDim;
    const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
    for (uniform int block = blockBeg; block < blockEnd; block++)
    {
        const int pos = block * programCount + programIndex;
        // pos grows with block, so once past the limit all later blocks are
        // past it too — safe to return from the loop here.
        cif (pos >= totalProgramCount)
            return;
        // i: sample index within this A/B segment pair;
        // segmentBase: offset of the pair's first element in in_srcKey.
        const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        uniform Key_t * srcKey = in_srcKey + segmentBase;
        uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
        uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
        const int segmentElementsA = stride;
        // The last B run may be truncated at the end of the array.
        const int segmentElementsB = min(stride, N - segmentBase - stride);
        const int segmentSamplesA = getSampleCount(segmentElementsA);
        const int segmentSamplesB = getSampleCount(segmentElementsB);
        if (i < segmentSamplesA)
        {
            // Rank of A's i-th sample: trivial in A, binary search in B.
            ranksA[i] = i * SAMPLE_STRIDE;
            ranksB[i] = binarySearchExclusive(
                srcKey[i * SAMPLE_STRIDE], srcKey + stride,
                segmentElementsB, nextPowerOfTwo(segmentElementsB));
        }
        if (i < segmentSamplesB)
        {
            // Rank of B's i-th sample: trivial in B, binary search in A.
            ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
            ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
                srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
                segmentElementsA, nextPowerOfTwo(segmentElementsA));
        }
    }
}
// Host-side wrapper for merge step 1: computes the flattened work-item count
// (excluding any trailing partial segment that is not merged this pass),
// launches generateSampleRanksKernel, and waits for completion.
static inline
void generateSampleRanks(
    uniform int ranksA[],
    uniform int ranksB[],
    uniform Key_t srcKey[],
    uniform int stride,
    uniform int N)
{
    uniform int lastSegmentElements = N % (2 * stride);
    // If the tail still spans both runs (> stride) it participates in the
    // merge; otherwise it is excluded from this pass.
    uniform int threadCount = (lastSegmentElements > stride) ?
        (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
        (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
    uniform int nTasks = num_cores()*4;    // CPU: ~4 tasks per core
#ifdef __NVPTX__
    nTasks = iDivUp(nBlocks,1);            // GPU: one task per block
#endif
    launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
    sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: turn the per-sample ranks from step 1 into elementary-
// interval limits.  For each A/B segment pair the A-ranks and B-ranks are
// themselves merged (by binary searching each list against the other) so
// in_Limits ends up holding, per elementary interval, the boundary within
// the segment.  Called once for the A side and once for the B side.
task
void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
    uniform int in_Ranks[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
    // Static partition of nBlocks across tasks; programCount items per block.
    const uniform int blockIdx = taskIndex;
    const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
    const uniform int blockBeg = blockIdx * blockDim;
    const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
    for (uniform int block = blockBeg; block < blockEnd; block++)
    {
        int pos = block * programCount + programIndex;
        // pos grows with block, so returning here also skips later blocks,
        // which are all past the limit as well.
        cif (pos >= totalProgramCount)
            return;
        // i: sample index within this segment pair; ranks/limits point at
        // the pair's slice of the rank and limit arrays.
        const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        uniform int * ranks = in_Ranks + (pos - i) * 2;
        uniform int * limits = in_Limits + (pos - i) * 2;
        const int segmentElementsA = stride;
        // The last B run may be truncated at the end of the array.
        const int segmentElementsB = min(stride, N - segmentBase - stride);
        const int segmentSamplesA = getSampleCount(segmentElementsA);
        const int segmentSamplesB = getSampleCount(segmentElementsB);
        if (i < segmentSamplesA)
        {
            // Position of A-rank i in the merged rank list (exclusive side).
            int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
            limits[dstPos] = ranks[i];
        }
        if (i < segmentSamplesB)
        {
            // Position of B-rank i in the merged rank list (inclusive side).
            int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
            limits[dstPos] = ranks[segmentSamplesA + i];
        }
    }
}
// Host-side wrapper for merge step 2: derives elementary-interval limits
// for both the A side and the B side by launching the kernel twice, then
// waits for both launches.
static inline
void mergeRanksAndIndices(
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int ranksA[],
    uniform int ranksB[],
    uniform int stride,
    uniform int N)
{
    const uniform int lastSegmentElements = N % (2 * stride);
    // Same work-item count formula as generateSampleRanks: a tail shorter
    // than one run is excluded from this merge pass.
    const uniform int threadCount = (lastSegmentElements > stride) ?
        (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
        (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
    uniform int nTasks = num_cores()*4;    // CPU: ~4 tasks per core
#ifdef __NVPTX__
    nTasks = iDivUp(nBlocks,1);            // GPU: one task per block
#endif
    launch [nTasks] mergeRanksAndIndicesKernel(
        nBlocks,
        limitsA,
        ranksA,
        stride,
        N,
        threadCount);
    launch [nTasks] mergeRanksAndIndicesKernel(
        nBlocks,
        limitsB,
        ranksB,
        stride,
        N,
        threadCount);
    sync;
}
// Merge step 3: merge each elementary interval pair.  Every "block" merges
// at most SAMPLE_STRIDE (== programCount) elements from run A with at most
// SAMPLE_STRIDE elements from run B, so one gang-wide pass with intra-gang
// binary searches (shuffle-based) places every element.
task
void mergeElementaryIntervalsKernel(
    uniform int mergePairs,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
    // Static partition of the interval pairs across tasks.
    const uniform int blockIdx = taskIndex;
    const uniform int blockDim = (mergePairs + taskCount - 1)/taskCount;
    const uniform int blockBeg = blockIdx * blockDim;
    const uniform int blockEnd = min(blockBeg + blockDim, mergePairs);
    for (uniform int block = blockBeg; block < blockEnd; block++)
    {
        // intervalI: interval index within its segment pair;
        // segmentBase: offset of the segment pair in srcKey/srcVal.
        const int uniform intervalI = block & ((2 * stride) / SAMPLE_STRIDE - 1);
        const int uniform segmentBase = (block - intervalI) * SAMPLE_STRIDE;
        //Set up threadblock-wide parameters
        const uniform int segmentElementsA = stride;
        const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
        const uniform int segmentSamplesA = getSampleCount(segmentElementsA);
        const uniform int segmentSamplesB = getSampleCount(segmentElementsB);
        const uniform int segmentSamples = segmentSamplesA + segmentSamplesB;
        // Source windows for this interval; the last interval of a segment
        // runs to the end of its run instead of the next limit.
        const uniform int startSrcA = limitsA[block];
        const uniform int startSrcB = limitsB[block];
        const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[block + 1] : segmentElementsA;
        const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[block + 1] : segmentElementsB;
        const uniform int lenSrcA = endSrcA - startSrcA;
        const uniform int lenSrcB = endSrcB - startSrcB;
        // Destination: A-window output starts at startSrcA+startSrcB,
        // B-window output follows immediately after A's lenSrcA elements.
        const uniform int startDstA = startSrcA + startSrcB;
        const uniform int startDstB = startDstA + lenSrcA;
        //Load main input data
        // keyA/valA (resp. keyB/valB) are only defined on lanes inside the
        // window; all later uses are guarded by the same lane predicates.
        Key_t keyA, keyB;
        Val_t valA, valB;
        if (programIndex < lenSrcA)
        {
            keyA = srcKey[segmentBase + startSrcA + programIndex];
            valA = srcVal[segmentBase + startSrcA + programIndex];
        }
        if (programIndex < lenSrcB)
        {
            keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
            valB = srcVal[segmentBase + stride + startSrcB + programIndex];
        }
        // Compute destination addresses for merge data
        // dstA/dstB stay -1 on lanes that have nothing to write.
        int dstPosA, dstPosB, dstA = -1, dstB = -1;
        if (programIndex < lenSrcA)
            dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
        if (programIndex < lenSrcB)
            dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
        // A merged position below lenSrcA lands in the A output window,
        // otherwise (after subtracting lenSrcA) in the B output window.
        if (programIndex < lenSrcA && dstPosA < lenSrcA)
            dstA = segmentBase + startDstA + dstPosA;
        dstPosA -= lenSrcA;
        if (programIndex < lenSrcA && dstPosA < lenSrcB)
            dstA = segmentBase + startDstB + dstPosA;
        if (programIndex < lenSrcB && dstPosB < lenSrcA)
            dstB = segmentBase + startDstA + dstPosB;
        dstPosB -= lenSrcA;
        if (programIndex < lenSrcB && dstPosB < lenSrcB)
            dstB = segmentBase + startDstB + dstPosB;
        if (dstA >= 0)
        {
            dstKey[dstA] = keyA;
            dstVal[dstA] = valA;
        }
        if (dstB >= 0)
        {
            dstKey[dstB] = keyB;
            dstVal[dstB] = valB;
        }
    }
}
// Host-side wrapper for merge step 3: merges all elementary interval pairs
// from srcKey/srcVal into dstKey/dstVal, then copies any trailing partial
// segment (which the kernel never touches when lastSegmentElements <= stride)
// straight through.  The copy overlaps the launched tasks; 'sync' at the
// end waits for the kernel before returning.
static inline
void mergeElementaryIntervals(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
    const uniform int lastSegmentElements = N % (2 * stride);
    const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
    uniform int nTasks = num_cores()*4;              // CPU: ~4 tasks per core
#ifdef __NVPTX__
    nTasks = iDivUp(mergePairs,1*programCount);      // GPU: one task per gang of pairs
#endif
    launch [nTasks] mergeElementaryIntervalsKernel(
        mergePairs,
        dstKey,
        dstVal,
        srcKey,
        srcVal,
        limitsA,
        limitsB,
        stride,
        N);
    // Pass the unmerged tail through unchanged.
    if (lastSegmentElements <= stride)
        foreach (i = 0 ... lastSegmentElements)
        {
            dstKey[N-lastSegmentElements+i] = srcKey[N-lastSegmentElements+i];
            dstVal[N-lastSegmentElements+i] = srcVal[N-lastSegmentElements+i];
        }
    sync;
}
// Scratch arrays for the merge steps, carved out of a single pool that is
// allocated in openMergeSort() and released in closeMergeSort().
static uniform int * uniform memPool = NULL;
static uniform int * uniform ranksA;
static uniform int * uniform ranksB;
static uniform int * uniform limitsA;
static uniform int * uniform limitsB;
// Capacity (in samples) of each of the four arrays above; 0 until opened.
static uniform int MAX_SAMPLE_COUNT = 0;
// Allocates the shared scratch pool and slices it into the four rank/limit
// arrays.  Must be called once before mergeSort(); pairs with closeMergeSort().
export
void openMergeSort()
{
    MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
    assert(memPool == NULL);    // guard against double-open
    const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
    memPool = uniform new uniform int[nalloc];
    // Four equal-sized slices of the pool.
    ranksA  = memPool;
    ranksB  = ranksA  + MAX_SAMPLE_COUNT;
    limitsA = ranksB  + MAX_SAMPLE_COUNT;
    limitsB = limitsA + MAX_SAMPLE_COUNT;
}
// Releases the scratch pool allocated by openMergeSort().
export
void closeMergeSort()
{
    assert(memPool != NULL);    // guard against close-before-open
    delete memPool;
    memPool = NULL;
}
// Full key/value merge sort of srcKey/srcVal (N elements) into dstKey/dstVal,
// using bufKey/bufVal as the ping-pong buffer.  Requires openMergeSort()
// first, N a multiple of 2*programCount, and N within the scratch capacity.
export
void mergeSort(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
    // Count the doubling merge passes after the bottom-level sort.
    uniform int stageCount = 0;
    for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);
    // Choose ping-pong order so the final pass lands in dstKey/dstVal.
    uniform Key_t * uniform iKey, * uniform oKey;
    uniform Val_t * uniform iVal, * uniform oVal;
    if (stageCount & 1)
    {
        iKey = bufKey;
        iVal = bufVal;
        oKey = dstKey;
        oVal = dstVal;
    }
    else
    {
        iKey = dstKey;
        iVal = dstVal;
        oKey = bufKey;
        oVal = bufVal;
    }
    assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
    assert(N % (programCount*2) == 0);
    // cpu: 28 gpu: 74 M/s
    {
        // Bottom level: sort every 2*programCount-element segment.
        // cpu: 356 gpu: 534 M/s
        mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
#if 1
        // Each pass merges pairs of 'stride'-long sorted runs, then swaps
        // the input/output buffers.
        for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
        {
            // cpu: 30 gpu: 112 M/s
            {
#if 1
                // cpu: 121 gpu: 460 M/s
                {
                    // cpu: 190 gpu: 600 M/s
                    //Find sample ranks and prepare for limiters merge
                    generateSampleRanks(ranksA, ranksB, iKey, stride, N);
                    // cpu: 120 gpu: 457 M/s
                    //Merge ranks and indices
                    mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
                }
#endif
                // cpu: 287 gpu: 194 M/s
                //Merge elementary intervals
                mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
            }
            // Ping-pong: output of this pass is input to the next.
            {
                uniform Key_t * uniform tmpKey = iKey;
                iKey = oKey;
                oKey = tmpKey;
            }
            {
                uniform Val_t * uniform tmpVal = iVal;
                iVal = oVal;
                oVal = tmpVal;
            }
        }
#endif
    }
}

View File

@@ -0,0 +1,8 @@
# Build configuration for the hermite4 example on CPU targets.
EXAMPLE=hermite4
CPP_SRC=hermite4.cpp
ISPC_SRC=hermite4.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
# Build configuration for the hermite4 example on KNC (Xeon Phi).
EXAMPLE=hermite4
CXX_SRC=hermite4.cpp
ISPC_SRC=hermite4.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,14 @@
# Build configuration for the hermite4 example on the PTX (NVIDIA GPU) path.
PROG=hermite4
ISPC_SRC=hermite4.ispc
#CU_SRC=hermite4.cu
CXX_SRC=hermite4.cpp
# Cap register usage per thread for the PTX compiler.
PTXCC_REGMAX=64
#ISPC_FLAGS= --opt=disable-uniform-control-flow
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,361 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Hermite4 N-body integrator */
/* Makino and Aarseth, 1992 */
/* http://adsabs.harvard.edu/abs/1992PASJ...44..141M and references there in*/
#include <cstdlib>
#include <cmath>
#include <cstdio>
#include <algorithm>
#include <vector>
#include <cassert>
#include "timing.h"
#include "ispc_malloc.h"
#include "typeReal.h"
#include "hermite4_ispc.h"
// Hermite 4th-order N-body integrator state and driver.
// Fixes vs. original: the destructor used plain 'delete' on arrays
// allocated with new[] (undefined behavior) -- now delete[]; the class is
// also made non-copyable since it owns raw arrays (a default copy would
// double-delete them).
struct Hermite4
{
    enum {PP_FLOP=44};   // flops per body-body interaction (for rate reporting)
    const int n;         // number of bodies
    const real eta;      // timestep accuracy parameter
    real eps2;           // gravitational softening length, squared
    // Per-body state in structure-of-arrays layout, as the ispc kernel expects.
    real *g_mass, *g_gpot;
    real *g_posx, *g_posy, *g_posz;
    real *g_velx, *g_vely, *g_velz;
    real *g_accx, *g_accy, *g_accz;
    real *g_jrkx, *g_jrky, *g_jrkz;
    // Acceleration & jerk saved before the predictor, used by the corrector.
    std::vector<real> accx0, accy0, accz0;
    std::vector<real> jrkx0, jrky0, jrkz0;
    // Allocates all per-body arrays and draws random initial conditions:
    // positions rejection-sampled inside a sphere of radius R0, small random
    // velocities, equal masses mp = 1/n.
    Hermite4(const int _n = 8192, const real _eta = 0.1) : n(_n), eta(_eta)
    {
        eps2 = 4.0/n; /* eps = 4/n to give Ebin = 1 KT */
        eps2 *= eps2;
        g_mass = new real[n];
        g_gpot = new real[n];
        g_posx = new real[n];
        g_posy = new real[n];
        g_posz = new real[n];
        g_velx = new real[n];
        g_vely = new real[n];
        g_velz = new real[n];
        g_accx = new real[n];
        g_accy = new real[n];
        g_accz = new real[n];
        g_jrkx = new real[n];
        g_jrky = new real[n];
        g_jrkz = new real[n];
        accx0.resize(n);
        accy0.resize(n);
        accz0.resize(n);
        jrkx0.resize(n);
        jrky0.resize(n);
        jrkz0.resize(n);
        printf("---Intializing nbody--- \n");
        const real R0 = 1;
        const real mp = 1.0/n;
#pragma omp parallel for schedule(runtime)
        for (int i = 0; i < n; i++)
        {
            // Rejection sampling: the loop always runs at least once
            // (s2 starts at 2*R0 > R0*R0), so xp..vz are assigned.
            real xp, yp, zp, s2 = 2*R0;
            real vx, vy, vz;
            while (s2 > R0*R0) {
                xp = (1.0 - 2.0*drand48())*R0;
                yp = (1.0 - 2.0*drand48())*R0;
                zp = (1.0 - 2.0*drand48())*R0;
                s2 = xp*xp + yp*yp + zp*zp;
                vx = drand48() * 0.1;
                vy = drand48() * 0.1;
                vz = drand48() * 0.1;
            }
            g_posx[i] = xp;
            g_posy[i] = yp;
            g_posz[i] = zp;
            g_velx[i] = vx;
            g_vely[i] = vy;
            g_velz[i] = vz;
            g_mass[i] = mp;
        }
    }
    // Releases the per-body arrays; they were allocated with new[], so
    // delete[] is required here.
    ~Hermite4()
    {
        delete[] g_mass;
        delete[] g_gpot;
        delete[] g_posx;
        delete[] g_posy;
        delete[] g_posz;
        delete[] g_velx;
        delete[] g_vely;
        delete[] g_velz;
        delete[] g_accx;
        delete[] g_accy;
        delete[] g_accz;
        delete[] g_jrkx;
        delete[] g_jrky;
        delete[] g_jrkz;
    }
    // Computes acc/jerk/potential for all bodies (implemented out-of-line
    // via the ispc kernel).
    void forces();
    // One predictor-corrector step of size dt.  Predicts positions and
    // velocities, recomputes forces, applies the Hermite corrector, and
    // returns the next timestep from the Aarseth criterion (or dt itself
    // when dt <= 0, i.e. on the bootstrap step).
    real step(const real dt)
    {
        const real dt2 = dt*real(1.0/2.0);
        const real dt3 = dt*real(1.0/3.0);
        real dt_min = HUGE;
#pragma omp parallel for schedule(runtime)
        for (int i = 0; i < n; i++)
        {
            // Save old acc/jerk for the corrector, then predict.
            accx0[i] = g_accx[i];
            accy0[i] = g_accy[i];
            accz0[i] = g_accz[i];
            jrkx0[i] = g_jrkx[i];
            jrky0[i] = g_jrky[i];
            jrkz0[i] = g_jrkz[i];
            g_posx[i] += dt*(g_velx[i] + dt2*(g_accx[i] + dt3*g_jrkx[i]));
            g_posy[i] += dt*(g_vely[i] + dt2*(g_accy[i] + dt3*g_jrky[i]));
            g_posz[i] += dt*(g_velz[i] + dt2*(g_accz[i] + dt3*g_jrkz[i]));
            g_velx[i] += dt*(g_accx[i] + dt2*g_jrkx[i]);
            g_vely[i] += dt*(g_accy[i] + dt2*g_jrky[i]);
            g_velz[i] += dt*(g_accz[i] + dt2*g_jrkz[i]);
        }
        forces();
        if (dt > 0.0)
        {
            const real h    = dt*real(0.5);
            const real hinv = real(1.0)/h;
            const real f1   = real(0.5)*hinv*hinv;
            const real f2   = real(3.0)*hinv*f1;
            const real dt2  = dt *dt * real(1.0/2.0);
            const real dt3  = dt2*dt * real(1.0/3.0);
            const real dt4  = dt3*dt * real(1.0/4.0);
            const real dt5  = dt4*dt * real(1.0/5.0);
#pragma omp parallel for schedule(runtime) reduction(min:dt_min)
            for (int i = 0; i < n; i++)
            {
                /* compute snp & crk */
                const real Amx = g_accx[i] - accx0[i];
                const real Amy = g_accy[i] - accy0[i];
                const real Amz = g_accz[i] - accz0[i];
                const real Jmx = h*(g_jrkx[i] - jrkx0[i]);
                const real Jmy = h*(g_jrky[i] - jrky0[i]);
                const real Jmz = h*(g_jrkz[i] - jrkz0[i]);
                const real Jpx = h*(g_jrkx[i] + jrkx0[i]);
                const real Jpy = h*(g_jrky[i] + jrky0[i]);
                const real Jpz = h*(g_jrkz[i] + jrkz0[i]);
                real snpx = f1*Jmx;
                real snpy = f1*Jmy;
                real snpz = f1*Jmz;
                real crkx = f2*(Jpx - Amx);
                real crky = f2*(Jpy - Amy);
                real crkz = f2*(Jpz - Amz);
                snpx -= h*crkx;
                snpy -= h*crky;
                snpz -= h*crkz;
                /* correct */
                g_posx[i] += dt4*snpx + dt5*crkx;
                g_posy[i] += dt4*snpy + dt5*crky;
                g_posz[i] += dt4*snpz + dt5*crkz;
                g_velx[i] += dt3*snpx + dt4*crkx;
                g_vely[i] += dt3*snpy + dt4*crky;
                g_velz[i] += dt3*snpz + dt4*crkz;
                /* compute new timestep (Aarseth criterion) */
                const real s0 = g_accx[i]*g_accx[i] + g_accy[i]*g_accy[i] + g_accz[i]*g_accz[i];
                const real s1 = g_jrkx[i]*g_jrkx[i] + g_jrky[i]*g_jrky[i] + g_jrkz[i]*g_jrkz[i];
                const real s2 = snpx*snpx + snpy*snpy + snpz*snpz;
                const real s3 = crkx*crkx + crky*crky + crkz*crkz;
                const double u = std::sqrt(s0*s2) + s1;
                const double l = std::sqrt(s1*s3) + s2;
                assert(l > 0.0f);
                const real dt_loc = eta *std::sqrt(u/l);
                dt_min = std::min(dt_min, dt_loc);
            }
        }
        if (dt_min == HUGE)
            return dt;
        else
            return dt_min;
    }
    // Sums kinetic and potential energy over all bodies.
    void energy(real &Ekin, real &Epot)
    {
        real ekin = 0, epot = 0;
#pragma omp parallel for reduction(+:ekin,epot)
        for (int i = 0; i < n; i++)
        {
            ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f);
            epot += real(0.5f)*g_mass[i] * g_gpot[i];
        }
        Ekin = ekin;
        Epot = epot;
    }
    // Advances the system for up to 'niter' steps (or until t_end), printing
    // energy drift and flop-rate diagnostics every step / every 'ntime' steps.
    void integrate(const int niter, const real t_end = HUGE)
    {
        const double tin = rtc();
        forces();
        const double fn = n;
        printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - tin,
            fn*fn*PP_FLOP/(rtc() - tin)/1e9);
        real Epot0, Ekin0;
        energy(Ekin0, Epot0);
        const real Etot0 = Epot0 + Ekin0;
        printf(" E: %g %g %g \n", Epot0, Ekin0, Etot0);
        /////////
        real t_global = 0;
        double t0 = 0;
        int iter = 0;
        int ntime = 10;
        real dt = 1.0/131072;
        real Epot, Ekin, Etot = Etot0;
        while (t_global < t_end) {
            if (iter % ntime == 0)
                t0 = rtc();
            if (iter >= niter) return;
            dt = step(dt);
            iter++;
            t_global += dt;
            const real Etot_pre = Etot;
            energy(Ekin, Epot);
            Etot = Ekin + Epot;
            if (iter % 1 == 0) {
                const real Etot = Ekin + Epot;
                printf("iter= %d: t= %g dt= %g Ekin= %g Epot= %g Etot= %g , dE = %g d(dE)= %g \n",
                    iter, t_global, dt, Ekin, Epot, Etot, (Etot - Etot0)/std::abs(Etot0),
                    (Etot - Etot_pre)/std::abs(Etot_pre) );
            }
            if (iter % ntime == 0) {
                printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - t0,
                    fn*fn*PP_FLOP/(rtc() - t0)/1e9*ntime);
            }
            fflush(stdout);
        }
    }
private:
    // Non-copyable: the class owns raw arrays, so the implicit copy would
    // lead to a double delete[].  (Declared, not defined: C++03 idiom.)
    Hermite4(const Hermite4 &);
    Hermite4 &operator=(const Hermite4 &);
};
// Delegates the O(n^2) force computation (acc, jerk, potential) to the
// ispc-generated kernel, passing the structure-of-arrays body state.
void Hermite4::forces()
{
    ispc::compute_forces(
        n,
        g_mass,
        g_posx,
        g_posy,
        g_posz,
        g_velx,
        g_vely,
        g_velz,
        g_accx,
        g_accy,
        g_accz,
        g_jrkx,
        g_jrky,
        g_jrkz,
        g_gpot,
        eps2);
}
// Build an n-body system with the given size and accuracy parameter and
// advance it for 'nstep' steps.
void run(const int nbodies, const real eta, const int nstep)
{
    // The simulation object only needs to live for the duration of this
    // call, so construct it as a temporary.
    Hermite4(nbodies, eta).integrate(nstep);
}
// Entry point: parse optional [nbodies] [nsteps] [eta] arguments and run
// the simulation.
int main(int argc, char *argv[])
{
    printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]);
    const int nbodies = (argc > 1) ? atoi(argv[1]) : 8192;
    const int nstep   = (argc > 2) ? atoi(argv[2]) : 40;
    const float eta   = (argc > 3) ? atof(argv[3]) : 0.1;
    printf("nbodies= %d\n", nbodies);
    printf("nstep= %d\n", nstep);
    printf(" eta= %g \n", eta);
    run(nbodies, eta, nstep);
    return 0;
}

View File

@@ -0,0 +1,197 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "typeReal.h"
// Three-component short vector of the simulation's scalar type.
typedef real<3> vec3;
// Accumulated per-body quantities: acceleration, jerk (time derivative of
// acceleration), and potential; 'null' pads the struct.
struct Force
{
    vec3 acc, jrk;
    real pot, null;
};
// Body state consumed by the force kernel: position and velocity.
struct Predictor
{
    vec3 pos, vel;
};
// Accumulates into fi the softened gravitational acceleration, jerk, and
// potential exerted on body i (state pi) by body j (state pj, mass mj).
// eps2 is the squared softening length added to the distance.
static inline
void body_body_force(
    Force &fi,
    const Predictor &pi,
    const Predictor &pj,
    const real mj,
    const real eps2)
{
    const real dx = pj.pos.x - pi.pos.x;
    const real dy = pj.pos.y - pi.pos.y;
    const real dz = pj.pos.z - pi.pos.z;
    const real ds2 = dx*dx + dy*dy + dz*dz + eps2;
#if 1
    // Deliberately computes rsqrt in single precision for speed; the #else
    // branch keeps the full-precision alternative.
    const real inv_ds = rsqrt((float)ds2);
#else
    const real inv_ds = rsqrt(ds2);
#endif
    const real inv_ds2  = inv_ds*inv_ds;
    const real minv_ds  = inv_ds * mj;       // m / r
    const real minv_ds3 = inv_ds2 * minv_ds; // m / r^3
    fi.acc.x += minv_ds3 * dx;
    fi.acc.y += minv_ds3 * dy;
    fi.acc.z += minv_ds3 * dz;
    fi.pot -= minv_ds;
    // Jerk: d/dt of the acceleration term above.
    const real dvx = pj.vel.x - pi.vel.x;
    const real dvy = pj.vel.y - pi.vel.y;
    const real dvz = pj.vel.z - pi.vel.z;
    const real rv = dx*dvx + dy*dvy + dz*dvz;
    const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3);
    fi.jrk.x += minv_ds3*dvx + Jij*dx;
    fi.jrk.y += minv_ds3*dvy + Jij*dy;
    fi.jrk.z += minv_ds3*dvz + Jij*dz;
}
// One task computes forces for bodies [taskIndex*nPerTask, +nPerTask) against
// ALL n bodies.  The j-loop is tiled by programCount: each tile of source
// bodies is staged into the gang-shared shdata buffer, then broadcast one
// body at a time to all lanes.
task void compute_forces_task(
    uniform const int n,
    uniform const int nPerTask,
    uniform const real mass[],
    uniform const real posx[],
    uniform const real posy[],
    uniform const real posz[],
    uniform const real velx[],
    uniform const real vely[],
    uniform const real velz[],
    uniform real accx[],
    uniform real accy[],
    uniform real accz[],
    uniform real jrkx[],
    uniform real jrky[],
    uniform real jrkz[],
    uniform real gpot[],
    const uniform real eps2)
{
    const uniform int nibeg = taskIndex * nPerTask;
    const uniform int niend = min(n, nibeg + nPerTask);
    if (nibeg >= n)   // task beyond the end of the body list
        return;
    // Staging buffer: rows are posx, posy, posz, mass, velx, vely, velz.
    uniform real shdata[7][programCount];
    // The j-tiling below reads full programCount-wide tiles, so n must be a
    // multiple of the gang size.
    assert((n%programCount) == 0);
    foreach (i = nibeg ... niend)
    {
        Force fi;
        fi.acc = (real)0.0;
        fi.jrk = (real)0.0;
        fi.pot = (real)0.0;
        Predictor pi;
        pi.pos.x = posx[i];
        pi.pos.y = posy[i];
        pi.pos.z = posz[i];
        pi.vel.x = velx[i];
        pi.vel.y = vely[i];
        pi.vel.z = velz[i];
        for (uniform int jb = 0; jb < n; jb += programCount)
        {
            // Each lane stages one source body of the tile.
            const int jp = jb + programIndex;
            shdata[0][programIndex] = posx[jp];
            shdata[1][programIndex] = posy[jp];
            shdata[2][programIndex] = posz[jp];
            shdata[3][programIndex] = mass[jp];
            shdata[4][programIndex] = velx[jp];
            shdata[5][programIndex] = vely[jp];
            shdata[6][programIndex] = velz[jp];
            // Broadcast each staged body to all lanes and accumulate.
            for (uniform int j = 0; j < programCount; j++)
            {
                Predictor pj;
                pj.pos.x = shdata[0][j];
                pj.pos.y = shdata[1][j];
                pj.pos.z = shdata[2][j];
                pj.vel.x = shdata[4][j];
                pj.vel.y = shdata[5][j];
                pj.vel.z = shdata[6][j];
                const real jmass = shdata[3][j];
                body_body_force(fi,pi,pj,jmass,eps2);
            }
        }
        // Write back the accumulated force data for body i.
        accx[i] = fi.acc.x;
        accy[i] = fi.acc.y;
        accz[i] = fi.acc.z;
        jrkx[i] = fi.jrk.x;
        jrky[i] = fi.jrk.y;
        jrkz[i] = fi.jrk.z;
        gpot[i] = fi.pot;
    }
}
// Entry point called from C++ (Hermite4::forces): partitions the n bodies
// into nPerTask-sized chunks and launches one task per chunk.  ISPC waits
// for all launched tasks before this function returns, so no explicit sync
// is needed.
export void compute_forces(
    uniform const int n,
    uniform const real mass[],
    uniform const real posx[],
    uniform const real posy[],
    uniform const real posz[],
    uniform const real velx[],
    uniform const real vely[],
    uniform const real velz[],
    uniform real accx[],
    uniform real accy[],
    uniform real accz[],
    uniform real jrkx[],
    uniform real jrky[],
    uniform real jrkz[],
    uniform real gpot[],
    const uniform real eps2)
{
    // Chunk size: a few gangs' worth of bodies, capped at 128.
    const uniform int nPerTask = min(128,programCount*8);
    const uniform int nTask = (n+nPerTask-1)/nPerTask;
    launch [nTask] compute_forces_task(
        n, nPerTask,
        mass,
        posx,posy,posz,
        velx,vely,velz,
        accx,accy,accz,
        jrkx,jrky,jrkz,
        gpot,eps2);
}

View File

@@ -0,0 +1,2 @@
#pragma once
// Simulation scalar type shared by the C++ driver and the ispc kernels.
typedef double real;

View File

@@ -0,0 +1,409 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define DBG(x)
#include <omp.h>
#include <malloc.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <algorithm>
// Signature of ispc-generated 'task' functions: the argument blob plus the
// thread and (3-D) task index/count context the runtime supplies.
typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
                             int taskIndex, int taskCount,
                             int taskIndex0, int taskIndex1, int taskIndex2,
                             int taskCount0, int taskCount1, int taskCount2);
// Small structure used to hold the data for each task
#ifdef _MSC_VER
__declspec(align(16))
#endif
struct TaskInfo {
    TaskFuncType func;    // task entry point
    void *data;           // argument blob forwarded to func
    int taskIndex;        // flattened index of this task within its launch
    int taskCount3d[3];   // launch dimensions (countx, county, countz)
#if defined(ISPC_IS_WINDOWS)
    event taskEvent;
#endif
    // Total number of tasks in this launch.
    int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
    // Decompose the flattened taskIndex back into its 3-D coordinates.
    int taskIndex0() const
    {
        return taskIndex % taskCount3d[0];
    }
    int taskIndex1() const
    {
        return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
    }
    int taskIndex2() const
    {
        return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
    }
    int taskCount0() const { return taskCount3d[0]; }
    int taskCount1() const { return taskCount3d[1]; }
    int taskCount2() const { return taskCount3d[2]; }
    // Sanity-check the 32-byte size assumption the chunked allocator relies on.
    TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); }
}
#ifndef _MSC_VER
__attribute__((aligned(32)));
#endif
;
// ispc expects these functions to have C linkage / not be mangled
extern "C" {
    // Enqueue one launch of countx*county*countz tasks of function f.
    void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz);
    // Allocate scratch memory that lives until the matching ISPCSync().
    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
    // Wait for all tasks launched through handle to finish.
    void ISPCSync(void *handle);
}
///////////////////////////////////////////////////////////////////////////
// TaskGroupBase
#define LOG_TASK_QUEUE_CHUNK_SIZE 14
#define MAX_TASK_QUEUE_CHUNKS 8
// TaskInfo structs per chunk (1 << 14 == 16384).
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
// Hard cap on tasks launched from a single function.
#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
// Number of scratch-memory buffers available to AllocMemory()/ISPCAlloc().
#define NUM_MEM_BUFFERS 16
class TaskGroup;
/** The TaskGroupBase structure provides common functionality for "task
    groups"; a task group is the set of tasks launched from within a single
    ispc function.  When the function is ready to return, it waits for all
    of the tasks in its task group to finish before it actually returns.
*/
class TaskGroupBase {
public:
    // Rewind all allocation cursors so the group can be reused.
    void Reset();
    // Reserve 'count' consecutive TaskInfo slots; returns the first index.
    int AllocTaskInfo(int count);
    // Map a slot index to its TaskInfo (allocating the chunk on demand).
    TaskInfo *GetTaskInfo(int index);
    // Bump allocator backing ISPCAlloc(): carve 'size' bytes with the given
    // alignment out of the current memory buffer.
    void *AllocMemory(int64_t size, int32_t alignment);
protected:
    TaskGroupBase();
    ~TaskGroupBase();
    int nextTaskInfoIndex;   // next free TaskInfo slot
private:
    /* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
       needed by the calling function.  We hold up to MAX_TASK_QUEUE_CHUNKS
       of these (and then exit at runtime if more than this many tasks are
       launched.)
    */
    TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
    /* We also allocate chunks of memory to service ISPCAlloc() calls.  The
       memBuffers[] array holds pointers to this memory.  The first element
       of this array is initialized to point to mem and then any subsequent
       elements required are initialized with dynamic allocation.
    */
    int curMemBuffer, curMemBufferOffset;   // bump-allocator cursor
    int memBufferSize[NUM_MEM_BUFFERS];
    char *memBuffers[NUM_MEM_BUFFERS];
    char mem[256];                          // inline storage for buffer 0
};
inline TaskGroupBase::TaskGroupBase() {
    nextTaskInfoIndex = 0;
    curMemBuffer = 0;
    curMemBufferOffset = 0;

    // Buffer slot 0 serves allocations out of the embedded "mem" array;
    // every other slot starts out unallocated.
    memBuffers[0] = mem;
    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
    int b = 1;
    while (b < NUM_MEM_BUFFERS) {
        memBuffers[b] = NULL;
        memBufferSize[b] = 0;
        ++b;
    }

    // TaskInfo chunks are allocated on demand in GetTaskInfo().
    int c = 0;
    while (c < MAX_TASK_QUEUE_CHUNKS) {
        taskInfo[c] = NULL;
        ++c;
    }
}
inline TaskGroupBase::~TaskGroupBase() {
    // memBuffers[0] aliases the embedded "mem" member, so only the
    // dynamically allocated slots 1..NUM_MEM_BUFFERS-1 are released here
    // (delete[] on a NULL pointer is a no-op).
    for (int b = NUM_MEM_BUFFERS - 1; b >= 1; --b)
        delete[] memBuffers[b];
}
inline void
TaskGroupBase::Reset() {
    // Rewind all bookkeeping; chunks and memory buffers stay allocated so
    // the next launch from this group can reuse them without reallocating.
    curMemBufferOffset = 0;
    curMemBuffer = 0;
    nextTaskInfoIndex = 0;
}
inline int
TaskGroupBase::AllocTaskInfo(int count) {
    // Bump-allocate `count` consecutive TaskInfo slots and hand back the
    // index of the first one.
    const int firstIndex = nextTaskInfoIndex;
    nextTaskInfoIndex = firstIndex + count;
    return firstIndex;
}
inline TaskInfo *
TaskGroupBase::GetTaskInfo(int index) {
    // Records live in up to MAX_TASK_QUEUE_CHUNKS lazily-allocated chunks of
    // TASK_QUEUE_CHUNK_SIZE entries; split the index into chunk + offset.
    int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
    int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);

    // was `chunk == MAX_TASK_QUEUE_CHUNKS`: an index more than one full
    // chunk past the limit would sail past the equality check and read
    // out of bounds below.
    if (chunk >= MAX_TASK_QUEUE_CHUNKS) {
        fprintf(stderr, "A total of %d tasks have been launched from the "
                "current function--the simple built-in task system can handle "
                "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE "
                "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. "
                "Sorry! Exiting.\n", index);
        exit(1);
    }

    // Allocate the chunk on first touch.
    if (taskInfo[chunk] == NULL)
        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
    return &taskInfo[chunk][offset];
}
inline void *
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
    // Bump allocator over memBuffers[].  NOTE: `alignment` must be a power
    // of two for the mask arithmetic below to be valid.
    char *basePtr = memBuffers[curMemBuffer];
    intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset);
    iptr = (iptr + (alignment-1)) & ~(alignment-1);

    int newOffset = int(iptr - (intptr_t)basePtr + size);
    // was `<`: an allocation ending exactly at the end of the buffer fits,
    // so newOffset == memBufferSize must be accepted too.
    if (newOffset <= memBufferSize[curMemBuffer]) {
        curMemBufferOffset = newOffset;
        return (char *)iptr;
    }

    // Current buffer exhausted: move on to a new, geometrically growing
    // buffer and retry; the recursive call succeeds in the fresh buffer
    // because allocSize >= size + alignment.
    ++curMemBuffer;
    curMemBufferOffset = 0;
    assert(curMemBuffer < NUM_MEM_BUFFERS);

    int allocSize = 1 << (12 + curMemBuffer);
    allocSize = std::max(int(size+alignment), allocSize);
    char *newBuf = new char[allocSize];
    memBufferSize[curMemBuffer] = allocSize;
    memBuffers[curMemBuffer] = newBuf;
    return AllocMemory(size, alignment);
}
///////////////////////////////////////////////////////////////////////////
// Atomics and the like
static inline void
lMemFence() {
    // Full memory barrier: makes writes performed before publishing a
    // pointer via compare-and-swap visible to other threads.
    // Windows atomic functions already contain the fence
    // KNC doesn't need the memory barrier
#if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS
    __sync_synchronize();
#endif
}
// Atomically: if *v == oldValue, store newValue; returns the previous
// value of *v (so the caller can tell whether the swap happened).
static void *
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchangePointer(v, newValue, oldValue);
#else
    void *result = __sync_val_compare_and_swap(v, oldValue, newValue);
    // Explicit fence on the GCC path; the Windows intrinsic fences itself.
    lMemFence();
    return result;
#endif // ISPC_IS_WINDOWS
}
// 32-bit compare-and-swap; returns the previous value of *v.
static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue);
#else
    int32_t result = __sync_val_compare_and_swap(v, oldValue, newValue);
    lMemFence();
    return result;
#endif // ISPC_IS_WINDOWS
}
// Atomically adds `delta` to *v.
// NOTE(review): the two branches return different things — the Windows path
// returns the NEW value (InterlockedExchangeAdd yields the old value, then
// +delta is applied), while the GCC path returns the OLD value
// (__sync_fetch_and_add).  Confirm no caller depends on the return value
// before relying on it cross-platform.
static inline int32_t
lAtomicAdd(volatile int32_t *v, int32_t delta) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;
#else
    return __sync_fetch_and_add(v, delta);
#endif
}
///////////////////////////////////////////////////////////////////////////
// Task group for the OpenMP back-end: Launch() runs the group's tasks
// inside a blocking "omp parallel" region, so Sync() has nothing left to
// wait for (see its definition below).
class TaskGroup : public TaskGroupBase {
public:
    void Launch(int baseIndex, int count);
    void Sync();
};
///////////////////////////////////////////////////////////////////////////
// OpenMP
// One-time task-system setup hook, kept for symmetry with other back-ends;
// the OpenMP runtime needs no explicit initialization.
static void
InitTaskSystem() {
    // No initialization needed
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
#pragma omp parallel
    {
        const int threadIndex = omp_get_thread_num();
        const int threadCount = omp_get_num_threads();

        // Each OpenMP thread works on a private copy of the single TaskInfo
        // record that ISPCLaunch() filled in at baseIndex, so patching
        // taskIndex per iteration below is race-free.
        TaskInfo ti = *GetTaskInfo(baseIndex);

        // Iteration distribution is deferred to OMP_SCHEDULE (schedule(runtime)).
#pragma omp for schedule(runtime)
        for(int i = 0; i < count; i++)
        {
            ti.taskIndex = i;

            // Actually run the task.
            ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(),
                    ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(),
                    ti.taskCount0(), ti.taskCount1(), ti.taskCount2());
        }
    }
}
inline void
TaskGroup::Sync() {
    // Nothing to wait for: Launch() uses a blocking "#pragma omp parallel"
    // region, so every task has already completed by the time it returns.
}
///////////////////////////////////////////////////////////////////////////
// Small fixed-size cache of recycled TaskGroups; slots are claimed and
// returned with atomic compare-and-swap so concurrent launches don't race.
#define MAX_FREE_TASK_GROUPS 64
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
static inline TaskGroup *
AllocTaskGroup()
{
    // Try to reuse a cached TaskGroup: claim a non-NULL slot by atomically
    // swapping it to NULL.  If the CAS returns NULL, another thread claimed
    // the slot first; keep scanning.
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        TaskGroup *tg = freeTaskGroups[i];
        if (tg != NULL) {
            void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
            if (ptr != NULL) {
                return (TaskGroup *)ptr;
            }
        }
    }

    // Cache empty (or every candidate lost to a race): allocate a fresh one.
    return new TaskGroup;
}
static inline void
FreeTaskGroup(TaskGroup *tg)
{
    tg->Reset();

    // Try to park the group in an empty cache slot (CAS NULL -> tg); on
    // success the cache now owns it and we are done.
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        if (freeTaskGroups[i] == NULL) {
            void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL);
            if (ptr == NULL)
                return;
        }
    }

    // Cache full: release the group outright.
    delete tg;
}
void
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2)
{
    const int count = count0*count1*count2;

    // Lazily create this function's task group on its first launch.
    TaskGroup *taskGroup;
    if (*taskGroupPtr == NULL) {
        InitTaskSystem();
        taskGroup = AllocTaskGroup();
        *taskGroupPtr = taskGroup;
    }
    else
        taskGroup = (TaskGroup *)(*taskGroupPtr);

    // Reserve `count` TaskInfo slots but fill in only the first one: the
    // OpenMP TaskGroup::Launch() reads just the record at baseIndex and
    // patches taskIndex per loop iteration.
    // NOTE(review): the `i < 1` bound looks intentional for this back-end,
    // but confirm it before reusing this file with a task system that reads
    // all `count` records.
    int baseIndex = taskGroup->AllocTaskInfo(count);
    for (int i = 0; i < 1; ++i) {
        TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);
        ti->func = (TaskFuncType)func;
        ti->data = data;
        ti->taskIndex = i;
        ti->taskCount3d[0] = count0;
        ti->taskCount3d[1] = count1;
        ti->taskCount3d[2] = count2;
    }
    taskGroup->Launch(baseIndex, count);
}
void
ISPCSync(void *h)
{
TaskGroup *taskGroup = (TaskGroup *)h;
if (taskGroup != NULL) {
taskGroup->Sync();
FreeTaskGroup(taskGroup);
}
}
void *
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
{
TaskGroup *taskGroup;
if (*taskGroupPtr == NULL) {
InitTaskSystem();
taskGroup = AllocTaskGroup();
*taskGroupPtr = taskGroup;
}
else
taskGroup = (TaskGroup *)(*taskGroupPtr);
return taskGroup->AllocMemory(size, alignment);
}

1
examples/portable/options/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
options

View File

@@ -0,0 +1,8 @@
EXAMPLE=options
CPP_SRC=options.cpp
ISPC_SRC=options.ispc
ISPC_IA_TARGETS=avx1-i32x16
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=options
CXX_SRC=options.cpp
ISPC_SRC=options.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,14 @@
PROG=options
ISPC_SRC=options.ispc
CU_SRC=options.cu
CXX_SRC=options.cpp
PTXCC_REGMAX=128
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,120 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define NOMINMAX
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cassert>
#include <cmath>
#include <algorithm>
using std::max;
#include "options_defs.h"
#include "timing.h"
#include "ispc_malloc.h"
#include "options_ispc.h"
using namespace ispc;
// Prints command-line help for the options pricing example.
static void usage() {
    printf("usage: options [--count=<num options>]\n");
}
// Driver for the options pricing example: times the binomial and
// Black-Scholes ispc task implementations over `nOptions` identical options
// and prints the best-of-3 runtime plus the average result as a sanity check.
int main(int argc, char *argv[]) {
    int nOptions = 128*1024;

    // Parse --count=<n>; unrecognized arguments are ignored.
    for (int i = 1; i < argc; ++i) {
        if (strncmp(argv[i], "--count=", 8) == 0) {
            nOptions = atoi(argv[i] + 8);
            if (nOptions <= 0) {
                usage();
                exit(1);
            }
        }
    }

    float *S = new float[nOptions];
    float *X = new float[nOptions];
    float *T = new float[nOptions];
    float *r = new float[nOptions];
    float *v = new float[nOptions];
    float *result = new float[nOptions];

    // Identical inputs for every option so the averaged result is easy to
    // eyeball for correctness.
    for (int i = 0; i < nOptions; ++i) {
        S[i] = 100;  // stock price
        X[i] = 98;   // option strike price
        T[i] = 2;    // time (years)
        r[i] = .02;  // risk-free interest rate
        v[i] = 5;    // volatility
    }

    double sum;

    //
    // Binomial options pricing model, ispc implementation, tasks
    //
    double binomial_tasks = 1e30;
    // Best of 3 runs.
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        binomial_put_ispc_tasks(S, X, T, r, v, result, nOptions);
        double dt = get_elapsed_msec();
        binomial_tasks = std::min(binomial_tasks, dt);
    }
    sum = 0.;
    for (int i = 0; i < nOptions; ++i)
        sum += result[i];
    printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n",
           binomial_tasks, sum / nOptions);

    //
    // Black-Scholes options pricing model, ispc implementation, tasks
    //
    double bs_ispc_tasks = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        black_scholes_ispc_tasks(S, X, T, r, v, result, nOptions);
        double dt = get_elapsed_msec();
        sum = 0.;
        for (int i = 0; i < nOptions; ++i)
            sum += result[i];
        bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
    }
    printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n",
           bs_ispc_tasks, sum / nOptions);

    // was: all six arrays were leaked on exit.
    delete[] S;
    delete[] X;
    delete[] T;
    delete[] r;
    delete[] v;
    delete[] result;
    return 0;
}

View File

@@ -0,0 +1,334 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "options_defs.h"
#include "cuda_helpers.cuh"
// Splits a positive, finite float into a "reduced" mantissa with exponent -1
// and the corresponding base-2 exponent, by direct manipulation of the
// IEEE-754 bit pattern.  Helper for the polynomial log fallback below.
__device__ static inline void __range_reduce_log(float input, float * reduced,
                                                 int * exponent) {
    int int_version = __float_as_int(input); //intbits(input);
    // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
    // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000
    // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0
    // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111
    // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF
    //const int exponent_mask(0x7F800000)
    const int nonexponent_mask = 0x807FFFFF;

    // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126
    const int exponent_neg1 = (126l << 23);
    // NOTE(boulos): We don't need to mask anything out since we know
    // the sign bit has to be 0. If it's 1, we need to return infinity/nan
    // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
    int biased_exponent = int_version >> 23; // This number is [0, 255] but it means [-127, 128]
    int offset_exponent = biased_exponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
    *exponent = offset_exponent - 127; // get the real value

    // Blend the offset_exponent with the original input (do this in
    // int for now, until I decide if float can have & and &not)
    int blended = (int_version & nonexponent_mask) | (exponent_neg1);
    *reduced = __int_as_float(blended); //floatbits(blended);
}
// Natural logarithm.  The active path uses the fast CUDA intrinsic; the
// disabled branch keeps a range-reduction + polynomial implementation
// (ported from the ispc stdlib) for accuracy comparisons.
__device__ static inline float __Logf(const float x_full)
{
#if 1
    return __logf(x_full);
#else
    float reduced;
    int exponent;

    const int NaN_bits = 0x7fc00000;
    const int Neg_Inf_bits = 0xFF800000;
    const float NaN = __int_as_float(NaN_bits); //floatbits(NaN_bits);
    const float neg_inf = __int_as_float(Neg_Inf_bits); //floatbits(Neg_Inf_bits);
    bool use_nan = x_full < 0.f;
    bool use_inf = x_full == 0.f;
    bool exceptional = use_nan || use_inf;
    const float one = 1.0f;

    // Replace exceptional inputs with 1 so the reduction is well defined;
    // the correct NaN/-inf is substituted back at the end.
    float patched = exceptional ? one : x_full;
    __range_reduce_log(patched, &reduced, &exponent);

    const float ln2 = 0.693147182464599609375f;

    float x1 = one - reduced;
    const float c1 = 0.50000095367431640625f;
    const float c2 = 0.33326041698455810546875f;
    const float c3 = 0.2519190013408660888671875f;
    const float c4 = 0.17541764676570892333984375f;
    const float c5 = 0.3424419462680816650390625f;
    const float c6 = -0.599632322788238525390625f;
    const float c7 = +1.98442304134368896484375f;
    const float c8 = -2.4899270534515380859375f;
    const float c9 = +1.7491014003753662109375f;

    // Horner evaluation of the polynomial in x1.
    float result = x1 * c9 + c8;
    result = x1 * result + c7;
    result = x1 * result + c6;
    result = x1 * result + c5;
    result = x1 * result + c4;
    result = x1 * result + c3;
    result = x1 * result + c2;
    result = x1 * result + c1;
    result = x1 * result + one;

    // Equation was for -(ln(red)/(1-red))
    result *= -x1;
    result += (float)(exponent) * ln2;

    return exceptional ? (use_nan ? NaN : neg_inf) : result;
#endif
}
// Exponential.  The active path uses the fast CUDA intrinsic; the disabled
// branch keeps a range-reduction + polynomial implementation for reference.
__device__ static inline float __Expf(const float x_full)
{
#if 1
    return __expf(x_full);
#else
    const float ln2_part1 = 0.6931457519f;
    const float ln2_part2 = 1.4286067653e-6f;
    const float one_over_ln2 = 1.44269502162933349609375f;

    // Split x = k*ln2 + x with x in [0, ln2) (two-part ln2 for accuracy).
    float scaled = x_full * one_over_ln2;
    float k_real = floor(scaled);
    int k = (int)k_real;

    // Reduced range version of x
    float x = x_full - k_real * ln2_part1;
    x -= k_real * ln2_part2;

    // These coefficients are for e^x in [0, ln(2)]
    const float one = 1.f;
    const float c2 = 0.4999999105930328369140625f;
    const float c3 = 0.166668415069580078125f;
    const float c4 = 4.16539050638675689697265625e-2f;
    const float c5 = 8.378830738365650177001953125e-3f;
    const float c6 = 1.304379315115511417388916015625e-3f;
    const float c7 = 2.7555381529964506626129150390625e-4f;

    // Horner evaluation.
    float result = x * c7 + c6;
    result = x * result + c5;
    result = x * result + c4;
    result = x * result + c3;
    result = x * result + c2;
    result = x * result + one;
    result = x * result + one;

    // Compute 2^k (should differ for float and double, but I'll avoid
    // it for now and just do floats)
    const int fpbias = 127;
    int biased_n = k + fpbias;
    bool overflow = k > fpbias;
    // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
    // we've got underflow. -127 * ln(2) -> -88.02. So the most
    // negative float input that doesn't result in zero is like -88.
    bool underflow = (biased_n <= 0);
    const int InfBits = 0x7f800000;
    biased_n <<= 23;
    // Reinterpret this thing as float
    float two_to_the_n = __int_as_float(biased_n); //floatbits(biased_n);
    // Handle both doubles and floats (hopefully eliding the copy for float)
    float elemtype_2n = two_to_the_n;
    result *= elemtype_2n;
    // result = overflow ? floatbits(InfBits) : result;
    result = overflow ? __int_as_float(InfBits) : result;
    result = underflow ? 0.0f : result;
    return result;
#endif
}
// Cumulative normal distribution function
//
// Polynomial approximation evaluated on |X|, then reflected (w -> 1-w) for
// positive X.
__device__
static inline float
CND(float X) {
    float L = fabsf(X);

    float k = 1.0f / (1.0f + 0.2316419f * L);
    float k2 = k*k;
    float k3 = k2*k;
    float k4 = k2*k2;
    float k5 = k3*k2;

    const float invSqrt2Pi = 0.39894228040f;
    float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 +
               -1.821255978f * k4 + 1.330274429f * k5);
    w *= invSqrt2Pi * __Expf(-L * L * .5f);

    if (X > 0.f)
        w = 1.0f - w;
    return w;
}
// Black-Scholes pricing of a contiguous slice of the option arrays; one
// task (see cuda_helpers.cuh for taskIndex/taskCount/programIndex macros)
// handles `span` consecutive elements.
__global__
void bs_task( float Sa[], float Xa[], float Ta[],
              float ra[], float va[],
              float result[], int count) {
    if (taskIndex >= taskCount) return;
    // Ceiling-divide the work: the old floor-based split (count/taskCount)
    // silently skipped the trailing count % taskCount elements whenever
    // taskCount did not evenly divide count.
    int span = (count + taskCount - 1) / taskCount;
    int first = taskIndex * span;
    int last = min(count, first + span);
    for (int i = programIndex + first; i < last; i += programCount)
        if (i < last)
        {
            float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
            float d1 = (__Logf(S/X) + (r + v * v * .5f) * T) / (v * sqrtf(T));
            float d2 = d1 - v * sqrtf(T);
            result[i] = S * CND(d1) - X * __Expf(-r * T) * CND(d2);
        }
}
// Mirrors the ispc `export` entry point: fans the work out over nTasks
// child launches via the launch() helper from cuda_helpers.cuh, then waits
// for them to finish.
extern "C"
__global__ void
black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[],
                                   float ra[], float va[],
                                   float result[], int count) {
    int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch(nTasks,1,1,bs_task)
        (Sa, Xa, Ta, ra, va, result, count);
    cudaDeviceSynchronize();
}
// Host-side entry: starts the single-block export kernel (which distributes
// the work itself) and blocks until the device is idle.
extern "C"
__host__ void
black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[],
                          float ra[], float va[],
                          float result[], int count) {
    black_scholes_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count);
    cudaDeviceSynchronize();
}
/********/
/* Compile-time-unrolled helpers for binomial_put: op1 initializes the
   option-value array, op2 performs the backward-induction sweeps.
   Recurses from NBEG towards NEND with stride STEP; terminated by the
   NBEG == NEND specialization below. */
template<int NBEG, int NEND, int STEP>
struct loop
{
    __device__ static void op1(float V[], const float u, const float X, const float S)
    {
        const int j = NBEG;
        float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
        V[j] = max(0.0f, X - S * upow);
        loop<j+STEP,NEND,STEP>::op1(V,u,X,S);
    }
    __device__ static void op2(float V[], const float Pu, const float disc)
    {
        const int j = NBEG;
#pragma unroll
        for ( int k = 0; k < j; ++k)
            V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc;
        loop<j+STEP,NEND,STEP>::op2(V, Pu,disc);
    }
};
// Recursion terminator: both ops become empty once NBEG reaches NEND.
template<int NEND, int STEP>
struct loop<NEND,NEND,STEP>
{
    __device__ static void op1(float V[], const float u, const float X, const float S) {}
    __device__ static void op2(float V[], const float Pu, const float disc) {}
};
// Prices a European put with a BINOMIAL_NUM-step binomial lattice.
__device__
static inline float
binomial_put(float S, float X, float T, float r, float v)
{
    float V[BINOMIAL_NUM];

    float dt = T / BINOMIAL_NUM;
    float u = exp(v * sqrt(dt));
    float d = 1.f / u;
    float disc = exp(r * dt);
    float Pu = (disc - d) / (u - d);

#if 0 /* slow */
    for ( int j = 0; j < BINOMIAL_NUM; ++j) {
        float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
        V[j] = max(0.0f, X - S * upow);
    }
    for ( int j = BINOMIAL_NUM-1; j >= 0; --j)
        for ( int k = 0; k < j; ++k)
            V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc;
#else /* with loop unrolling, stores results in registers */
    loop<0,BINOMIAL_NUM,1>::op1(V,u,X,S);
    loop<BINOMIAL_NUM-1, -1, -1>::op2(V, Pu, disc);
#endif
    return V[0];
}
// Binomial pricing of a contiguous slice of the option arrays; layout
// mirrors bs_task above.
__global__ void
binomial_task( float Sa[], float Xa[],
               float Ta[], float ra[],
               float va[], float result[],
               int count)
{
    // Ceiling-divide the work: the old floor-based split (count/taskCount)
    // silently skipped the trailing count % taskCount elements whenever
    // taskCount did not evenly divide count.
    int span = (count + taskCount - 1) / taskCount;
    int first = taskIndex * span;
    int last = min(count, first + span);
    for (int i = programIndex + first; i < last; i += programCount)
        if (i < last)
        {
            float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
            result[i] = binomial_put(S, X, T, r, v);
        }
}
// Mirrors the ispc `export` entry point: fans the work out over nTasks
// child launches via the launch() helper, then waits for completion.
extern "C" __global__ void
binomial_put_ispc_tasks___export( float Sa[], float Xa[],
                                  float Ta[], float ra[],
                                  float va[], float result[],
                                  int count) {
    int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch(nTasks,1,1,binomial_task)
        (Sa, Xa, Ta, ra, va, result, count);
    cudaDeviceSynchronize();
}
// Host-side entry: prefers L1 cache over shared memory (the per-thread V[]
// array in binomial_put is cache-hungry), starts the export kernel, and
// blocks until the device is idle.
extern "C"
__host__ void
binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[],
                         float ra[], float va[],
                         float result[], int count) {
    cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
    binomial_put_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,211 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "options_defs.h"
// Cumulative normal distribution function
// Polynomial approximation evaluated on |X|, then reflected (w -> 1-w) for
// positive X.
static inline float
CND(float X) {
    float L = abs(X);

    float k = 1.0 / (1.0 + 0.2316419 * L);
    float k2 = k*k;
    float k3 = k2*k;
    float k4 = k2*k2;
    float k5 = k3*k2;

    const float invSqrt2Pi = 0.39894228040f;
    float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 +
               -1.821255978f * k4 + 1.330274429f * k5);
    w *= invSqrt2Pi * exp(-L * L * .5f);

    if (X > 0.f)
        w = 1.0 - w;
    return w;
}
// Black-Scholes pricing of one task's contiguous slice of the option arrays.
task void
bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
        uniform float ra[], uniform float va[],
        uniform float result[], uniform int count) {
    // Ceiling-divide the work: the old floor-based split (count/taskCount)
    // silently skipped the trailing count % taskCount elements whenever
    // taskCount did not evenly divide count.
    uniform int span = (count + taskCount - 1) / taskCount;
    uniform int first = taskIndex * span;
    uniform int last = min(count, first + span);
    foreach (i = first ... last) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
        float d2 = d1 - v * sqrt(T);
        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
    }
}
// Task-parallel Black-Scholes entry point: each of nTasks tasks prices a
// contiguous slice of the arrays.
export void
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                         uniform float ra[], uniform float va[],
                         uniform float result[], uniform int count) {
    uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
}
/********/
// Single-gang (no tasks) Black-Scholes over the whole array.
export void
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                   uniform float ra[], uniform float va[],
                   uniform float result[], uniform int count) {
    foreach (i = 0 ... count) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
        float d2 = d1 - v * sqrt(T);
        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
    }
}
// Prices a European put with a BINOMIAL_NUM-step binomial lattice.  The
// __NVPTX__ path unrolls both loops via macros; see the comments below.
static inline float
binomial_put(float S, float X, float T, float r, float v) {
    float V[BINOMIAL_NUM];

    float dt = T / BINOMIAL_NUM;
    float u = exp(v * sqrt(dt));
    float d = 1. / u;
    float disc = exp(r * dt);
    float Pu = (disc - d) / (u - d);

#ifndef __NVPTX__
    // Generic version: initialize terminal payoffs, then sweep backwards.
    for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
        float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
        V[j] = max(0., X - S * upow);
    }

    for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
        for (uniform int k = 0; k < j; ++k)
            V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
#else
    /* loop unrolling helps NVVM to place V -> registers therefore boosting performance */
    /* takes looong time to compile... */
#if BINOMIAL_NUM != 64
#error "Cannot unroll. Please use generic version above"
#endif
    // with PTX target unroll loops which will store data in registers..

    /* first loop */
#define OP(j) { \
    float upow = pow(u, (float)(2*(j)-BINOMIAL_NUM)); \
    V[j] = max(0., X - S * upow); }
#define OP10(k) \
    OP(k+0); OP(k+1); OP(k+2); OP(k+3); OP(k+4) \
    OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9);

    OP10(0)
    OP10(10)
    OP10(20)
    OP10(30)
    OP10(40)
    OP10(50)
    OP(60)
    OP(61)
    OP(62)
    OP(63)
#undef OP10
#undef OP

    /* second loop */
#define OP(j) {\
    for (uniform int k = 0; k < (j); ++k) \
        V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; }
#define OP10(k) \
    OP(k+9); OP(k+8); OP(k+7); OP(k+6); OP(k+5); \
    OP(k+4); OP(k+3); OP(k+2); OP(k+1); OP(k+0);

    OP(63)
    OP(62)
    OP(61)
    OP(60)
    OP10(50)
    OP10(40)
    OP10(30)
    OP10(20)
    OP10(10)
    OP10(0)
#undef OP10
#undef OP
#endif
    return V[0];
}
// Single-gang (no tasks) binomial pricing over the whole array.
export void
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                  uniform float ra[], uniform float va[],
                  uniform float result[], uniform int count) {
    foreach (i = 0 ... count) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        result[i] = binomial_put(S, X, T, r, v);
    }
}
// Binomial pricing of one task's contiguous slice of the option arrays.
task void
binomial_task(uniform float Sa[], uniform float Xa[],
              uniform float Ta[], uniform float ra[],
              uniform float va[], uniform float result[],
              uniform int count) {
    // Ceiling-divide the work: the old floor-based split (count/taskCount)
    // silently skipped the trailing count % taskCount elements whenever
    // taskCount did not evenly divide count.
    uniform int span = (count + taskCount - 1) / taskCount;
    uniform int first = taskIndex * span;
    uniform int last = min(count, first + span);
    foreach (i = first ... last) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        result[i] = binomial_put(S, X, T, r, v);
    }
}
// Task-parallel binomial pricing entry point: each of nTasks tasks prices a
// contiguous slice of the arrays.
export void
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
                        uniform float Ta[], uniform float ra[],
                        uniform float va[], uniform float result[],
                        uniform int count) {
    uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
}

View File

@@ -0,0 +1,40 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef OPTIONS_DEFS_H
#define OPTIONS_DEFS_H 1
#define BINOMIAL_NUM 64
#endif // OPTIONS_DEFS_H

View File

@@ -0,0 +1,9 @@
EXAMPLE=radixSort
CPP_SRC=radixSort.cpp
ISPC_SRC=radixSort.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
#ISPC_FLAGS=-DDEBUG -g
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=radixSort
CXX_SRC=radixSort.cpp
ISPC_SRC=radixSort.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,15 @@
PROG=radixSort
ISPC_SRC=radixSort.ispc
CU_SRC=radixSort.cu
# NVCC_FLAGS=-Xptxas=-O1
# was "radixSort.cpp radixSort.cpp": the source file was listed twice,
# which would compile/link it twice.
CXX_SRC=radixSort.cpp
PTXCC_REGMAX=64
LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,154 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <cassert>
#include <iomanip>
#include "timing.h"
#include "ispc_malloc.h"
#include "radixSort_ispc.h"
// Renders an in-place text progress bar for step x of n (x in [0, n)),
// overlaying the percentage in the middle of the bar; emits '\r' until the
// final step, which emits '\n'.
static void progressBar(const int x, const int n, const int width = 50)
{
    assert(n > 1);
    assert(x >= 0 && x < n);
    assert(width > 10);
    const float f = static_cast<float>(x)/(n-1);
    const int w = static_cast<int>(f * width);

    // print bar
    std::string bstr("[");
    for (int i = 0; i < width; i++)
        bstr += i < w ? '=' : ' ';
    bstr += "]";

    // print percentage
    char pstr0[32];
    // was sprintf: bound the write to the destination buffer.
    snprintf(pstr0, sizeof(pstr0), " %2d %c ", static_cast<int>(f*100.0),'%');
    const std::string pstr(pstr0);
    std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2));

    std::cout << bstr;
    std::cout << (x == n-1 ? "\n" : "\r") << std::flush;
}
// Element sorted by radixSort: `key` is the 32-bit sort key and `val` the
// element's original position (assigned i at generation time).
struct Key
{
    int32_t key,val;
};
// Driver for the radixSort example: generates n random key/value pairs,
// times m runs of the ispc radix sort (keeping the best), and validates the
// result against std::sort of the bare keys.
int main (int argc, char *argv[])
{
    // n: number of pairs (argv[1], default 1e6); m: timing repetitions.
    // (unused locals j and l removed)
    int i, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50;
    double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;

    Key *keys = new Key [n];
    Key *keys_orig = new Key [n];
    unsigned int *keys_gold = new unsigned int [n];

    srand48(rtc()*65536);
    int sortBits = 32;
    assert(sortBits <= 32);

    // Random keys masked to sortBits bits; val records the original index.
#pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        keys[i].key = ((int)(drand48() * (1<<30))) & ((1ULL << sortBits) - 1);
        keys[i].val = i;
    }
    std::random_shuffle(keys, keys + n);

#pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        keys_gold[i] = keys[i].key;
        keys_orig[i] = keys[i];
    }

    ispcSetMallocHeapLimit(1024*1024*1024);
    ispc::radixSort_alloc(n);

    // Keep the best (minimum) time over m runs; restore the unsorted input
    // before each run.
    tISPC2 = 1e30;
    for (i = 0; i < m; i ++)
    {
        ispcMemcpy(keys, keys_orig, n*sizeof(Key));
        reset_and_start_timer();
        ispc::radixSort(n, (int64_t*)keys, sortBits);
        tISPC2 = std::min(tISPC2, get_elapsed_msec());
        if (argc != 3)
            progressBar (i, m);
    }
    ispc::radixSort_free();
    printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);

    // Validate against std::sort over the keys alone.
    std::sort(keys_gold, keys_gold + n);
    for (int i = 0; i < n; i++)
        assert(keys[i].key == keys_gold[i]);

#if 0
    for (i = 0; i < m; i ++)
    {
        ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
        reset_and_start_timer();
        sort_serial (n, code, order);
        tSerial += get_elapsed_msec();
        if (argc != 3)
            progressBar (i, m);
    }
    printf("[sort serial]:\t\t[%.3f] msec [%.3f Mpair/s]\n", tSerial, 1.0e-3*n*m/tSerial);
#ifndef _CUDA_
    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
#else
    printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", tSerial/tISPC2);
#endif
#endif

    // was `delete keys` etc.: memory from new[] must be released with
    // delete[] — plain delete on a new[] pointer is undefined behavior.
    delete[] keys;
    delete[] keys_orig;
    delete[] keys_gold;
    return 0;
}

View File

@@ -0,0 +1,401 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on radixSort from http://www.moderngpu.com
*/
#include "cuda_helpers.cuh"
#include <cassert>
#define NUMBITS 8
#define NUMDIGITS (1<<NUMBITS)
typedef long long Key;
// Atomically adds 'value' to *ptr and returns the previous value.
// Thin wrapper over CUDA atomicAdd, mirroring the ISPC intrinsic name
// so the .cu and .ispc kernels read the same.
__forceinline__ __device__ int atomic_add_global(int* ptr, int value)
{
  return atomicAdd(ptr, value);
}
// One step of a warp-level scan: shuffles 'partial' up by 'up_offset'
// lanes and adds it in only where the shuffle source was in range
// (predicate p set by shfl.up); lower lanes keep their value.
static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset)
{
  int result;
  asm(
      "{.reg .u32 r0;"
      ".reg .pred p;"
      "shfl.up.b32 r0|p, %1, %2, 0;"
      "@p add.u32 r0, r0, %3;"
      "mov.u32 %0, r0;}"
      : "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
  return result;
}
// Exclusive prefix sum of 'value' across the lanes of a warp:
// 5 shuffle-add steps (2^5 = 32 lanes) build the inclusive scan,
// then the lane's own contribution is subtracted to make it exclusive.
__forceinline__ __device__ int exclusive_scan_add(int value)
{
  int mysum = value;
#pragma unroll
  for(int i = 0; i < 5; ++i)
    mysum = shfl_scan_add_step(mysum, 1 << i);
  return mysum - value;
}
// Radix-sort pass 1: each task copies its slice of keysAll into
// sortedAll and histograms the NUMBITS-wide digit at 'bit' into a
// per-block counter array (countsAll) and the global histogram
// (countsGlobal). taskIndex/taskCount/programIndex/programCount come
// from cuda_helpers.cuh — presumably mapping to block/lane indices;
// confirm against that header.
__global__
void countPass(
    const Key keysAll[],
    Key sortedAll[],
    const int bit,
    const int numElements,
    int countsAll[],
    int countsGlobal[])
{
  const int blkIdx = taskIndex;
  const int numBlocks = taskCount;
  const int blkDim = (numElements + numBlocks - 1) / numBlocks;  // elements per block, rounded up
  const int mask = (1 << NUMBITS) - 1;
  const Key * keys = keysAll + blkIdx*blkDim;
  Key * sorted = sortedAll + blkIdx*blkDim;
  int * counts = countsAll + blkIdx*NUMDIGITS;   // this block's private histogram
  const int nloc = min(numElements - blkIdx*blkDim, blkDim);  // last block may be short
#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    counts[digit] = 0;
  for (int i = programIndex; i < nloc; i += programCount)
    if (i < nloc)
    {
      sorted[i] = keys[i];
      const int key = mask & ((unsigned int)keys[i] >> bit);  // digit value at 'bit'
      atomic_add_global(&counts[key], 1);
    }
  // fold the per-block histogram into the global one
#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    atomic_add_global(&countsGlobal[digit], counts[digit]);
}
// Radix-sort pass 3: scatters each element of this task's slice to its
// final position for the current digit, using the per-(block,digit)
// offsets produced by the exclusive-scan stage. Offsets are mutated as
// elements are placed, so equal digits stay in input order.
__global__
void sortPass(
    Key keysAll[],
    Key sorted[],
    int bit,
    int numElements,
    int digitOffsetsAll[])
{
  const int blkIdx = taskIndex;
  const int numBlocks = taskCount;
  const int blkDim = (numElements + numBlocks - 1) / numBlocks;
  const int keyIndex = blkIdx * blkDim;
  Key * keys = keysAll + keyIndex;
  const int nloc = min(numElements - keyIndex, blkDim);
  const int mask = (1 << NUMBITS) - 1;
  /* copy digit offset from Gmem to Lmem */
#if 1
  // 4 warps' worth of offsets; warpIdx (from cuda_helpers.cuh) selects
  // this warp's NUMDIGITS-entry slice
  __shared__ int digitOffsets_sh[NUMDIGITS*4];
  volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit];
#else
  int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS];
#endif
  for (int i = programIndex; i < nloc; i += programCount)
    if (i < nloc)
    {
      const int key = mask & ((unsigned int)keys[i] >> bit);
      int scatter;
      /* serialize the lanes so each gets a unique scatter slot;
         not a vector friendly loop */
#pragma unroll 1 /* needed, otherwise compiler unroll and optimizes the result :S */
      for (int iv = 0; iv < programCount; iv++)
        if (programIndex == iv)
          scatter = digitOffsets[key]++;
      sorted [scatter] = keys[i];
    }
}
// Scan stage 1: each task computes, per digit, an exclusive prefix sum
// of the per-block counts over its contiguous range of blocks
// [bbeg, bend), and stores the range total in partialSum for the
// global stage.
__global__
void partialScanLocal(
    int numBlocks,
    int excScanAll[],
    int countsAll[],
    int partialSumAll[])
{
  const int blkIdx = taskIndex;
  const int blkDim = (numBlocks+taskCount-1)/taskCount;  // blocks handled per task
  const int bbeg = blkIdx * blkDim;
  const int bend = min(bbeg + blkDim, numBlocks);
  // reinterpret the flat arrays as [block][digit] tables
  int (* countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll;
  int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
  int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;
#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
  {
    // block 0 is seeded with the global digit base already stored there
    int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
    for ( int block = bbeg; block < bend; block++)
    {
      const int y = countsBlock[block][digit];
      excScanBlock[block][digit] = prev;
      prev += y;
    }
    // total for this range = last exclusive value + last count
    partialSum[blkIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
  }
}
// Scan stage 2: one task per digit runs an exclusive prefix sum over
// the per-task partial sums, carrying the running total across chunks
// of programCount blocks via a warp shuffle from the last lane.
__global__
void partialScanGlobal(
    const int numBlocks,
    int partialSumAll[],
    int prefixSumAll[])
{
  int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;
  int (* prefixSum)[NUMDIGITS] = ( int (*)[NUMDIGITS]) prefixSumAll;
  const int digit = taskIndex;
  int carry = 0;
  for (int block = programIndex; block < numBlocks; block += programCount)
  {
    const int value = partialSum[block][digit];
    const int scan = exclusive_scan_add(value);
    if (block < numBlocks)   // NOTE(review): redundant, loop condition already guarantees this
      prefixSum[block][digit] = scan + carry;
    carry += __shfl(scan+value, programCount-1);  // chunk total from the last lane
  }
}
// Scan stage 3: adds each range's global carry (from partialScanGlobal)
// back into the local exclusive scans, completing the per-(block,digit)
// scatter offsets.
__global__
void completeScanGlobal(
    int numBlocks,
    int excScanAll[],
    int carryValueAll[])
{
  const int blkIdx = taskIndex;
  const int blkDim = (numBlocks+taskCount-1)/taskCount;
  const int bbeg = blkIdx * blkDim;
  const int bend = min(bbeg + blkDim, numBlocks);
  int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
  int (* carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll;
#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
  {
    const int carry = carryValue[blkIdx][digit];
    for ( int block = bbeg; block < bend; block++)
      excScanBlock[block][digit] += carry;
  }
}
// Runs the three-stage exclusive scan over the per-(block,digit)
// counts: local scans, a global scan of the partial sums, then adding
// the global carry back into each local range. 'launch'/'sync' are
// macros from cuda_helpers.cuh. 'scale' groups 8 blocks per scan task.
__device__ static
inline void radixExclusiveScan(
    const int numBlocks,
    int excScanPtr[],
    int countsPtr[],
    int partialSum[],
    int prefixSum[])
{
  const int scale = 8;
  launch (numBlocks/scale, 1,1, partialScanLocal)(numBlocks, excScanPtr, countsPtr, partialSum);
  sync;
  launch (NUMDIGITS,1,1,partialScanGlobal) (numBlocks/scale, partialSum, prefixSum);
  sync;
  launch (numBlocks/scale,1,1, completeScanGlobal) (numBlocks, excScanPtr, prefixSum);
  sync;
}
// Scratch state set up by radixSort_alloc___export: one device pool
// carved into the per-stage arrays below. nXxx values are element
// counts (ints), not bytes.
__device__ static int * memoryPool = NULL;
__device__ static int numBlocks;        // tasks launched per sorting pass
__device__ static int nSharedCounts;
__device__ static int nCountsGlobal;
__device__ static int nExcScan;
__device__ static int nCountsBlock;
__device__ static int nPartialSum;
__device__ static int nPrefixSum;
__device__ static int * sharedCounts;
__device__ static int * countsGlobal;   // global digit histogram (NUMDIGITS entries)
__device__ static int * excScan;        // per-(block,digit) exclusive scan / scatter offsets
__device__ static int * counts;         // per-block digit histograms
__device__ static int * partialSum;
__device__ static int * prefixSum;
__device__ static int numElementsBuf = 0;  // capacity of bufKeys (0 = not allocated)
__device__ static Key * bufKeys;           // secondary buffer for ping-ponging keys
// Sizes the scratch pool and carves it into the per-stage arrays.
// NOTE(review): parameter 'n' is unused — the scratch size depends
// only on numBlocks and NUMDIGITS, not on the element count.
__global__
void radixSort_alloc___export(const int n)
{
  assert(memoryPool == NULL);   // double allocation is a bug
  numBlocks = 13*32*4;  // NOTE(review): hard-coded, presumably for a 13-SM Kepler GPU — confirm
  nSharedCounts = NUMDIGITS*numBlocks;
  nCountsGlobal = NUMDIGITS;
  nExcScan = NUMDIGITS*numBlocks;
  nCountsBlock = NUMDIGITS*numBlocks;
  nPartialSum = NUMDIGITS*numBlocks;
  nPrefixSum = NUMDIGITS*numBlocks;
  const int nalloc =
    nSharedCounts +
    nCountsGlobal +
    nExcScan +
    nCountsBlock +
    nPartialSum +
    nPrefixSum;
  if (programIndex == 0)
    memoryPool = new int[nalloc];   // single allocation by lane 0
  // carve the pool into consecutive sub-arrays
  sharedCounts = memoryPool;
  countsGlobal = sharedCounts + nSharedCounts;
  excScan = countsGlobal + nCountsGlobal;
  counts = excScan + nExcScan;
  partialSum = counts + nCountsBlock;
  prefixSum = partialSum + nPartialSum;
}
// Host entry point: runs the device-side allocator on one 32-thread block.
extern "C"
void radixSort_alloc(const int n)
{
  radixSort_alloc___export<<<1,32>>>(n);
  sync;
}
// Releases the secondary key buffer if allocated. Lane 0 frees,
// mirroring the lane-0 allocation in radixSort___export; numElementsBuf
// is reset so the buffer is re-created on the next sort.
__device__ static
void radixSort_freeBufKeys()
{
  if (numElementsBuf > 0)
  {
    if (programIndex == 0)
      delete [] bufKeys;   // allocated with new Key[], so must use delete[]
    numElementsBuf = 0;
  }
}
// Device-side teardown: releases the scratch pool and the key buffer.
__global__ void radixSort_free___export()
{
  assert(memoryPool != NULL);   // free without alloc is a bug
  if (programIndex == 0)
    delete [] memoryPool;   // allocated with new int[], so must use delete[]
  memoryPool = NULL;
  radixSort_freeBufKeys();
}
// Host entry point: runs the device-side teardown on one 32-thread block.
extern "C"
void radixSort_free()
{
  radixSort_free___export<<<1,32>>>();
  sync;
}
// Device-side driver: LSD radix sort of 'keys' in NUMBITS-bit passes.
// Each pass copies keys into bufKeys while histogramming (countPass),
// scans the histograms into scatter offsets, then scatters bufKeys
// back into keys (sortPass) — so keys holds the result after each pass.
__global__ void radixSort___export(
    const int numElements,
    Key keys[],
    const int nBits)
{
#ifdef __NVPTX__
  assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
#endif
  // (re)allocate the secondary buffer if it is missing or too small
  if (numElementsBuf < numElements)
    radixSort_freeBufKeys();
  if (numElementsBuf == 0)
  {
    numElementsBuf = numElements;
    if (programIndex == 0)
      bufKeys = new Key[numElementsBuf];
  }
  const int blkDim = (numElements + numBlocks - 1) / numBlocks;  // NOTE(review): unused
  for ( int bit = 0; bit < nBits; bit += NUMBITS)
  {
    /* initialize histogram for each digit */
    for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
      countsGlobal[digit] = 0;
    /* compute histogram for each digit */
    launch (numBlocks,1,1, countPass)(keys, bufKeys, bit, numElements, counts, countsGlobal);
    sync;
    /* exclusive scan on global histogram */
    int carry = 0;
    excScan[0] = 0;
#pragma unroll 8
    for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    {
      const int value = countsGlobal[digit];
      const int scan = exclusive_scan_add(value);
      excScan[digit] = scan + carry;
      carry += __shfl(scan+value, programCount-1);
    }
    /* computing offsets for each digit */
    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
    /* sorting */
    launch (numBlocks,1,1,
            sortPass)(
                bufKeys,
                keys,
                bit,
                numElements,
                excScan);
    sync;
  }
}
// Host entry point: configures the cache and runs the device-side
// sort driver on one 32-thread block (which launches the real work).
extern "C"
void radixSort(
    const int numElements,
    Key keys[],
    const int nBits)
{
  cudaDeviceSetCacheConfig ( cudaFuncCachePreferEqual );
  radixSort___export<<<1,32>>>(numElements, keys, nBits);
  sync;
}

View File

@@ -0,0 +1,337 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on radixSort from http://www.moderngpu.com
*/
#define NUMBITS 8
#define NUMDIGITS (1<<NUMBITS)
typedef int64 Key;
// Radix-sort pass 1: each task copies its slice of keysAll into
// sortedAll and histograms the NUMBITS-wide digit at 'bit' into its
// private counts[] and the global histogram countsGlobal[].
task
void countPass(
    const uniform Key keysAll[],
    uniform Key sortedAll[],
    const uniform int bit,
    const uniform int numElements,
    uniform int countsAll[],
    uniform int countsGlobal[])
{
  const uniform int blockIdx = taskIndex;
  const uniform int numBlocks = taskCount;
  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;  // slice size, rounded up
  const uniform int mask = (1 << NUMBITS) - 1;
  const uniform Key * uniform keys = keysAll + blockIdx*blockDim;
  uniform Key * uniform sorted = sortedAll + blockIdx*blockDim;
  uniform int * uniform counts = countsAll + blockIdx*NUMDIGITS;  // this task's histogram
  const uniform int nloc = min(numElements - blockIdx*blockDim, blockDim);  // last slice may be short
  foreach (digit = 0 ... NUMDIGITS)
    counts[digit] = 0;
  foreach (i = 0 ... nloc)
  {
    sorted[i] = keys[i];
    const int key = mask & ((unsigned int)keys[i] >> bit);  // digit value at 'bit'
#ifdef __NVPTX__
    atomic_add_global(&counts[key], 1);
#else
    atomic_add_local(&counts[key], 1);
#endif
  }
  // fold the per-task histogram into the global one
  foreach (digit = 0 ... NUMDIGITS)
    atomic_add_global(&countsGlobal[digit], counts[digit]);
}
// Radix-sort pass 3: scatters each element of this task's slice to its
// final position for the current digit, using the per-(block,digit)
// offsets from the scan stage. foreach_active serializes the lanes so
// each element claims a unique, stable scatter slot.
task
void sortPass(
    uniform Key keysAll[],
    uniform Key sorted[],
    uniform int bit,
    uniform int numElements,
    uniform int digitOffsetsAll[])
{
  const uniform int blockIdx = taskIndex;
  const uniform int numBlocks = taskCount;
  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
  const uniform int keyIndex = blockIdx * blockDim;
  uniform Key * uniform keys = keysAll + keyIndex;
  const uniform int nloc = min(numElements - keyIndex, blockDim);
  const uniform int mask = (1 << NUMBITS) - 1;
  /* copy digit offset from Gmem to Lmem */
#if 1
  uniform int digitOffsets[NUMDIGITS];
  foreach (digit = 0 ... NUMDIGITS)
    digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit];
#else
  uniform int * uniform digitOffsets = &digitOffsetsAll[blockIdx*NUMDIGITS];
#endif
  foreach (i = 0 ... nloc)
  {
    const int key = mask & ((unsigned int)keys[i] >> bit);
    int scatter;
    /* not a vector friendly loop: one lane at a time takes an offset */
    foreach_active(iv)
      scatter = digitOffsets[key]++;
    sorted[scatter] = keys[i];
  }
}
// Scan stage 1: each task computes, per digit, an exclusive prefix sum
// of the per-block counts over its range of blocks [bbeg, bend), and
// stores the range total in partialSum for the global stage.
task
void partialScanLocal(
    uniform int numBlocks,
    uniform int excScanAll[],
    uniform int countsAll[],
    uniform int partialSumAll[])
{
  const uniform int blockIdx = taskIndex;
  const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;  // blocks per task
  const uniform int bbeg = blockIdx * blockDim;
  const uniform int bend = min(bbeg + blockDim, numBlocks);
  // reinterpret the flat arrays as [block][digit] tables
  uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll;
  uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
  uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
  foreach (digit = 0 ... NUMDIGITS)
  {
    // block 0 is seeded with the global digit base already stored there
    int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
    for (uniform int block = bbeg; block < bend; block++)
    {
      const int y = countsBlock[block][digit];
      excScanBlock[block][digit] = prev;
      prev += y;
    }
    // range total = last exclusive value + last count
    partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
  }
}
// Scan stage 2: one task per digit runs an exclusive prefix sum over
// the per-task partial sums, carrying the running total across chunks
// of programCount blocks via a broadcast from the last lane.
task
void partialScanGlobal(
    const uniform int numBlocks,
    uniform int partialSumAll[],
    uniform int prefixSumAll[])
{
  uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
  uniform int (* uniform prefixSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS]) prefixSumAll;
  const uniform int digit = taskIndex;
  int carry = 0;
  foreach (block = 0 ... numBlocks)
  {
    const int value = partialSum[block][digit];
    const int scan = exclusive_scan_add(value);
    prefixSum[block][digit] = scan + carry;
    carry += broadcast(scan+value, programCount-1);  // chunk total from the last lane
  }
}
// Scan stage 3: adds each range's global carry (from partialScanGlobal)
// back into the local exclusive scans, completing the scatter offsets.
task
void completeScanGlobal(
    uniform int numBlocks,
    uniform int excScanAll[],
    uniform int carryValueAll[])
{
  const uniform int blockIdx = taskIndex;
  const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
  const uniform int bbeg = blockIdx * blockDim;
  const uniform int bend = min(bbeg + blockDim, numBlocks);
  uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
  uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll;
  foreach (digit = 0 ... NUMDIGITS)
  {
    const int carry = carryValue[blockIdx][digit];
    for (uniform int block = bbeg; block < bend; block++)
      excScanBlock[block][digit] += carry;
  }
}
// Runs the three-stage exclusive scan over the per-(block,digit)
// counts: local scans, a global scan of partial sums, then adding the
// global carry back into each range. 'scale' groups 8 blocks per task.
static
inline void radixExclusiveScan(
    const uniform int numBlocks,
    uniform int excScanPtr[],
    uniform int countsPtr[],
    uniform int partialSum[],
    uniform int prefixSum[])
{
  const uniform int scale = 8;
  launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
  sync;
  launch [NUMDIGITS] partialScanGlobal(numBlocks/scale, partialSum, prefixSum);
  sync;
  launch [numBlocks/scale] completeScanGlobal(numBlocks, excScanPtr, prefixSum);
  sync;
}
// Scratch state set up by radixSort_alloc: one pool carved into the
// per-stage arrays below. nXxx values are element counts, not bytes.
static uniform int * uniform memoryPool = NULL;
static uniform int numBlocks;        // tasks launched per sorting pass
static uniform int nSharedCounts;
static uniform int nCountsGlobal;
static uniform int nExcScan;
static uniform int nCountsBlock;
static uniform int nPartialSum;
static uniform int nPrefixSum;
static uniform int * uniform sharedCounts;
static uniform int * uniform countsGlobal;  // global digit histogram (NUMDIGITS entries)
static uniform int * uniform excScan;       // per-(block,digit) scatter offsets
static uniform int * uniform counts;        // per-block digit histograms
static uniform int * uniform partialSum;
static uniform int * uniform prefixSum;
static uniform int numElementsBuf = 0;      // capacity of bufKeys (0 = not allocated)
static uniform Key * uniform bufKeys;       // secondary buffer for ping-ponging keys
// Sizes the scratch pool and carves it into the per-stage arrays.
// NOTE(review): parameter 'n' is unused — the scratch size depends
// only on numBlocks and NUMDIGITS, not on the element count.
export void radixSort_alloc(const uniform int n)
{
  assert(memoryPool == NULL);   // double allocation is a bug
  numBlocks = num_cores()*4;
#ifdef __NVPTX__
  numBlocks = 13*32*4; //num_cores()*4;
#endif
  nSharedCounts = NUMDIGITS*numBlocks;
  nCountsGlobal = NUMDIGITS;
  nExcScan = NUMDIGITS*numBlocks;
  nCountsBlock = NUMDIGITS*numBlocks;
  nPartialSum = NUMDIGITS*numBlocks;
  nPrefixSum = NUMDIGITS*numBlocks;
  const uniform int nalloc =
    nSharedCounts +
    nCountsGlobal +
    nExcScan +
    nCountsBlock +
    nPartialSum +
    nPrefixSum;
  memoryPool = uniform new uniform int[nalloc];
  // carve the pool into consecutive sub-arrays
  sharedCounts = memoryPool;
  countsGlobal = sharedCounts + nSharedCounts;
  excScan = countsGlobal + nCountsGlobal;
  counts = excScan + nExcScan;
  partialSum = counts + nCountsBlock;
  prefixSum = partialSum + nPartialSum;
}
// Releases the secondary key buffer if allocated; numElementsBuf is
// reset so the buffer is re-created on the next sort.
static
void radixSort_freeBufKeys()
{
  if (numElementsBuf > 0)
  {
    delete bufKeys;
    numElementsBuf = 0;
  }
}
// Releases the scratch pool and the key buffer.
export void radixSort_free()
{
  assert(memoryPool != NULL);   // free without alloc is a bug
  delete memoryPool;
  memoryPool = NULL;
  radixSort_freeBufKeys();
}
// LSD radix sort of 'keys' in NUMBITS-bit passes. Each pass copies
// keys into bufKeys while histogramming (countPass), scans the
// histograms into scatter offsets, then scatters bufKeys back into
// keys (sortPass) — so keys holds the result after every pass.
export void radixSort(
    const uniform int numElements,
    uniform Key keys[],
    const uniform int nBits)
{
#ifdef __NVPTX__
  assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
#endif
  // (re)allocate the secondary buffer if it is missing or too small
  if (numElementsBuf < numElements)
    radixSort_freeBufKeys();
  if (numElementsBuf == 0)
  {
    numElementsBuf = numElements;
    bufKeys = uniform new uniform Key[numElementsBuf];
  }
  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;  // NOTE(review): unused
  for (uniform int bit = 0; bit < nBits; bit += NUMBITS)
  {
    /* initialize histogram for each digit */
    foreach (digit = 0 ... NUMDIGITS)
      countsGlobal[digit] = 0;
    /* compute histogram for each digit */
    launch [numBlocks] countPass(keys, bufKeys, bit, numElements, counts, countsGlobal);
    sync;
    /* exclusive scan on global histogram */
    int carry = 0;
    excScan[0] = 0;
    foreach (digit = 0 ... NUMDIGITS)
    {
      const int value = countsGlobal[digit];
      const int scan = exclusive_scan_add(value);
      excScan[digit] = scan + carry;
      carry += broadcast(scan+value, programCount-1);
    }
    /* computing offsets for each digit */
    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
    /* sorting */
    launch [numBlocks]
      sortPass(
          bufKeys,
          keys,
          bit,
          numElements,
          excScan);
    sync;
  }
}

2
examples/portable/rt/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
rt
*.ppm

View File

@@ -0,0 +1,8 @@
EXAMPLE=rt
CPP_SRC=rt.cpp
ISPC_SRC=rt.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=rt
CXX_SRC=rt.cpp
ISPC_SRC=rt.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,13 @@
PROG=rt
ISPC_SRC=rt.ispc
CU_SRC=rt.cu
CXX_SRC=rt.cpp
PTXCC_REGMAX=32
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1 @@
../../rt/cornell.bvh

View File

@@ -0,0 +1 @@
../../rt/cornell.camera

229
examples/portable/rt/rt.cpp Normal file
View File

@@ -0,0 +1,229 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <cstdio>
#include <cmath>
#include <algorithm>
#include <cassert>
#include <cstring>
#include <sys/types.h>
#include "timing.h"
#include "rt_ispc.h"
#include "ispc_malloc.h"
using namespace ispc;
typedef unsigned int uint;
// Writes the object-id image as a binary PPM file, hashing each id's
// bits into a pseudo-random RGB color (bit 3i -> red, 3i+1 -> green,
// 3i+2 -> blue, filled from the high bit down). The depth image is
// accepted for symmetry but not written. Exits on fopen failure.
static void writeImage(int *idImage, float *depthImage, int width, int height,
                       const char *filename) {
    FILE *f = fopen(filename, "wb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    fprintf(f, "P6\n%d %d\n255\n", width, height);
    const int nPixels = width * height;
    for (int pix = 0; pix < nPixels; ++pix) {
        // spread the id's bits over the three color channels
        const int id = idImage[pix];
        unsigned char rgb[3] = { 0, 0, 0 };
        for (int bit = 0; bit < 8; ++bit) {
            for (int c = 0; c < 3; ++c) {
                const int b = (id >> (3 * bit + c)) & 1;
                rgb[c] |= (unsigned char)(b << (7 - bit));
            }
        }
        fputc(rgb[0], f);
        fputc(rgb[1], f);
        fputc(rgb[2], f);
    }
    fclose(f);
    printf("Wrote image file %s\n", filename);
}
// Prints command-line help to stderr and exits with failure status.
static void usage() {
    fprintf(stderr, "rt <scene name base> [--scale=<factor>] [ispc iterations] [tasks iterations] [serial iterations]\n");
    exit(1);
}
int main(int argc, char *argv[]) {
static unsigned int test_iterations[] = {3, 7, 1};
float scale = 1.f;
const char *filename = NULL;
if (argc < 2) usage();
filename = argv[1];
if (argc > 2) {
if (strncmp(argv[2], "--scale=", 8) == 0) {
scale = atof(argv[2] + 8);
}
}
if ((argc == 6) || (argc == 5)) {
for (int i = 0; i < 3; i++) {
test_iterations[i] = atoi(argv[argc - 3 + i]);
}
}
#define READ(var, n) \
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
return 1; \
} else /* eat ; */
//
// Read the camera specification information from the camera file
//
char fnbuf[1024];
sprintf(fnbuf, "%s.camera", filename);
FILE *f = fopen(fnbuf, "rb");
if (!f) {
perror(fnbuf);
return 1;
}
//
// Nothing fancy, and trouble if we run on a big-endian system, just
// fread in the bits
//
int baseWidth, baseHeight;
// float camera2world[4][4], raster2camera[4][4];
float *camera2world_ispc = new float[4*4];
float *raster2camera_ispc = new float[4*4];
float (*camera2world )[4] = (float (*)[4])camera2world_ispc;
float (*raster2camera)[4] = (float (*)[4])raster2camera_ispc;
READ(baseWidth, 1);
READ(baseHeight, 1);
READ(camera2world[0][0], 16);
READ(raster2camera[0][0], 16);
//
// Read in the serialized BVH
//
sprintf(fnbuf, "%s.bvh", filename);
f = fopen(fnbuf, "rb");
if (!f) {
perror(fnbuf);
return 1;
}
// The BVH file starts with an int that gives the total number of BVH
// nodes
uint nNodes;
READ(nNodes, 1);
LinearBVHNode *nodes = new LinearBVHNode[nNodes];
for (unsigned int i = 0; i < nNodes; ++i) {
// Each node is 6x floats for a boox, then an integer for an offset
// to the second child node, then an integer that encodes the type
// of node, the total number of int it if a leaf node, etc.
float b[6];
READ(b[0], 6);
nodes[i].bounds[0][0] = b[0];
nodes[i].bounds[0][1] = b[1];
nodes[i].bounds[0][2] = b[2];
nodes[i].bounds[1][0] = b[3];
nodes[i].bounds[1][1] = b[4];
nodes[i].bounds[1][2] = b[5];
READ(nodes[i].offset, 1);
READ(nodes[i].nPrimitives, 1);
READ(nodes[i].splitAxis, 1);
READ(nodes[i].pad, 1);
}
// And then read the triangles
uint nTris;
READ(nTris, 1);
Triangle *triangles = new Triangle[nTris];
for (uint i = 0; i < nTris; ++i) {
// 9x floats for the 3 vertices
float v[9];
READ(v[0], 9);
float *vp = v;
for (int j = 0; j < 3; ++j) {
triangles[i].p[j][0] = *vp++;
triangles[i].p[j][1] = *vp++;
triangles[i].p[j][2] = *vp++;
}
// And create an object id
triangles[i].id = i+1;
}
fclose(f);
int height = int(baseHeight * scale);
int width = int(baseWidth * scale);
// allocate images; one to hold hit object ids, one to hold depth to
// the first interseciton
int *id = new int[width*height];
float *image = new float[width*height];
ispc_memset(id, 0, width*height*sizeof(int));
ispc_memset(image, 0, width*height*sizeof(float));
//
// Run 3 iterations with ispc + 1 core, record the minimum time
//
double minTimeISPCtasks = 1e30;
for (int i = 0; i < test_iterations[1]; ++i) {
reset_and_start_timer();
raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
camera2world, image, id, nodes, triangles);
double dt = get_elapsed_msec();
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
}
printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n",
minTimeISPCtasks, width, height);
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
return 0;
}

373
examples/portable/rt/rt.cu Normal file
View File

@@ -0,0 +1,373 @@
#include "cuda_helpers.cuh"
#define float3 Float3
// Minimal float3 replacement with the element-wise arithmetic the ray
// tracer needs ('float3' is #defined to this type above).
struct Float3
{
  float x,y,z;
  // element-wise addition
  __device__ friend Float3 operator+(const Float3 a, const Float3 b)
  {
    Float3 c;
    c.x = a.x+b.x;
    c.y = a.y+b.y;
    c.z = a.z+b.z;
    return c;
  }
  // element-wise subtraction
  __device__ friend Float3 operator-(const Float3 a, const Float3 b)
  {
    Float3 c;
    c.x = a.x-b.x;
    c.y = a.y-b.y;
    c.z = a.z-b.z;
    return c;
  }
  // element-wise division
  __device__ friend Float3 operator/(const Float3 a, const Float3 b)
  {
    Float3 c;
    c.x = a.x/b.x;
    c.y = a.y/b.y;
    c.z = a.z/b.z;
    return c;
  }
  // scalar-over-vector division (used for reciprocal directions)
  __device__ friend Float3 operator/(const float a, const Float3 b)
  {
    Float3 c;
    c.x = a/b.x;
    c.y = a/b.y;
    c.z = a/b.z;
    return c;
  }
  // element-wise multiplication
  __device__ friend Float3 operator*(const Float3 a, const Float3 b)
  {
    Float3 c;
    c.x = a.x*b.x;
    c.y = a.y*b.y;
    c.z = a.z*b.z;
    return c;
  }
  // vector-times-scalar scaling
  __device__ friend Float3 operator*(const Float3 a, const float b)
  {
    Float3 c;
    c.x = a.x*b;
    c.y = a.y*b;
    c.z = a.z*b;
    return c;
  }
};
#define int8 char
#define int16 short
// A ray with precomputed reciprocal direction (invDir) and per-axis
// direction-sign flags (dirIsNeg*) used to order BVH child traversal;
// [mint, maxt] is the active parametric range and hitId records the id
// of the closest hit primitive (0 = no hit).
struct Ray {
  float3 origin, dir, invDir;
  unsigned int dirIsNeg0, dirIsNeg1, dirIsNeg2;
  float mint, maxt;
  int hitId;
};
// A triangle: 3 vertices padded to 4 floats each, plus an object id;
// pad brings the struct to 64 bytes (presumably for aligned loads —
// confirm against the loader in rt.cpp, which fills only p[j][0..2]).
struct Triangle {
  float p[3][4];
  int id;
  int pad[3];
};
// Flattened BVH node: bounds is the AABB ([0]=min corner, [1]=max);
// 'offset' is the first-primitive index when nPrimitives > 0 (leaf)
// or the second-child index for interior nodes (see BVHIntersect).
struct LinearBVHNode {
  float bounds[2][3];
  unsigned int offset; // num primitives for leaf, second child for interior
  unsigned int8 nPrimitives;
  unsigned int8 splitAxis;
  unsigned int16 pad;
};
// 3D cross product v1 x v2.
__device__
static inline float3 Cross(const float3 v1, const float3 v2) {
  float v1x = v1.x, v1y = v1.y, v1z = v1.z;
  float v2x = v2.x, v2y = v2.y, v2z = v2.z;
  float3 ret;
  ret.x = (v1y * v2z) - (v1z * v2y);
  ret.y = (v1z * v2x) - (v1x * v2z);
  ret.z = (v1x * v2y) - (v1y * v2x);
  return ret;
}
// 3D dot product a . b.
__device__
static inline float Dot(const float3 a, const float3 b) {
  return a.x * b.x + a.y * b.y + a.z * b.z;
}
// Builds the camera ray for raster position (x, y): transforms the
// raster coordinate into camera space (with homogeneous divide), then
// rotates the direction and reads the origin out of camera2world.
// Also precomputes invDir and the per-axis sign flags for traversal.
__device__
inline
static void generateRay( const float raster2camera[4][4],
    const float camera2world[4][4],
    float x, float y, Ray &ray) {
  ray.mint = 0.f;
  ray.maxt = 1e30f;
  ray.hitId = 0;
  // transform raster coordinate (x, y, 0) to camera space
  float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
  float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
  float camz = raster2camera[2][3];
  float camw = raster2camera[3][3];
  camx /= camw;
  camy /= camw;
  camz /= camw;
  // rotate the camera-space direction into world space
  ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
    camera2world[0][2] * camz;
  ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
    camera2world[1][2] * camz;
  ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
    camera2world[2][2] * camz;
  // the ray origin is the camera position (translation column, w-divided)
  ray.origin.x = camera2world[0][3] / camera2world[3][3];
  ray.origin.y = camera2world[1][3] / camera2world[3][3];
  ray.origin.z = camera2world[2][3] / camera2world[3][3];
  ray.invDir = 1.f / ray.dir;
#if 0
  ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
  ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
  ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
#else
  // any() comes from cuda_helpers.cuh — flag is set if any lane's
  // direction component is negative
  ray.dirIsNeg0 = any(ray.invDir.x < 0) ? 1 : 0;
  ray.dirIsNeg1 = any(ray.invDir.y < 0) ? 1 : 0;
  ray.dirIsNeg2 = any(ray.invDir.z < 0) ? 1 : 0;
#endif
}
// Ray/AABB test: intersects the ray's [mint, maxt] range against the
// three axis-aligned slabs of 'bounds' and returns true if a non-empty
// overlap remains.
__device__
inline
static bool BBoxIntersect(const float bounds[2][3],
    const Ray &ray) {
  float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
  float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
  float t0 = ray.mint, t1 = ray.maxt;
  // Check all three axis-aligned slabs. Don't try to early out; it's
  // not worth the trouble
  float3 tNear = (bounds0 - ray.origin) * ray.invDir;
  float3 tFar = (bounds1 - ray.origin) * ray.invDir;
  // swap so tNear <= tFar on each axis (handles negative directions)
  if (tNear.x > tFar.x) {
    float tmp = tNear.x;
    tNear.x = tFar.x;
    tFar.x = tmp;
  }
  t0 = max(tNear.x, t0);
  t1 = min(tFar.x, t1);
  if (tNear.y > tFar.y) {
    float tmp = tNear.y;
    tNear.y = tFar.y;
    tFar.y = tmp;
  }
  t0 = max(tNear.y, t0);
  t1 = min(tFar.y, t1);
  if (tNear.z > tFar.z) {
    float tmp = tNear.z;
    tNear.z = tFar.z;
    tFar.z = tmp;
  }
  t0 = max(tNear.z, t0);
  t1 = min(tFar.z, t1);
  return (t0 <= t1);
}
// Ray/triangle test via barycentric coordinates: computes (b1, b2) and
// the hit parameter t; on a hit inside [mint, maxt] it shortens
// ray.maxt and records the triangle's id. All reject conditions are
// evaluated branch-free-ish by accumulating into 'hit'.
__device__
inline
static bool TriIntersect(const Triangle &tri, Ray &ray) {
  float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
  float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
  float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
  float3 e1 = p1 - p0;
  float3 e2 = p2 - p0;
  float3 s1 = Cross(ray.dir, e2);
  float divisor = Dot(s1, e1);
  bool hit = true;
  if (divisor == 0.)   // ray parallel to the triangle plane
    hit = false;
  float invDivisor = 1.f / divisor;
  // Compute first barycentric coordinate
  float3 d = ray.origin - p0;
  float b1 = Dot(d, s1) * invDivisor;
  if (b1 < 0. || b1 > 1.)
    hit = false;
  // Compute second barycentric coordinate
  float3 s2 = Cross(d, e1);
  float b2 = Dot(ray.dir, s2) * invDivisor;
  if (b2 < 0. || b1 + b2 > 1.)
    hit = false;
  // Compute _t_ to intersection point
  float t = Dot(e2, s2) * invDivisor;
  if (t < ray.mint || t > ray.maxt)
    hit = false;
  if (hit) {
    ray.maxt = t;
    ray.hitId = tri.id;
  }
  return hit;
}
// Traverses the flattened BVH with an explicit stack ('todo', caller
// provides 64 entries — see raytrace_tile) and tests the ray against
// each leaf's triangles. The near child (chosen via the ray's per-axis
// sign flag) is visited first; the far child is pushed. On return,
// r.maxt/r.hitId hold the closest hit. Returns whether anything hit.
__device__
inline
bool BVHIntersect(const LinearBVHNode nodes[],
    const Triangle tris[], Ray &r,
    int todo[]) {
  Ray ray = r;   // work on a local copy; write back maxt/hitId at the end
  bool hit = false;
  // Follow ray through BVH nodes to find primitive intersections
  int todoOffset = 0, nodeNum = 0;
  while (true) {
    // Check ray against BVH node
    LinearBVHNode node = nodes[nodeNum];
    if (any(BBoxIntersect(node.bounds, ray))) {
      unsigned int nPrimitives = node.nPrimitives;
      if (nPrimitives > 0) {
        // Intersect ray with primitives in leaf BVH node
        unsigned int primitivesOffset = node.offset;
        for ( unsigned int i = 0; i < nPrimitives; ++i) {
          if (TriIntersect(tris[primitivesOffset+i], ray))
            hit = true;
        }
        if (todoOffset == 0)
          break;
        nodeNum = todo[--todoOffset];
      }
      else {
        // Put far BVH node on _todo_ stack, advance to near node
        int dirIsNeg;
        if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0;
        if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1;
        if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2;
        if (dirIsNeg) {
          todo[todoOffset++] = nodeNum + 1;
          nodeNum = node.offset;
        }
        else {
          todo[todoOffset++] = node.offset;
          nodeNum = nodeNum + 1;
        }
      }
    }
    else {
      // missed this node's bounds: pop the next node or finish
      if (todoOffset == 0)
        break;
      nodeNum = todo[--todoOffset];
    }
  }
  r.maxt = ray.maxt;
  r.hitId = ray.hitId;
  return hit;
}
__device__
inline
// Render the tile [x0,x1) x [y0,y1): one ray per pixel, with pixel
// coordinates scaled so a (width x height) image samples the same view as
// a (baseWidth x baseHeight) one.  Hit distance goes to image[], the hit
// primitive id to id[].
static void raytrace_tile( int x0, int x1,
                           int y0, int y1,
                           int width, int height,
                           int baseWidth, int baseHeight,
                           const float raster2camera[4][4],
                           const float camera2world[4][4],
                           float image[], int id[],
                           const LinearBVHNode nodes[],
                           const Triangle triangles[]) {
    float widthScale = (float)(baseWidth) / (float)(width);
    float heightScale = (float)(baseHeight) / (float)(height);
#if 0
    // Heap-allocated per-thread BVH traversal stack (disabled).
    int * todo = new int[64];
#define ALLOC
#else
    int todo[64];   // per-thread BVH traversal stack
#endif
    for (int y = y0; y < y1; y++)
        for (int x = x0 + programIndex; x < x1; x += programCount)
            if (x < x1)
            {
                Ray ray;
                generateRay(raster2camera, camera2world, x*widthScale,
                            y*heightScale, ray);
                BVHIntersect(nodes, triangles, ray, todo);
                int offset = y * width + x;
                image[offset] = ray.maxt;   // distance of nearest hit
                id[offset] = ray.hitId;     // id of the primitive hit
            }
#ifdef ALLOC
    // Fixed: was 'delete todo' -- memory from new[] must be released with
    // the array form, delete[].
    delete[] todo;
#endif
}
__global__
// Task body: map taskIndex onto one (tileW x tileH) tile of the image and
// render it; tiles on the right/bottom edge are clamped to the image size.
void raytrace_tile_task( int width, int height,
                         int baseWidth, int baseHeight,
                         const float raster2camera[4][4],
                         const float camera2world[4][4],
                         float image[], int id[],
                         const LinearBVHNode nodes[],
                         const Triangle triangles[]) {
    const int tileW = 64, tileH = 8;   // must match dx, dy in the launcher below
    const int tilesPerRow = (width + (tileW - 1)) / tileW;
    const int tileCol = taskIndex % tilesPerRow;
    const int tileRow = taskIndex / tilesPerRow;
    const int x0 = tileCol * tileW;
    const int y0 = tileRow * tileH;
    const int x1 = min(x0 + tileW, width);
    const int y1 = min(y0 + tileH, height);
    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
}
// Device-side launcher: partition the image into 64x8 tiles and spawn one
// raytrace_tile_task per tile, then wait for them all to finish.
extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
                                                         int baseWidth, int baseHeight,
                                                         const float raster2camera[4][4],
                                                         const float camera2world[4][4],
                                                         float image[], int id[],
                                                         const LinearBVHNode nodes[],
                                                         const Triangle triangles[]) {
    const int tileW = 64, tileH = 8;   // must match raytrace_tile_task
    const int tilesX = (width + (tileW - 1)) / tileW;
    const int tilesY = (height + (tileH - 1)) / tileH;
    const int taskCount = tilesX * tilesY;
    launch(taskCount,1,1,raytrace_tile_task)
        (width, height, baseWidth, baseHeight,
         raster2camera, camera2world,
         image, id, nodes, triangles);
    cudaDeviceSynchronize();
}
// Host entry point: run the device-side launcher with a single 32-thread
// block and wait until the device (and all tasks it spawned) is idle.
extern "C" __host__ void raytrace_ispc_tasks( int width, int height,
                                              int baseWidth, int baseHeight,
                                              const float raster2camera[4][4],
                                              const float camera2world[4][4],
                                              float image[], int id[],
                                              const LinearBVHNode nodes[],
                                              const Triangle triangles[]) {
    raytrace_ispc_tasks___export<<<1,32>>>(
        width, height, baseWidth, baseHeight,
        raster2camera, camera2world,
        image, id, nodes, triangles);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,351 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Boolean type used across the kernels; int keeps the in-memory layout
// explicit (the 'bool' alternative is disabled).
#if 1
typedef int bool_t;
#else
typedef bool bool_t;
#endif
typedef float<3> float3;
// On the NVPTX target, data marked uniform_t is treated as varying;
// elsewhere it is plain uniform.
#ifdef __NVPTX__
#define uniform_t varying
#else
#define uniform_t uniform
#endif
// Three-component integer vector.
struct int3
{
    int x,y,z;
};
struct Ray {
    // invDir caches 1/dir for the slab tests in BBoxIntersect.
    float3 origin, dir, invDir;
    // Per axis: 1 if any lane's direction component is negative (set in
    // generateRay), used to pick BVH child traversal order.
    uniform unsigned int dirIsNeg[3];
    // Valid parametric range along the ray; maxt shrinks as hits are found.
    float mint, maxt;
    // Id of the closest primitive hit so far (0 = none yet).
    int hitId;
};
struct Triangle {
    float p[3][4];   // three vertices, each padded to 4 floats
    int id;          // primitive id reported on intersection
    int pad[3];      // pad the struct size to a 16-byte multiple
};
struct LinearBVHNode {
    float bounds[2][3];        // the two corners of the node's AABB
    // Leaf: offset of the node's first primitive in the triangle array.
    // Interior: index of the second child (first child follows the node).
    unsigned int offset;
    unsigned int8 nPrimitives; // primitive count; 0 marks an interior node
    unsigned int8 splitAxis;   // interior: partition axis (0/1/2)
    unsigned int16 pad;
};
// Cross product of two 3-vectors.
static inline float3 Cross(const float3 v1, const float3 v2) {
    float3 result;
    result.x = v1.y * v2.z - v1.z * v2.y;
    result.y = v1.z * v2.x - v1.x * v2.z;
    result.z = v1.x * v2.y - v1.y * v2.x;
    return result;
}
// Dot product of two 3-vectors.
static inline float Dot(const float3 a, const float3 b) {
    const float xyPart = a.x * b.x + a.y * b.y;
    return xyPart + a.z * b.z;
}
#if 1
inline
#endif
// Build the camera ray for raster-space position (x, y): initializes
// origin/dir in world space, the cached inverse direction, the per-axis
// direction signs, and the valid [mint, maxt] range.
static void generateRay(uniform const float raster2camera[4][4],
                        uniform const float camera2world[4][4],
                        float x, float y, Ray &ray) {
    ray.mint = 0.f;
    ray.maxt = 1e30f;
    ray.hitId = 0;
    // transform raster coordinate (x, y, 0) to camera space
    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
    // NOTE(review): z and w take only the translation column -- assumes
    // raster2camera[2][0..1] and [3][0..1] are zero for these cameras.
    float camz = raster2camera[2][3];
    float camw = raster2camera[3][3];
    camx /= camw;   // homogeneous divide
    camy /= camw;
    camz /= camw;
    // Rotate the camera-space direction into world space.
    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
        camera2world[0][2] * camz;
    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
        camera2world[1][2] * camz;
    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
        camera2world[2][2] * camz;
    // The camera position (translation column) is the ray origin.
    ray.origin.x = camera2world[0][3] / camera2world[3][3];
    ray.origin.y = camera2world[1][3] / camera2world[3][3];
    ray.origin.z = camera2world[2][3] / camera2world[3][3];
    ray.invDir = 1.f / ray.dir;   // cached for the slab tests
    // any() reduces over the gang: flag an axis if any lane is negative.
    ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
    ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
    ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
}
#if 1
inline
#endif
// Slab test: returns nonzero iff the ray's [mint, maxt] interval overlaps
// the axis-aligned box 'bounds'.
static bool_t BBoxIntersect(const uniform float bounds[2][3],
                            const Ray &ray) {
    const uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
    const uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
    float t0 = ray.mint, t1 = ray.maxt;
    // Check all three axis-aligned slabs.  Don't try to early out; it's
    // not worth the trouble
    float3 tNear = (bounds0 - ray.origin) * ray.invDir;
    float3 tFar = (bounds1 - ray.origin) * ray.invDir;
    // Per axis: order the two plane hits, then shrink [t0,t1] to the overlap.
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
        tFar.x = tmp;
    }
    t0 = max(tNear.x, t0);
    t1 = min(tFar.x, t1);
    if (tNear.y > tFar.y) {
        float tmp = tNear.y;
        tNear.y = tFar.y;
        tFar.y = tmp;
    }
    t0 = max(tNear.y, t0);
    t1 = min(tFar.y, t1);
    if (tNear.z > tFar.z) {
        float tmp = tNear.z;
        tNear.z = tFar.z;
        tFar.z = tmp;
    }
    t0 = max(tNear.z, t0);
    t1 = min(tFar.z, t1);
    // Non-empty overlap interval means the box is hit.
    return (t0 <= t1);
}
#if 1
inline
#endif
// Moeller-Trumbore ray/triangle test.  Evaluates all terms with no early
// exit; on a hit, shortens ray.maxt and records the triangle id.
static bool_t TriIntersect(const uniform_t Triangle tri, Ray &ray) {
    const uniform_t float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    const uniform_t float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    const uniform_t float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
    const uniform_t float3 e1 = p1 - p0;   // triangle edge vectors
    const uniform_t float3 e2 = p2 - p0;
    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
    bool_t hit = true;
    // Degenerate / edge-on case: invDivisor below may be inf, but the
    // result is already rejected.
    if (divisor == 0.)
        hit = false;
    float invDivisor = 1.f / divisor;
    // Compute first barycentric coordinate
    float3 d = ray.origin - p0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        hit = false;
    // Compute second barycentric coordinate
    float3 s2 = Cross(d, e1);
    float b2 = Dot(ray.dir, s2) * invDivisor;
    if (b2 < 0. || b1 + b2 > 1.)
        hit = false;
    // Compute _t_ to intersection point
    float t = Dot(e2, s2) * invDivisor;
    if (t < ray.mint || t > ray.maxt)
        hit = false;
    if (hit) {
        ray.maxt = t;     // later hits must now be closer
        ray.hitId = tri.id;
    }
    return hit;
}
#if 1
inline
#endif
// Iterative BVH traversal over a fixed 64-entry node stack.  Works on a
// local copy of the ray (so maxt can shrink as closer hits are found) and
// writes maxt/hitId back to 'r'.  Returns nonzero on any hit.
bool_t
BVHIntersect(const uniform LinearBVHNode nodes[],
             const uniform Triangle tris[], Ray &r) {
    Ray ray = r;
    bool_t hit = false;
    // Follow ray through BVH nodes to find primitive intersections
    uniform int todoOffset = 0, nodeNum = 0;
    uniform int todo[64];   // stack of node indices still to visit
    while (true) {
        // Check ray against BVH node
        const uniform LinearBVHNode node = nodes[nodeNum];
        // any(): descend if the box is hit in at least one lane.
        if (any(BBoxIntersect(node.bounds, ray))) {
            const uniform unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                const uniform unsigned int primitivesOffset = node.offset;
                for (uniform_t unsigned int i = 0; i < nPrimitives; ++i) {
                    if (TriIntersect(tris[primitivesOffset+i], ray))
                        hit = true;
                }
                // Leaf done: pop the next node, or stop if the stack is empty.
                if (todoOffset == 0)
                    break;
                nodeNum = todo[--todoOffset];
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
#if 0 /* fails */
                int dirIsNeg = r.dirIsNeg[node.splitAxis];
#else
                // The indexed form above is disabled ("fails") --
                // presumably it misbehaved on some target; pick the axis
                // with explicit comparisons instead.
                int dirIsNeg;
                if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg[0];
                if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg[1];
                if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg[2];
#endif
                if (dirIsNeg) {
                    // Negative direction: second child (node.offset) is nearer.
                    todo[todoOffset++] = nodeNum + 1;
                    nodeNum = node.offset;
                }
                else {
                    todo[todoOffset++] = node.offset;
                    nodeNum = nodeNum + 1;
                }
            }
        }
        else {
            // Missed the node's bounds in every lane: pop or finish.
            if (todoOffset == 0)
                break;
            nodeNum = todo[--todoOffset];
        }
    }
    r.maxt = ray.maxt;
    r.hitId = ray.hitId;
    return hit;
}
#if 1
inline
#endif
// Render the tile [x0,x1) x [y0,y1) of a (width x height) image, with
// pixel coordinates scaled so the view matches a (baseWidth x baseHeight)
// image.  Hit distance goes to image[], hit primitive id to id[].
static void raytrace_tile(uniform int x0, uniform int x1,
                          uniform int y0, uniform int y1,
                          uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4],
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const uniform LinearBVHNode nodes[],
                          const uniform Triangle triangles[]) {
    const uniform float widthScale = (float)(baseWidth) / (float)(width);
    const uniform float heightScale = (float)(baseHeight) / (float)(height);
    // One ray per pixel; foreach_tiled groups nearby pixels into a gang.
    foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
        Ray ray;
        generateRay(raster2camera, camera2world, x*widthScale,
                    y*heightScale, ray);
        BVHIntersect(nodes, triangles, ray);
        int offset = y * width + x;
        image[offset] = ray.maxt;   // distance to closest hit (1e30 if none)
        id[offset] = ray.hitId;     // 0 if nothing was hit
    }
}
// Single-launch entry point: render the whole image as one big tile.
export void raytrace_ispc(uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4],
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const uniform LinearBVHNode nodes[],
                          const uniform Triangle triangles[]) {
    raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
}
// Task body: renders the 64x8 tile selected by taskIndex; edge tiles are
// clamped to the image size.
task void raytrace_tile_task(uniform int width, uniform int height,
                             uniform int baseWidth, uniform int baseHeight,
                             const uniform float raster2camera[4][4],
                             const uniform float camera2world[4][4],
                             uniform float image[], uniform int id[],
                             const uniform LinearBVHNode nodes[],
                             const uniform Triangle triangles[]) {
    const uniform int dx = 64, dy = 8; // must match dx, dy below
    const uniform int xBuckets = (width + (dx-1)) / dx;   // tiles per row
    const uniform int x0 = (taskIndex % xBuckets) * dx;
    const uniform int x1 = min(x0 + dx, width);
    const uniform int y0 = (taskIndex / xBuckets) * dy;
    const uniform int y1 = min(y0 + dy, height);
    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
}
// Task-parallel entry point: launches one task per 64x8 tile of the image.
export void raytrace_ispc_tasks(uniform int width, uniform int height,
                                uniform int baseWidth, uniform int baseHeight,
                                const uniform float raster2camera[4][4],
                                const uniform float camera2world[4][4],
                                uniform float image[], uniform int id[],
                                const uniform LinearBVHNode nodes[],
                                const uniform Triangle triangles[]) {
    const uniform int dx = 64, dy = 8;
    const uniform int xBuckets = (width + (dx-1)) / dx;
    const uniform int yBuckets = (height + (dy-1)) / dy;
    const uniform int nTasks = xBuckets * yBuckets;
    launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
                                      raster2camera, camera2world,
                                      image, id, nodes, triangles);
}

View File

@@ -0,0 +1 @@
../../rt/sponza.bvh

View File

@@ -0,0 +1 @@
../../rt/sponza.camera

View File

@@ -0,0 +1 @@
../../rt/teapot.bvh

View File

@@ -0,0 +1 @@
../../rt/teapot.camera

View File

@@ -0,0 +1,2 @@
mandelbrot
*.ppm

View File

@@ -0,0 +1,8 @@
EXAMPLE=volume
CPP_SRC=volume.cpp
ISPC_SRC=volume.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=volume
CXX_SRC=volume.cpp
ISPC_SRC=volume.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,13 @@
PROG=volume
ISPC_SRC=volume.ispc
CU_SRC=volume.cu
CXX_SRC=volume.cpp
PTXCC_REGMAX=64
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,11 @@
896 1184
0.000155 0.000000 0.000000 -0.069927
0.000000 -0.000155 0.000000 0.093236
0.000000 0.000000 0.000000 1.000000
0.000000 0.000000 -99.999001 100.000000
1.000000 0.000000 0.000000 1.000000
0.000000 0.980129 -0.198360 2.900000
0.000000 0.198360 0.980129 -10.500000
0.000000 0.000000 0.000000 1.000000

View File

@@ -0,0 +1 @@
../../volume_rendering/density_highres.vol

View File

@@ -0,0 +1 @@
../../volume_rendering/density_lowres.vol

View File

@@ -0,0 +1,183 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <algorithm>
#include <cstdio>
#include <cstdlib>

#include "timing.h"
#include "ispc_malloc.h"
#include "volume_ispc.h"
using namespace ispc;
/* Write the image as a binary PPM file: each float in buf is scaled by
   255, clamped to [0, 255], and replicated to the R, G, B channels. */
static void
writePPM(float *buf, int width, int height, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    if (!fp) {
        // Fixed: fopen was unchecked, so an unwritable path crashed on the
        // first fprintf(NULL, ...).  Matches loadCamera/loadVolume handling.
        perror(fn);
        exit(1);
    }
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    for (int i = 0; i < width*height; ++i) {
        float v = buf[i] * 255.f;
        if (v < 0.f) v = 0.f;
        else if (v > 255.f) v = 255.f;
        unsigned char c = (unsigned char)v;
        for (int j = 0; j < 3; ++j)   // grayscale: same byte for R, G, B
            fputc(c, fp);
    }
    fclose(fp);
    printf("Wrote image file %s\n", fn);
}
/* Load image and viewing parameters from a camera data file.
   FIXME: we should add support to be able to specify viewing parameters
   in the program here directly.
   File layout: "width height" followed by the 16 raster2camera values and
   then the 16 camera2world values, row-major.  Exits on any read error. */
static void
loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
           float camera2world[4][4]) {
    FILE *f = fopen(fn, "r");
    if (!f) {
        perror(fn);
        exit(1);
    }
    if (fscanf(f, "%d %d", width, height) != 2) {
        fprintf(stderr, "Unexpected end of file in camera file\n");
        exit(1);
    }
    // Read both matrices with one loop: 16 floats each, row-major.
    float *dest[2] = { &raster2camera[0][0], &camera2world[0][0] };
    for (int m = 0; m < 2; ++m) {
        for (int k = 0; k < 16; ++k) {
            if (fscanf(f, "%f", &dest[m][k]) != 1) {
                fprintf(stderr, "Unexpected end of file in camera file\n");
                exit(1);
            }
        }
    }
    fclose(f);
}
/* Load a volume density file.  Expects the number of x, y, and z samples
   as the first three values (as integer strings), then x*y*z
   floating-point values (also as strings) to give the densities.
   Returns a newly allocated array of n[0]*n[1]*n[2] floats (caller owns
   it); exits on any read error. */
static float *
loadVolume(const char *fn, int n[3]) {
    FILE *f = fopen(fn, "r");
    if (!f) {
        perror(fn);
        exit(1);
    }
    if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
        fprintf(stderr, "Couldn't find resolution at start of density file\n");
        exit(1);
    }
    int count = n[0] * n[1] * n[2];
    float *v = new float[count];
    for (int i = 0; i < count; ++i) {
        if (fscanf(f, "%f", &v[i]) != 1) {
            fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
            exit(1);
        }
    }
    fclose(f);   // fixed: the file handle was leaked (loadCamera closes its file)
    return v;
}
// Driver: load camera + density volume from the files named on the command
// line, time the ISPC/CUDA task-parallel renderer, and write the result as
// a PPM image.
int main(int argc, char *argv[]) {
    // Iteration counts for {ispc, ispc+tasks, serial}; only the tasks
    // variant is actually run below.
    static unsigned int test_iterations[] = {3, 7, 1};
    if (argc < 3) {
        fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol> [ispc iterations] [tasks iterations] [serial iterations]\n");
        return 1;
    }
    if (argc == 6) {
        // All three counts must be supplied together to override defaults.
        for (int i = 0; i < 3; i++) {
            test_iterations[i] = atoi(argv[3 + i]);
        }
    }
    //
    // Load viewing data and the volume density data
    //
    int width, height;
    // Matrices are allocated flat and reinterpreted as float[4][4]
    // (presumably so ispc_malloc-style allocation could be swapped in --
    // confirm against ispc_malloc.h).
    float *camera2world_ispc = new float[4*4];
    float *raster2camera_ispc = new float[4*4];
    float (*camera2world )[4] = (float (*)[4])camera2world_ispc;
    float (*raster2camera)[4] = (float (*)[4])raster2camera_ispc;
    loadCamera(argv[1], &width, &height, raster2camera, camera2world);
    float *image = new float[width*height];
    int *n = new int[3];
    float *density = loadVolume(argv[2], n);
    // Clear out the buffer
    for (int i = 0; i < width * height; ++i)
        image[i] = 0.;
    //
    // Compute the image using the ispc implementation that also uses
    // tasks; report the minimum time over the runs.
    //
    double minISPCtasks = 1e30;
    // NOTE(review): signed 'i' compared against unsigned iteration count.
    for (int i = 0; i < test_iterations[1]; ++i) {
        reset_and_start_timer();
        volume_ispc_tasks(density, n, raster2camera, camera2world,
                          width, height, image);
        double dt = get_elapsed_msec();
        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
        minISPCtasks = std::min(minISPCtasks, dt);
    }
    printf("[volume ispc + tasks]:\t\t[%.3f] msec\n", minISPCtasks);
    writePPM(image, width, height, "volume-ispc-tasks.ppm");
    return 0;
}

View File

@@ -0,0 +1,454 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cuda_helpers.cuh"
// Clamp v to the inclusive range [low, high].
__device__ static inline float clamp(float v, float low, float high)
{
    const float atLeastLow = max(v, low);
    return min(atLeastLow, high);
}
#define float3 Float3
// Minimal three-component float vector with componentwise arithmetic.
struct Float3
{
    float x,y,z;
    __device__ friend Float3 operator+(const Float3 a, const Float3 b)
    {
        Float3 r = { a.x + b.x, a.y + b.y, a.z + b.z };
        return r;
    }
    __device__ friend Float3 operator-(const Float3 a, const Float3 b)
    {
        Float3 r = { a.x - b.x, a.y - b.y, a.z - b.z };
        return r;
    }
    __device__ friend Float3 operator/(const Float3 a, const Float3 b)
    {
        Float3 r = { a.x / b.x, a.y / b.y, a.z / b.z };
        return r;
    }
    __device__ friend Float3 operator*(const Float3 a, const Float3 b)
    {
        Float3 r = { a.x * b.x, a.y * b.y, a.z * b.z };
        return r;
    }
    // Uniform scale by a scalar.
    __device__ friend Float3 operator*(const Float3 a, const float b)
    {
        Float3 r = { a.x * b, a.y * b, a.z * b };
        return r;
    }
};
struct Ray {
float3 origin, dir;
};
// Build the world-space camera ray for raster position (x, y).
__device__ static void
generateRay(const float raster2camera[4][4],
            const float camera2world[4][4],
            float x, float y, Ray &ray) {
    // transform raster coordinate (x, y, 0) to camera space
    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
    // NOTE(review): z/w use only the translation column -- assumes the
    // remaining [2][*]/[3][*] matrix terms are zero for these camera files.
    float camz = raster2camera[2][3];
    float camw = raster2camera[3][3];
    camx /= camw;   // homogeneous divide
    camy /= camw;
    camz /= camw;
    // Rotate the camera-space direction into world space.
    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
    // The camera position (translation column) is the ray origin.
    ray.origin.x = camera2world[0][3] / camera2world[3][3];
    ray.origin.y = camera2world[1][3] / camera2world[3][3];
    ray.origin.z = camera2world[2][3] / camera2world[3][3];
}
// True iff p lies within the closed box [pMin, pMax] on every axis.
__device__ static inline bool
Inside(float3 p, float3 pMin, float3 pMax) {
    const bool inX = p.x >= pMin.x && p.x <= pMax.x;
    const bool inY = p.y >= pMin.y && p.y <= pMax.y;
    const bool inZ = p.z >= pMin.z && p.z <= pMax.z;
    return inX && inY && inZ;
}
// Slab test: intersect 'ray' against the axis-aligned box [pMin, pMax].
// Returns true on overlap and stores the parametric entry/exit distances
// in hit0/hit1.  All three slabs are always evaluated (no early-out).
__device__ static bool
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
    float t0 = -1e30f, t1 = 1e30f;
    float3 tNear = (pMin - ray.origin) / ray.dir;   // per-axis plane distances
    float3 tFar = (pMax - ray.origin) / ray.dir;
    // Per axis: order the two plane hits, then shrink [t0,t1] to the overlap.
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
        tFar.x = tmp;
    }
    t0 = max(tNear.x, t0);
    t1 = min(tFar.x, t1);
    if (tNear.y > tFar.y) {
        float tmp = tNear.y;
        tNear.y = tFar.y;
        tFar.y = tmp;
    }
    t0 = max(tNear.y, t0);
    t1 = min(tFar.y, t1);
    if (tNear.z > tFar.z) {
        float tmp = tNear.z;
        tNear.z = tFar.z;
        tFar.z = tmp;
    }
    t0 = max(tNear.z, t0);
    t1 = min(tFar.z, t1);
    if (t0 <= t1) {
        hit0 = t0;
        hit1 = t1;
        return true;
    }
    else
        return false;
}
// Linear interpolation: returns a at t == 0 and b at t == 1.
__device__ static inline float Lerp(float t, float a, float b) {
    const float weightA = 1.f - t;
    return weightA * a + t * b;
}
// Density sample at voxel (x, y, z); coordinates are clamped to the grid,
// so out-of-range lookups return the nearest edge voxel.
__device__ static inline float D(int x, int y, int z, int nVoxels[3],
                                 float density[]) {
    const int cx = clamp(x, 0, nVoxels[0]-1);
    const int cy = clamp(y, 0, nVoxels[1]-1);
    const int cz = clamp(z, 0, nVoxels[2]-1);
    return density[cz*nVoxels[0]*nVoxels[1] + cy*nVoxels[0] + cx];
}
// Position of p relative to the box [pMin, pMax], normalized per axis
// (components in [0, 1] when p is inside the box).
__device__ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
    const float3 rel = p - pMin;
    const float3 extent = pMax - pMin;
    return rel / extent;
}
// Trilinearly interpolated density at point Pobj; zero outside the box.
__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
                                       float density[], int nVoxels[3]) {
    if (!Inside(Pobj, pMin, pMax))
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
    float3 vox = Offset(Pobj, pMin, pMax);
    vox.x = vox.x * nVoxels[0] - .5f;   // continuous voxel-space coordinates
    vox.y = vox.y * nVoxels[1] - .5f;
    vox.z = vox.z * nVoxels[2] - .5f;
    int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
    // Fractional position within the 8-voxel cell.
    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
    // Trilinearly interpolate density values to compute local density:
    // first along x for the four cell edges, then y, then z.
    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
                     D(vx+1, vy, vz, nVoxels, density));
    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
                     D(vx+1, vy+1, vz, nVoxels, density));
    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
                     D(vx+1, vy, vz+1, nVoxels, density));
    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
                     D(vx+1, vy+1, vz+1, nVoxels, density));
    float d0 = Lerp(dy, d00, d10);
    float d1 = Lerp(dy, d01, d11);
    return Lerp(dz, d0, d1);
}
/* Returns the transmittance between two points p0 and p1, in a volume
   with extent (pMin,pMax) with transmittance coefficient sigma_t,
   defined by nVoxels[3] voxels in each dimension in the given density
   array. */
__device__ static inline float
transmittance(float3 p0, float3 p1, float3 pMin,
              float3 pMax, float sigma_t,
              float density[], int nVoxels[3]) {
    float rayT0, rayT1;
    Ray ray;
    ray.origin = p1;
    ray.dir = p0 - p1;   // unnormalized, so t in [0,1] spans p1 -> p0
    // Find the parametric t range along the ray that is inside the volume.
    if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
        return 1.f;      // segment misses the volume: fully transparent
    rayT0 = max(rayT0, 0.f);   // don't march behind the start point
    // Accumulate beam transmittance in tau
    float tau = 0.0f;
    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                           ray.dir.z * ray.dir.z);
    float stepDist = 0.2f;           // world-space march step (coarse)
    float stepT = stepDist / rayLength;
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
    while (t < rayT1) {
        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
        pos = pos + dirStep;
        t += stepT;
    }
    return exp(-tau);   // Beer-Lambert attenuation
}
// Squared Euclidean distance between a and b (avoids a sqrt).
__device__ static inline float
distanceSquared(float3 a, float3 b) {
    const float3 diff = a - b;
    const float xx = diff.x * diff.x;
    const float yy = diff.y * diff.y;
    const float zz = diff.z * diff.z;
    return xx + yy + zz;
}
// March a camera ray through the volume, accumulating emitted plus
// single-scattered light; returns gamma-corrected radiance (0 on a miss).
__device__ static inline float
raymarch(float density[], int nVoxels[3], Ray ray) {
    float rayT0, rayT1;
    // Hard-coded volume bounds and single point-light position.
    float3 pMin = {.3f, -.2f, .3f}, pMax = {1.8f, 2.3f, 1.8f};
    float3 lightPos = { -1.f, 4., 1.5f };
    if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
        return 0.f;
    rayT0 = max(rayT0, 0.f);   // start no earlier than the ray origin
    // Parameters that define the volume scattering characteristics and
    // sampling rate for raymarching
    float Le = .25f; // Emission coefficient
    float sigma_a = 10.f; // Absorption coefficient
    float sigma_s = 10.f; // Scattering coefficient
    float stepDist = 0.025f; // Ray step amount
    float lightIntensity = 40.0f; // Light source intensity
    float tau = 0.f; // accumulated beam transmittance
    float L = 0.f; // radiance along the ray
    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                           ray.dir.z * ray.dir.z);
    float stepT = stepDist / rayLength;   // step in parametric t units
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
    while (t < rayT1)
    {
        float d = Density(pos, pMin, pMax, density, nVoxels);
        // terminate once attenuation is high
        float atten = exp(-tau);
        if (atten < .005f)
            break;
        // direct lighting: inverse-square falloff shadowed by the volume
        float Li = lightIntensity / distanceSquared(lightPos, pos) *
            transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
                          density, nVoxels);
        L += stepDist * atten * d * sigma_s * (Li + Le);
        // update beam transmittance
        tau += stepDist * (sigma_a + sigma_s) * d;
        pos = pos + dirStep;
        t += stepT;
    }
    // Gamma correction
    return pow(L, 1.f / 2.2f);
}
/* Utility routine used by both the task-based and the single-core entrypoints.
   Renders a tile of the image, covering [x0,x1) * [y0, y1), storing the
   result into the image[] array.
*/
__device__ static void
volume_tile(int x0, int y0, int x1,
            int y1, float density[], int nVoxels[3],
            const float raster2camera[4][4],
            const float camera2world[4][4],
            int width, int height, float image[]) {
    // Work on 8x8 = 64 pixel tiles, processed programCount pixels at a
    // time.  xblock/yblock split the tile into four 4x4 sub-blocks;
    // xoffsets/yoffsets order the 16 pixels within a sub-block.
    for (int y = y0; y < y1; y += 8) {
        for (int x = x0; x < x1; x += 8) {
            for (int ob = 0; ob < 64; ob += programCount)
            {
                // This lane's pixel index within the 8x8 tile.
                const int o = ob + programIndex;
                const int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
                                           0, 1, 0, 1, 2, 3, 2, 3 };
                const int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
                                           2, 2, 3, 3, 2, 2, 3, 3 };
                const int xblock[4] = {0, 4, 0, 4};
                const int yblock[4] = {0, 0, 4, 4};
                // Figure out the pixel to render for this program instance
                const int xo = x + xblock[o/16] + xoffsets[o&15];
                const int yo = y + yblock[o/16] + yoffsets[o&15];
                // Use viewing parameters to compute the corresponding ray
                // for the pixel
                Ray ray;
                generateRay(raster2camera, camera2world, xo, yo, ray);
                // And raymarch through the volume to compute the pixel's
                // value; the bounds check skips pixels past the tile edge.
                int offset = yo * width + xo;
                if (xo < x1 && yo < y1)
                    image[offset] = raymarch(density, nVoxels, ray);
            }
        }
    }
}
// Task body: map taskIndex onto one 8x8 tile of the image and render it.
__global__ void
volume_task(float density[], int _nVoxels[3],
            const float _raster2camera[4][4],
            const float _camera2world[4][4],
            int width, int height, float image[]) {
    // Guard against launches rounded up past the requested task count.
    if (taskIndex0 >= taskCount0) return;
#if 0
    // Disabled: copy the parameter arrays element-by-element into locals.
    int nVoxels[3];
    nVoxels[0] = _nVoxels[0];
    nVoxels[1] = _nVoxels[1];
    nVoxels[2] = _nVoxels[2];
    float raster2camera[4][4];
    raster2camera[0][0] = _raster2camera[0][0];
    raster2camera[0][1] = _raster2camera[0][1];
    raster2camera[0][2] = _raster2camera[0][2];
    raster2camera[0][3] = _raster2camera[0][3];
    raster2camera[1][0] = _raster2camera[1][0];
    raster2camera[1][1] = _raster2camera[1][1];
    raster2camera[1][2] = _raster2camera[1][2];
    raster2camera[1][3] = _raster2camera[1][3];
    raster2camera[2][0] = _raster2camera[2][0];
    raster2camera[2][1] = _raster2camera[2][1];
    raster2camera[2][2] = _raster2camera[2][2];
    raster2camera[2][3] = _raster2camera[2][3];
    raster2camera[3][0] = _raster2camera[3][0];
    raster2camera[3][1] = _raster2camera[3][1];
    raster2camera[3][2] = _raster2camera[3][2];
    raster2camera[3][3] = _raster2camera[3][3];
    float camera2world[4][4];
    camera2world[0][0] = _camera2world[0][0];
    camera2world[0][1] = _camera2world[0][1];
    camera2world[0][2] = _camera2world[0][2];
    camera2world[0][3] = _camera2world[0][3];
    camera2world[1][0] = _camera2world[1][0];
    camera2world[1][1] = _camera2world[1][1];
    camera2world[1][2] = _camera2world[1][2];
    camera2world[1][3] = _camera2world[1][3];
    camera2world[2][0] = _camera2world[2][0];
    camera2world[2][1] = _camera2world[2][1];
    camera2world[2][2] = _camera2world[2][2];
    camera2world[2][3] = _camera2world[2][3];
    camera2world[3][0] = _camera2world[3][0];
    camera2world[3][1] = _camera2world[3][1];
    camera2world[3][2] = _camera2world[3][2];
    camera2world[3][3] = _camera2world[3][3];
#else
    // Use the parameter arrays directly via aliases.
#define nVoxels _nVoxels
#define raster2camera _raster2camera
#define camera2world _camera2world
#endif
    int dx = 8, dy = 8; // must match value in volume_ispc_tasks
    int xbuckets = (width + (dx-1)) / dx;
    int ybuckets = (height + (dy-1)) / dy;   // NOTE(review): computed but unused
    int x0 = (taskIndex % xbuckets) * dx;
    int y0 = (taskIndex / xbuckets) * dy;
    int x1 = x0 + dx, y1 = y0 + dy;
    // Clamp the last row/column of tiles to the image size.
    x1 = min(x1, width);
    y1 = min(y1, height);
    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
                camera2world, width, height, image);
}
// Device-side launcher: spawn one volume_task per 8x8 tile of the image
// and wait for them all to complete.
extern "C"
__global__ void
volume_ispc_tasks___export( float density[], int nVoxels[3],
                            const float raster2camera[4][4],
                            const float camera2world[4][4],
                            int width, int height, float image[]) {
    // Launch tasks to work on (dx,dy)-sized tiles of the image
    int dx = 8, dy = 8;
    int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
    launch(nTasks,1,1,volume_task)
        (density, nVoxels, raster2camera, camera2world,
         width, height, image);
    cudaDeviceSynchronize();
}
// Host entry point: run the device-side launcher with one 32-thread block,
// then block until the device (and all tasks it spawned) is idle.
extern "C"
__host__ void
volume_ispc_tasks( float density[], int nVoxels[3],
                   const float raster2camera[4][4],
                   const float camera2world[4][4],
                   int width, int height, float image[]) {
    volume_ispc_tasks___export<<<1,32>>>(density, nVoxels, raster2camera, camera2world, width, height,image);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,413 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// 3-wide ISPC short-vector type used for points, directions, and offsets.
typedef float<3> float3;
// A ray with an origin point and a (not necessarily normalized) direction.
struct Ray {
float3 origin, dir;
};
// Build the camera ray for raster-space pixel (x, y) and store it in `ray`.
// raster2camera maps raster space to camera space; camera2world maps camera
// space to world space (both 4x4 homogeneous matrices).
static inline void
generateRay(const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
float x, float y, Ray &ray) {
// transform raster coordinate (x, y, 0) to camera space
// Only columns 0, 1 and 3 contribute — consistent with a raster-space
// point whose z component is 0 (column 2 would multiply z).
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
float camz = raster2camera[2][3];
float camw = raster2camera[3][3];
// Homogeneous divide to get the camera-space point.
camx /= camw;
camy /= camw;
camz /= camw;
// Rotate the camera-space point into the world-space ray direction
// (translation column is deliberately excluded for a direction).
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
// The ray origin is the camera position: the translation column of
// camera2world after the homogeneous divide.
ray.origin.x = camera2world[0][3] / camera2world[3][3];
ray.origin.y = camera2world[1][3] / camera2world[3][3];
ray.origin.z = camera2world[2][3] / camera2world[3][3];
}
// True when point p lies within the closed axis-aligned box [pMin, pMax]
// on every axis.
static inline bool
Inside(float3 p, float3 pMin, float3 pMax) {
bool inX = (p.x >= pMin.x) && (p.x <= pMax.x);
bool inY = (p.y >= pMin.y) && (p.y <= pMax.y);
bool inZ = (p.z >= pMin.z) && (p.z <= pMax.z);
return inX && inY && inZ;
}
// Ray/axis-aligned-box intersection (per-axis slab test). On a hit,
// returns true and stores the parametric entry/exit distances in
// hit0/hit1 (hit0 <= hit1); returns false (outputs untouched) on a miss.
static inline bool
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
// Running parametric interval; starts effectively unbounded.
float t0 = -1e30, t1 = 1e30;
// Per-axis distances to the two slab planes. NOTE(review): relies on
// IEEE inf semantics when a ray.dir component is 0 — confirm the target
// preserves that.
float3 tNear = (pMin - ray.origin) / ray.dir;
float3 tFar = (pMax - ray.origin) / ray.dir;
// For each axis: order the pair so tNear <= tFar, then intersect the
// axis interval with the running [t0, t1] interval.
if (tNear.x > tFar.x) {
float tmp = tNear.x;
tNear.x = tFar.x;
tFar.x = tmp;
}
t0 = max(tNear.x, t0);
t1 = min(tFar.x, t1);
if (tNear.y > tFar.y) {
float tmp = tNear.y;
tNear.y = tFar.y;
tFar.y = tmp;
}
t0 = max(tNear.y, t0);
t1 = min(tFar.y, t1);
if (tNear.z > tFar.z) {
float tmp = tNear.z;
tNear.z = tFar.z;
tFar.z = tmp;
}
t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1);
// Non-empty interval means the ray crosses all three slabs at once.
if (t0 <= t1) {
hit0 = t0;
hit1 = t1;
return true;
}
else
return false;
}
// Linear interpolation: returns a at t == 0 and b at t == 1.
static inline float Lerp(float t, float a, float b) {
return a * (1.f - t) + b * t;
}
// Fetch the density at integer voxel coordinate (x, y, z), clamping the
// coordinate to the grid bounds (clamp-to-edge addressing).
static inline float D(int x, int y, int z, uniform int nVoxels[3],
uniform float density[]) {
const int xc = clamp(x, 0, nVoxels[0]-1);
const int yc = clamp(y, 0, nVoxels[1]-1);
const int zc = clamp(z, 0, nVoxels[2]-1);
// Row-major layout: z-major, then y, then x.
const int index = (zc * nVoxels[1] + yc) * nVoxels[0] + xc;
return density[index];
}
// Normalized [0,1]^3 position of p within the box [pMin, pMax].
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
float3 rel = p - pMin;
float3 extent = pMax - pMin;
return rel / extent;
}
// Trilinearly-interpolated density of the volume at world-space point Pobj.
// Returns 0 for points outside the box [pMin, pMax].
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax))
return 0;
// Compute voxel coordinates and offsets for _Pobj_
float3 vox = Offset(Pobj, pMin, pMax);
// Shift by half a voxel so samples sit at voxel centers.
vox.x = vox.x * nVoxels[0] - .5f;
vox.y = vox.y * nVoxels[1] - .5f;
vox.z = vox.z * nVoxels[2] - .5f;
// Integer base voxel and the fractional offsets within its cell.
int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density
// (lerp along x on the cell's four edges, then along y, then z).
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density));
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density));
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density));
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density));
float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11);
return Lerp(dz, d0, d1);
}
/* Returns the transmittance between two points p0 and p1, in a volume
with extent (pMin,pMax) with transmittance coefficient sigma_t,
defined by nVoxels[3] voxels in each dimension in the given density
array. Returns 1 (fully transparent) if the segment misses the volume. */
static inline float
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
uniform float3 pMax, uniform float sigma_t,
uniform float density[], uniform int nVoxels[3]) {
float rayT0, rayT1;
// March from p1 toward p0: dir has length |p0 - p1|, so t in [0,1]
// covers the whole segment.
Ray ray;
ray.origin = p1;
ray.dir = p0 - p1;
// Find the parametric t range along the ray that is inside the volume.
if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
return 1.;
// Clamp the entry to the segment start (don't march behind p1).
rayT0 = max(rayT0, 0.f);
// Accumulate beam transmittance in tau
float tau = 0;
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
ray.dir.z * ray.dir.z);
// Fixed world-space step, converted to a parametric step below.
const uniform float stepDist = 0.2;
float stepT = stepDist / rayLength;
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
while (t < rayT1) {
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
pos = pos + dirStep;
t += stepT;
}
// Beer-Lambert attenuation from the accumulated optical depth.
return exp(-tau);
}
// Squared Euclidean distance between points a and b.
static inline float
distanceSquared(float3 a, float3 b) {
const float dx = a.x - b.x;
const float dy = a.y - b.y;
const float dz = a.z - b.z;
return dx*dx + dy*dy + dz*dz;
}
// March the given camera ray through the volume, accumulating emitted and
// in-scattered radiance attenuated by the medium, and return the
// gamma-corrected pixel value (0 if the ray misses the volume).
static inline float
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float rayT0, rayT1;
// Hard-coded volume bounds and point-light position for this scene.
const uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
const uniform float3 lightPos = { -1, 4, 1.5 };
if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
return 0.;
// Start marching no earlier than the ray origin.
rayT0 = max(rayT0, 0.f);
// Parameters that define the volume scattering characteristics and
// sampling rate for raymarching
const uniform float Le = .25; // Emission coefficient
const uniform float sigma_a = 10; // Absorption coefficient
const uniform float sigma_s = 10; // Scattering coefficient
const uniform float stepDist = 0.025; // Ray step amount
const uniform float lightIntensity = 40; // Light source intensity
float tau = 0.f; // accumulated beam transmittance
float L = 0; // radiance along the ray
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
ray.dir.z * ray.dir.z);
// Convert the world-space step to a parametric step along this ray.
float stepT = stepDist / rayLength;
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
while (t < rayT1)
{
float d = Density(pos, pMin, pMax, density, nVoxels);
// terminate once attenuation is high
float atten = exp(-tau);
if (atten < .005)
break;
// direct lighting
// Inverse-square falloff from the light, attenuated by the medium
// between the light and the sample point.
float Li = lightIntensity / distanceSquared(lightPos, pos) *
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
density, nVoxels);
L += stepDist * atten * d * sigma_s * (Li + Le);
// update beam transmittance
tau += stepDist * (sigma_a + sigma_s) * d;
pos = pos + dirStep;
t += stepT;
}
// Gamma correction
return pow(L, 1.f / 2.2f);
}
/* Utility routine used by both the task-based and the single-core entrypoints.
Renders a tile of the image, covering [x0, x1) * [y0, y1), storing the
result into the image[] array.
*/
static inline void
volume_tile(uniform int x0, uniform int y0, uniform int x1,
uniform int y1, uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
// The disabled path below works on 8x8 = 64 pixel blocks of the image,
// mapping one program instance to each pixel of a block; the active path
// simply lets foreach_tiled partition the tile. (The original comment
// claiming 4x4 blocks evenly divisible by 4 described an older variant.)
#if 0
for (uniform int y = y0; y < y1; y += 8)
for (uniform int x = x0; x < x1; x += 8)
foreach (o = 0 ... 64)
{
// These two arrays encode the mapping from [0,15] to
// offsets within the 4x4 pixel block so that we render
// each pixel inside the block
const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
0, 1, 0, 1, 2, 3, 2, 3 };
const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
2, 2, 3, 3, 2, 2, 3, 3 };
const uniform int xblock[4] = {0, 4, 0, 4};
const uniform int yblock[4] = {0, 0, 4, 4};
// Figure out the pixel to render for this program instance
const int xo = x + xblock[o/16] + xoffsets[o&15];
const int yo = y + yblock[o/16] + yoffsets[o&15];
// Use viewing parameters to compute the corresponding ray
// for the pixel
Ray ray;
generateRay(raster2camera, camera2world, xo, yo, ray);
// And raymarch through the volume to compute the pixel's
// value
int offset = yo * width + xo;
// Guard against pixels past the tile edge when the tile size is
// not a multiple of 8.
if (xo < x1 && yo < y1)
image[offset] = raymarch(density, nVoxels, ray);
}
#else
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
{
// Use viewing parameters to compute the corresponding ray
// for the pixel
Ray ray;
generateRay(raster2camera, camera2world, x, y, ray);
// And raymarch through the volume to compute the pixel's
// value
int offset = y * width + x;
image[offset] = raymarch(density, nVoxels, ray);
}
#endif
}
// One task: render a single (dx x dy) tile of the image, selected by
// taskIndex. Extra tasks beyond the tile count return immediately.
task void
volume_task(uniform float density[], uniform int _nVoxels[3],
const uniform float _raster2camera[4][4],
const uniform float _camera2world[4][4],
uniform int width, uniform int height, uniform float image[])
{
if (taskIndex >= taskCount) return;
#if 1 /* cannot pass shared memory pointers to functions, need to find a way to solve this one :S */
// Copy the small parameter arrays element-by-element into local uniform
// storage so that plain pointers can be passed to volume_tile (per the
// comment above, the incoming pointers cannot be handed on directly).
uniform int nVoxels[3];
nVoxels[0] = _nVoxels[0];
nVoxels[1] = _nVoxels[1];
nVoxels[2] = _nVoxels[2];
uniform float raster2camera[4][4];
raster2camera[0][0] = _raster2camera[0][0];
raster2camera[0][1] = _raster2camera[0][1];
raster2camera[0][2] = _raster2camera[0][2];
raster2camera[0][3] = _raster2camera[0][3];
raster2camera[1][0] = _raster2camera[1][0];
raster2camera[1][1] = _raster2camera[1][1];
raster2camera[1][2] = _raster2camera[1][2];
raster2camera[1][3] = _raster2camera[1][3];
raster2camera[2][0] = _raster2camera[2][0];
raster2camera[2][1] = _raster2camera[2][1];
raster2camera[2][2] = _raster2camera[2][2];
raster2camera[2][3] = _raster2camera[2][3];
raster2camera[3][0] = _raster2camera[3][0];
raster2camera[3][1] = _raster2camera[3][1];
raster2camera[3][2] = _raster2camera[3][2];
raster2camera[3][3] = _raster2camera[3][3];
uniform float camera2world[4][4];
camera2world[0][0] = _camera2world[0][0];
camera2world[0][1] = _camera2world[0][1];
camera2world[0][2] = _camera2world[0][2];
camera2world[0][3] = _camera2world[0][3];
camera2world[1][0] = _camera2world[1][0];
camera2world[1][1] = _camera2world[1][1];
camera2world[1][2] = _camera2world[1][2];
camera2world[1][3] = _camera2world[1][3];
camera2world[2][0] = _camera2world[2][0];
camera2world[2][1] = _camera2world[2][1];
camera2world[2][2] = _camera2world[2][2];
camera2world[2][3] = _camera2world[2][3];
camera2world[3][0] = _camera2world[3][0];
camera2world[3][1] = _camera2world[3][1];
camera2world[3][2] = _camera2world[3][2];
camera2world[3][3] = _camera2world[3][3];
#else
// Alternative: alias the parameters directly (disabled, see above).
#define nVoxels _nVoxels
#define raster2camera _raster2camera
#define camera2world _camera2world
#endif
const uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks
// Map the linear taskIndex onto a 2D grid of tiles, clamping the last
// row/column of tiles to the image bounds.
const uniform int xbuckets = (width + (dx-1)) / dx;
const uniform int ybuckets = (height + (dy-1)) / dy;
const uniform int x0 = (taskIndex % xbuckets) * dx;
const uniform int y0 = (taskIndex / xbuckets) * dy;
const uniform int x1 = min(x0 + dx, width);
const uniform int y1 = min(y0 + dy, height);
volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
camera2world, width, height, image);
}
// Single-threaded entry point: render the entire image as one tile,
// without launching tasks.
export void
volume_ispc(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
camera2world, width, height, image);
}
// Task-parallel entry point: partition the image into (dx x dy) tiles,
// launch one task per tile, and wait for all of them to finish.
export void
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
// Launch tasks to work on (dx,dy)-sized tiles of the image
const uniform int dx = 8, dy = 8;
// Round up so partial tiles at the right/bottom edges are covered.
const uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
width, height, image);
// Block until every launched task has completed.
sync;
}

View File

@@ -37,6 +37,7 @@
#include <stdlib.h>
#include <algorithm>
#include <iostream>
#include <cassert>
#include <iomanip>
#include "../timing.h"
#include "sort_ispc.h"
@@ -45,26 +46,28 @@ using namespace ispc;
extern void sort_serial (int n, unsigned int code[], int order[]);
/* progress bar by Ross Hemsley;
* http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
static void progressBar(const int x, const int n, const int width = 50)
{
if (n < 100)
{
x *= 100/n;
n = 100;
}
assert(n > 1);
assert(x >= 0 && x < n);
assert(width > 10);
const float f = static_cast<float>(x)/(n-1);
const int w = static_cast<int>(f * width);
if ((x != n) && (x % (n/100) != 0)) return;
// print bar
std::string bstr("[");
for (int i = 0; i < width; i++)
bstr += i < w ? '=' : ' ';
bstr += "]";
using namespace std;
float ratio = x/(float)n;
int c = ratio * w;
// print percentage
char pstr0[32];
sprintf(pstr0, " %2d %c ", static_cast<int>(f*100.0),'%');
const std::string pstr(pstr0);
std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2));
cout << setw(3) << (int)(ratio*100) << "% [";
for (int x=0; x<c; x++) cout << "=";
for (int x=c; x<w; x++) cout << " ";
cout << "]\r" << flush;
std::cout << bstr;
std::cout << (x == n-1 ? "\n" : "\r") << std::flush;
}
int main (int argc, char *argv[])
@@ -87,7 +90,7 @@ int main (int argc, char *argv[])
tISPC1 += get_elapsed_mcycles();
if (argc != 3)
progressbar (i, m);
progressBar (i, m);
}
printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1);
@@ -105,7 +108,7 @@ int main (int argc, char *argv[])
tISPC2 += get_elapsed_mcycles();
if (argc != 3)
progressbar (i, m);
progressBar (i, m);
}
printf("[sort ispc + tasks]:\t[%.3f] million cycles\n", tISPC2);
@@ -123,7 +126,7 @@ int main (int argc, char *argv[])
tSerial += get_elapsed_mcycles();
if (argc != 3)
progressbar (i, m);
progressBar (i, m);
}
printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial);

View File

@@ -960,17 +960,22 @@ InitTaskSystem() {
inline void
TaskGroup::Launch(int baseIndex, int count) {
#pragma omp parallel for
for(int i = 0; i < count; i++) {
#pragma omp parallel
{
const int threadIndex = omp_get_thread_num();
const int threadCount = omp_get_num_threads();
#pragma omp for schedule(runtime)
for(int i = 0; i < count; i++)
{
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// Actually run the task.
int threadIndex = omp_get_thread_num();
int threadCount = omp_get_num_threads();
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
}
}
}
inline void

View File

@@ -58,6 +58,7 @@ __inline__ uint64_t rdtsc() {
#ifdef WIN32
#include <windows.h>
double rtc();
#define rdtsc __rdtsc
#else // WIN32
__inline__ uint64_t rdtsc() {
@@ -72,14 +73,30 @@ __inline__ uint64_t rdtsc() {
__asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
return (uint64_t)high << 32 | low;
}
#include <sys/time.h>
static inline double rtc(void)
{
struct timeval Tvalue;
double etime;
struct timezone dummy;
gettimeofday(&Tvalue,&dummy);
etime = (double) Tvalue.tv_sec +
1.e-6*((double) Tvalue.tv_usec);
return etime;
}
#endif // !WIN32
#endif // !__arm__
static uint64_t start, end;
static uint64_t start, end;
static double tstart, tend;
// Record the current TSC value and wall-clock time in the file-scope
// `start`/`tstart` globals; the matching get_elapsed_* functions measure
// from this point.
static inline void reset_and_start_timer()
{
start = rdtsc();
tstart = rtc();
}
/* Returns the number of millions of elapsed processor cycles since the
@@ -89,3 +106,9 @@ static inline double get_elapsed_mcycles()
end = rdtsc();
return (end-start) / (1024. * 1024.);
}
// Milliseconds of wall-clock time since the last reset_and_start_timer()
// call (reads the file-scope `tstart`, updates `tend`).
static inline double get_elapsed_msec()
{
tend = rtc();
return (tend - tstart)*1e3;
}

View File

@@ -0,0 +1,58 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
// Compatibility macros that map ISPC's execution-model identifiers onto
// CUDA built-ins: a "program" is one thread of a 32-thread warp
// (the &31 / >>5 masks below assume a 32-wide warp).
#define programCount 32
#define programIndex (threadIdx.x & 31)
// Task indices: each thread block holds 4 warps (128 threads), and each
// warp is one task, so taskIndex0 interleaves block and warp indices.
#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskCount0 (gridDim.x*4)
#define taskIndex1 (blockIdx.y)
#define taskCount1 (gridDim.y)
#define taskIndex2 (blockIdx.z)
#define taskCount2 (gridDim.z)
// Flattened linear task index/count over the 3D task grid.
#define taskIndex (taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2))
#define taskCount (taskCount0*taskCount1*taskCount2)
#define warpIdx (threadIdx.x >> 5)
// Launch ntx*nty*ntz tasks: 4 warps per 128-thread block, so the x grid
// dimension is rounded up by 4. Only lane 0 issues the child launch.
#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
#define sync cudaDeviceSynchronize()
#define cif if
// Double-precision warp shuffle built from two 32-bit shuffles of the
// high and low words.
// NOTE(review): this overload uses __shfl_xor (butterfly exchange with a
// lane mask) while CUDA's builtin __shfl(x, lane) for int/float reads
// from an absolute source lane — so shuffle()/broadcast() below have
// different semantics for double than for other types. Confirm intended.
__device__ __forceinline__ static double __shfl(double x, int lane)
{
return __hiloint2double(
__shfl_xor(__double2hiint(x), lane),
__shfl_xor(__double2loint(x), lane));
}
#define shuffle(x,y) __shfl(x,y)
#define broadcast(x,y) __shfl(x,y)

View File

@@ -0,0 +1,87 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cstring>
#include "ispc_malloc.h"
#ifdef _CUDA_
// Global replacement operator new that routes all allocations through
// ispc_malloc (CUDA managed memory in the _CUDA_ build).
// A throwing operator new must never return NULL — that is undefined
// behavior — so report allocation failure by throwing std::bad_alloc.
void * operator new(size_t size) throw(std::bad_alloc)
{
  void *ptr = NULL;           // ispc_malloc may leave it untouched on failure
  ispc_malloc(&ptr, size);
  if (ptr == NULL)
    throw std::bad_alloc();
  return ptr;
}
// Matching global replacement operator delete: releases memory obtained
// from the operator new above via ispc_free. Must not throw.
void operator delete(void *ptr) throw()
{
ispc_free(ptr);
}
#else
// CPU fallback: allocate `size` bytes with plain malloc and return the
// block through *ptr (NULL on failure, as malloc reports it).
void ispc_malloc(void **ptr, const size_t size)
{
  void *block = malloc(size);
  *ptr = block;
}
// CPU fallback: release a block previously returned by ispc_malloc.
// Passing NULL is a no-op, matching free()'s contract.
void ispc_free(void *ptr)
{
  if (ptr != NULL)
    free(ptr);
}
// CPU fallback: fill `size` bytes at ptr with `value` (plain memset).
void ispc_memset(void *ptr, int value, size_t size)
{
memset(ptr, value, size);
}
// CPU fallback: heap/stack limits only apply to the CUDA device runtime,
// so both setters are deliberate no-ops here (`value` is ignored).
void ispcSetMallocHeapLimit(size_t value)
{
}
void ispcSetStackLimit(size_t value)
{
}
// CPU fallback: no heap/stack limit is tracked, so both getters report
// the all-ones sentinel (equal to (unsigned long long)-1).
unsigned long long ispcGetMallocHeapLimit()
{
  return ~0ull;
}
unsigned long long ispcGetStackLimit()
{
  return ~0ull;
}
// CPU fallback: copy `num` bytes from src to dest and return dest,
// mirroring memcpy's own contract.
void * ispcMemcpy(void *dest, void *src, size_t num)
{
  return memcpy(dest, src, num);
}
#endif

View File

@@ -0,0 +1,43 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
// Allocation shims shared by the CPU and CUDA example builds; the CUDA
// build maps these onto managed-memory / device-limit APIs, the CPU
// build onto plain libc.
// Allocate `size` bytes; the block is returned through *ptr.
extern void ispc_malloc(void **ptr, const size_t size);
// Release a block previously obtained from ispc_malloc.
extern void ispc_free(void *ptr);
// Fill `size` bytes at ptr with `value`.
extern void ispc_memset(void *ptr, int value, size_t size);
// Set the allocator heap / stack size limits (no-ops on the CPU build).
extern void ispcSetMallocHeapLimit(size_t value);
extern void ispcSetStackLimit(size_t value);
// Query the current limits; the CPU build returns an all-ones sentinel.
extern unsigned long long ispcGetMallocHeapLimit();
extern unsigned long long ispcGetStackLimit();
// Copy `num` bytes from src to dest; returns dest.
extern void * ispcMemcpy(void *dest, void *src, size_t num);

View File

@@ -0,0 +1,76 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _CUDA_
#error "Something went wrong..."
#endif
// CUDA build: allocate unified (managed) memory reachable from both the
// host and the device.
void ispc_malloc(void **ptr, const size_t size)
{
cudaMallocManaged(ptr, size);
}
// Release memory obtained from ispc_malloc.
void ispc_free(void *ptr)
{
cudaFree(ptr);
}
// Fill `size` bytes of device-visible memory with `value`.
void ispc_memset(void *ptr, int value, size_t size)
{
cudaMemset(ptr, value, size);
}
// Set the device-side malloc heap size limit.
void ispcSetMallocHeapLimit(size_t value)
{
cudaDeviceSetLimit(cudaLimitMallocHeapSize,value);
}
// Set the per-thread device stack size limit.
void ispcSetStackLimit(size_t value)
{
cudaDeviceSetLimit(cudaLimitStackSize,value);
}
// Query the device-side malloc heap limit.
// Note: the size_t result is returned as unsigned long long.
unsigned long long ispcGetMallocHeapLimit()
{
size_t value;
cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize);
return value;
}
// Query the per-thread device stack limit.
unsigned long long ispcGetStackLimit()
{
size_t value;
cudaDeviceGetLimit(&value, cudaLimitStackSize);
return value;
}
// Copy `num` bytes; cudaMemcpyDefault lets the runtime infer the
// direction from the pointer values (requires unified virtual addressing).
void * ispcMemcpy(void *dest, void *src, size_t num)
{
cudaMemcpy(dest, src, num, cudaMemcpyDefault);
return dest;
}

View File

@@ -7872,6 +7872,14 @@ SizeOfExpr::TypeCheck() {
"struct type \"%s\".", type->GetString().c_str());
return NULL;
}
#ifdef ISPC_NVPTX_ENABLED
if (type != NULL)
if (g->target->getISA() == Target::NVPTX && type->IsVaryingType())
{
Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target.");
return NULL;
}
#endif /* ISPC_NVPTX_ENABLED */
return this;
}
@@ -8704,6 +8712,13 @@ NewExpr::TypeCheck() {
AssertPos(pos, m->errorCount > 0);
return NULL;
}
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType())
{
Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target.");
return NULL;
}
#endif /* ISPC_NVPTX_ENABLED */
if (CastType<UndefinedStructType>(allocType) != NULL) {
Error(pos, "Can't dynamically allocate storage for declared "
"but not defined type \"%s\".", allocType->GetString().c_str());

View File

@@ -47,6 +47,9 @@
#include <stdio.h>
#if defined(LLVM_3_2)
#ifdef ISPC_NVPTX_ENABLED
#include <llvm/Metadata.h>
#endif /* ISPC_NVPTX_ENABLED */
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
@@ -54,6 +57,9 @@
#include <llvm/Intrinsics.h>
#include <llvm/DerivedTypes.h>
#else
#ifdef ISPC_NVPTX_ENABLED
#include <llvm/IR/Metadata.h>
#endif /* ISPC_NVPTX_ENABLED */
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>
@@ -129,7 +135,11 @@ Function::Function(Symbol *s, Stmt *c) {
sym->parentFunction = this;
}
if (type->isTask) {
if (type->isTask
#ifdef ISPC_NVPTX_ENABLED
&& (g->target->getISA() != Target::NVPTX)
#endif
){
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
Assert(threadIndexSym);
threadCountSym = m->symbolTable->LookupVariable("threadCount");
@@ -240,7 +250,11 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
#endif
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isTask == true) {
if (type->isTask == true
#ifdef ISPC_NVPTX_ENABLED
&& (g->target->getISA() != Target::NVPTX)
#endif
){
// For tasks, there should always be three parameters: the
// pointer to the structure that holds all of the arguments, the
// thread index, and the thread count variables.
@@ -338,6 +352,18 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
ctx->SetFunctionMask(argIter);
Assert(++argIter == function->arg_end());
}
#ifdef ISPC_NVPTX_ENABLED
if (type->isTask == true && g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(function);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(LLVMInt32(1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
}
#endif /* ISPC_NVPTX_ENABLED */
}
// Finally, we can generate code for the function
@@ -499,6 +525,21 @@ Function::GenerateIR() {
std::string functionName = sym->name;
if (g->mangleFunctionsWithTarget)
functionName += std::string("_") + g->target->GetISAString();
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */
#if 0
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(function);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
#endif
}
#endif /* ISPC_NVPTX_ENABLED */
llvm::Function *appFunction =
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
appFunction->setDoesNotThrow();
@@ -536,6 +577,18 @@ Function::GenerateIR() {
FATAL("Function verificication failed");
}
}
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(appFunction);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
}
#endif /* ISPC_NVPTX_ENABLED */
}
}
}

View File

@@ -243,6 +243,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
arch = "arm";
else
#endif
#ifdef ISPC_NVPTX_ENABLED
if(!strncmp(isa, "nvptx", 5))
arch = "nvptx64";
else
#endif /* ISPC_NVPTX_ENABLED */
arch = "x86-64";
}
@@ -582,6 +587,23 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_maskBitCount = 32;
}
#endif
#ifdef ISPC_NVPTX_ENABLED
else if (!strcasecmp(isa, "nvptx"))
{
this->m_isa = Target::NVPTX;
this->m_cpu = "sm_35";
this->m_nativeVectorWidth = 32;
this->m_nativeVectorAlignment = 32;
this->m_vectorWidth = 1;
this->m_hasHalf = true;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
this->m_hasTranscendentals = true;
this->m_hasTrigonometry = true;
this->m_hasGather = this->m_hasScatter = false;
cpuFromIsa = "sm_35";
}
#endif /* ISPC_NVPTX_ENABLED */
else {
Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.",
isa, SupportedTargets());
@@ -679,6 +701,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
"i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-"
"f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128";
}
else if (m_isa == Target::NVPTX)
{
dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
}
// 3. Finally set member data
m_dataLayout = new llvm::DataLayout(dl_string);
@@ -695,6 +721,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
// Initialize target-specific "target-feature" attribute.
if (!m_attributes.empty()) {
llvm::AttrBuilder attrBuilder;
#ifdef ISPC_NVPTX_ENABLED
if (m_isa != Target::NVPTX)
#endif
attrBuilder.addAttribute("target-cpu", this->m_cpu);
attrBuilder.addAttribute("target-features", this->m_attributes);
this->m_tf_attributes = new llvm::AttributeSet(
@@ -742,6 +771,9 @@ Target::SupportedTargets() {
return
#ifdef ISPC_ARM_ENABLED
"neon-i8x16, neon-i16x8, neon-i32x4, "
#endif
#ifdef ISPC_NVPTX_ENABLED
"nvptx, "
#endif
"sse2-i32x4, sse2-i32x8, "
"sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, "
@@ -777,6 +809,10 @@ Target::GetTripleString() const {
triple.setArchName("i386");
else if (m_arch == "x86-64")
triple.setArchName("x86_64");
#ifdef ISPC_NVPTX_ENABLED
else if (m_arch == "nvptx64")
triple = llvm::Triple("nvptx64", "nvidia", "cuda");
#endif /* ISPC_NVPTX_ENABLED */
else
triple.setArchName(m_arch);
}
@@ -809,6 +845,10 @@ Target::ISAToString(ISA isa) {
return "avx2";
case Target::GENERIC:
return "generic";
#ifdef ISPC_NVPTX_ENABLED
case Target::NVPTX:
return "nvptx";
#endif /* ISPC_NVPTX_ENABLED */
default:
FATAL("Unhandled target in ISAToString()");
}
@@ -847,6 +887,10 @@ Target::ISAToTargetString(ISA isa) {
return "avx2-i32x8";
case Target::GENERIC:
return "generic-4";
#ifdef ISPC_NVPTX_ENABLED
case Target::NVPTX:
return "nvptx";
#endif /* ISPC_NVPTX_ENABLED */
default:
FATAL("Unhandled target in ISAToTargetString()");
}

Some files were not shown because too many files have changed in this diff Show More