added examples

This commit is contained in:
Evghenii
2014-01-06 14:00:36 +01:00
parent c7ed130cce
commit 785b2f5d24
26 changed files with 3 additions and 2564 deletions

View File

@@ -2,7 +2,7 @@
EXAMPLE=ao
CPP_SRC=ao.cpp ao_serial.cpp
ISPC_SRC=ao.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8
ISPC_ARM_TARGETS=neon
include ../common.mk

Binary file not shown.

View File

@@ -1,139 +0,0 @@
/*
Copyright (c) 2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define ISPC_IS_WINDOWS
#define NOMINMAX
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#include <fcntl.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <stdint.h>
#include <algorithm>
#include <assert.h>
#include <vector>
#ifdef ISPC_IS_WINDOWS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#include "deferred.h"
#include "kernels_ispc.h"
#include "../timing.h"
///////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv) {
    if (argc != 2) {
        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
        return 1;
    }
    InputData *input = CreateInputDataFromFile(argv[1]);
    if (!input) {
        printf("Failed to load input file \"%s\"!\n", argv[1]);
        return 1;
    }
    Framebuffer framebuffer(input->header.framebufferWidth,
                            input->header.framebufferHeight);
    InitDynamicC(input);
#ifdef __cilk
    InitDynamicCilk(input);
#endif // __cilk
    int nframes = 5;
    double ispcCycles = 1e30;
    for (int i = 0; i < 5; ++i) {
        framebuffer.clear();
        reset_and_start_timer();
        for (int j = 0; j < nframes; ++j)
            ispc::RenderStatic(input->header, input->arrays,
                               VISUALIZE_LIGHT_COUNT,
                               framebuffer.r, framebuffer.g, framebuffer.b);
        double mcycles = get_elapsed_mcycles() / nframes;
        ispcCycles = std::min(ispcCycles, mcycles);
    }
    printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
           "%d x %d image\n", ispcCycles,
           input->header.framebufferWidth, input->header.framebufferHeight);
    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
#ifdef __cilk
    double dynamicCilkCycles = 1e30;
    for (int i = 0; i < 5; ++i) {
        framebuffer.clear();
        reset_and_start_timer();
        for (int j = 0; j < nframes; ++j)
            DispatchDynamicCilk(input, &framebuffer);
        double mcycles = get_elapsed_mcycles() / nframes;
        dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
    }
    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n",
           dynamicCilkCycles);
    WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
#endif // __cilk
    double serialCycles = 1e30;
    for (int i = 0; i < 5; ++i) {
        framebuffer.clear();
        reset_and_start_timer();
        for (int j = 0; j < nframes; ++j)
            DispatchDynamicC(input, &framebuffer);
        double mcycles = get_elapsed_mcycles() / nframes;
        serialCycles = std::min(serialCycles, mcycles);
    }
    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",
           serialCycles);
    WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
#ifdef __cilk
    printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
           serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
#else
    printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles);
#endif // __cilk
    DeleteInputData(input);
    return 0;
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,843 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __vselect_i8
// @__vselect_i8
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
.param .align 1 .b8 __vselect_i8_param_0[1],
.param .align 1 .b8 __vselect_i8_param_1[1],
.param .align 4 .b8 __vselect_i8_param_2[4]
)
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u32 %r0, [__vselect_i8_param_2];
setp.eq.s32 %p0, %r0, 0;
ld.param.u8 %rc0, [__vselect_i8_param_0];
ld.param.u8 %rc1, [__vselect_i8_param_1];
selp.b16 %rc0, %rc0, %rc1, %p0;
st.param.b8 [func_retval0+0], %rc0;
ret;
}
// .globl __vselect_i16
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
.param .align 2 .b8 __vselect_i16_param_0[2],
.param .align 2 .b8 __vselect_i16_param_1[2],
.param .align 4 .b8 __vselect_i16_param_2[4]
) // @__vselect_i16
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u32 %r0, [__vselect_i16_param_2];
setp.eq.s32 %p0, %r0, 0;
ld.param.u16 %rs0, [__vselect_i16_param_0];
ld.param.u16 %rs1, [__vselect_i16_param_1];
selp.b16 %rs0, %rs0, %rs1, %p0;
st.param.b16 [func_retval0+0], %rs0;
ret;
}
// .globl __vselect_i64
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
.param .align 8 .b8 __vselect_i64_param_0[8],
.param .align 8 .b8 __vselect_i64_param_1[8],
.param .align 4 .b8 __vselect_i64_param_2[4]
) // @__vselect_i64
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u32 %r0, [__vselect_i64_param_2];
setp.eq.s32 %p0, %r0, 0;
ld.param.u64 %rl0, [__vselect_i64_param_0];
ld.param.u64 %rl1, [__vselect_i64_param_1];
selp.b64 %rl0, %rl0, %rl1, %p0;
st.param.b64 [func_retval0+0], %rl0;
ret;
}
// .globl __aos_to_soa4_float1
.func __aos_to_soa4_float1(
.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
.param .b64 __aos_to_soa4_float1_param_4,
.param .b64 __aos_to_soa4_float1_param_5,
.param .b64 __aos_to_soa4_float1_param_6,
.param .b64 __aos_to_soa4_float1_param_7
) // @__aos_to_soa4_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u64 %rl0, [__aos_to_soa4_float1_param_4];
ld.param.u64 %rl1, [__aos_to_soa4_float1_param_5];
ld.param.u64 %rl2, [__aos_to_soa4_float1_param_6];
ld.param.u64 %rl3, [__aos_to_soa4_float1_param_7];
ld.param.f32 %f0, [__aos_to_soa4_float1_param_0];
ld.param.f32 %f1, [__aos_to_soa4_float1_param_1];
ld.param.f32 %f2, [__aos_to_soa4_float1_param_2];
ld.param.f32 %f3, [__aos_to_soa4_float1_param_3];
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __soa_to_aos4_float1
.func __soa_to_aos4_float1(
.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
.param .b64 __soa_to_aos4_float1_param_4,
.param .b64 __soa_to_aos4_float1_param_5,
.param .b64 __soa_to_aos4_float1_param_6,
.param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u64 %rl0, [__soa_to_aos4_float1_param_4];
ld.param.u64 %rl1, [__soa_to_aos4_float1_param_5];
ld.param.u64 %rl2, [__soa_to_aos4_float1_param_6];
ld.param.u64 %rl3, [__soa_to_aos4_float1_param_7];
ld.param.f32 %f0, [__soa_to_aos4_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos4_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos4_float1_param_2];
ld.param.f32 %f3, [__soa_to_aos4_float1_param_3];
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __aos_to_soa3_float1
.func __aos_to_soa3_float1(
.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
.param .b64 __aos_to_soa3_float1_param_3,
.param .b64 __aos_to_soa3_float1_param_4,
.param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u64 %rl0, [__aos_to_soa3_float1_param_3];
ld.param.u64 %rl1, [__aos_to_soa3_float1_param_4];
ld.param.u64 %rl2, [__aos_to_soa3_float1_param_5];
ld.param.f32 %f0, [__aos_to_soa3_float1_param_0];
ld.param.f32 %f1, [__aos_to_soa3_float1_param_1];
ld.param.f32 %f2, [__aos_to_soa3_float1_param_2];
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl __soa_to_aos3_float1
.func __soa_to_aos3_float1(
.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
.param .b64 __soa_to_aos3_float1_param_3,
.param .b64 __soa_to_aos3_float1_param_4,
.param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u64 %rl0, [__soa_to_aos3_float1_param_3];
ld.param.u64 %rl1, [__soa_to_aos3_float1_param_4];
ld.param.u64 %rl2, [__soa_to_aos3_float1_param_5];
ld.param.f32 %f0, [__soa_to_aos3_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos3_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos3_float1_param_2];
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl __rsqrt_varying_double
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
.param .align 8 .b8 __rsqrt_varying_double_param_0[8]
) // @__rsqrt_varying_double
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.f64 %fl0, [__rsqrt_varying_double_param_0];
rsqrt.approx.f64 %fl0, %fl0;
st.param.f64 [func_retval0+0], %fl0;
ret;
}
// .globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
.func mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_(
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6,
.param .b64 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7,
.param .align 4 .b8 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8[4]
) // @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
ld.param.f32 %f0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0];
ld.param.f32 %f1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1];
ld.param.f32 %f3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2];
ld.param.f32 %f2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3];
ld.param.u32 %r0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4];
ld.param.u32 %r1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5];
ld.param.u32 %r2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6];
ld.param.u64 %rl0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7];
ld.param.u32 %r3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8];
setp.lt.s32 %p0, %r3, 0;
sub.f32 %f3, %f3, %f0;
cvt.rn.f32.s32 %f4, %r0;
sub.f32 %f2, %f2, %f1;
cvt.rn.f32.s32 %f5, %r1;
div.rn.f32 %f2, %f2, %f5;
div.rn.f32 %f3, %f3, %f4;
@%p0 bra BB8_9;
// BB#1: // %for_test110.preheader
setp.lt.s32 %p0, %r1, 1;
@%p0 bra BB8_45;
// BB#2: // %outer_not_in_extras140.preheader.lr.ph
setp.gt.s32 %p0, %r2, 0;
mov.u32 %r3, 0;
selp.b32 %r4, -1, 0, %p0;
shl.b32 %r5, %r0, 2;
mov.u32 %r6, %r3;
BB8_3: // %outer_not_in_extras140.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_41 Depth 2
// Child Loop BB8_43 Depth 2
// Child Loop BB8_38 Depth 2
// Child Loop BB8_33 Depth 3
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB8_4;
// BB#31: // %foreach_full_body120.lr.ph
// in Loop: Header=BB8_3 Depth=1
setp.lt.s32 %p0, %r4, 0;
mov.u32 %r7, %r0;
mov.u32 %r8, %r3;
@%p0 bra BB8_32;
bra.uni BB8_43;
BB8_32: // in Loop: Header=BB8_3 Depth=1
mov.u64 %rl1, 0;
cvt.rn.f32.s32 %f4, %r6;
fma.rn.f32 %f4, %f2, %f4, %f1;
mul.lo.s32 %r7, %r6, %r0;
BB8_38: // %for_loop.i380.lr.ph.us
// Parent Loop BB8_3 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_33 Depth 3
cvt.u32.u64 %r8, %rl1;
cvt.rn.f32.s32 %f5, %r8;
fma.rn.f32 %f5, %f3, %f5, %f0;
mov.u32 %r10, 0;
mov.u32 %r12, %r4;
mov.u32 %r11, %r10;
mov.u32 %r9, %r10;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB8_33: // %for_loop.i380.us
// Parent Loop BB8_3 Depth=1
// Parent Loop BB8_38 Depth=2
// => This Inner Loop Header: Depth=3
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r13, %r12, 0, %p0;
or.b32 %r11, %r13, %r11;
shr.u32 %r13, %r11, 31;
shr.u32 %r14, %r12, 31;
setp.eq.s32 %p0, %r13, %r14;
@%p0 bra BB8_34;
bra.uni BB8_35;
BB8_34: // in Loop: Header=BB8_33 Depth=3
mov.u32 %r12, %r10;
bra.uni BB8_36;
BB8_35: // %not_all_continued_or_breaked.i394.us
// in Loop: Header=BB8_33 Depth=3
mul.f32 %f9, %f6, %f6;
not.b32 %r13, %r11;
and.b32 %r12, %r12, %r13;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB8_36: // %for_step.i363.us
// in Loop: Header=BB8_33 Depth=3
setp.ne.s32 %p0, %r12, 0;
selp.u32 %r13, 1, 0, %p0;
add.s32 %r9, %r9, %r13;
setp.lt.s32 %p0, %r9, %r2;
selp.b32 %r12, %r12, 0, %p0;
setp.lt.s32 %p0, %r12, 0;
@%p0 bra BB8_33;
// BB#37: // %mandel___vyfvyfvyi.exit395.us
// in Loop: Header=BB8_38 Depth=2
add.s32 %r8, %r8, %r7;
shl.b32 %r8, %r8, 2;
cvt.s64.s32 %rl2, %r8;
add.s64 %rl2, %rl2, %rl0;
st.u32 [%rl2], %r9;
add.s64 %rl1, %rl1, 1;
cvt.u32.u64 %r8, %rl1;
setp.eq.s32 %p0, %r8, %r0;
@%p0 bra BB8_44;
bra.uni BB8_38;
BB8_43: // %mandel___vyfvyfvyi.exit395
// Parent Loop BB8_3 Depth=1
// => This Inner Loop Header: Depth=2
cvt.s64.s32 %rl1, %r8;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
add.s32 %r8, %r8, 4;
add.s32 %r7, %r7, -1;
setp.eq.s32 %p0, %r7, 0;
@%p0 bra BB8_44;
bra.uni BB8_43;
BB8_4: // %partial_inner_all_outer156
// in Loop: Header=BB8_3 Depth=1
@%p0 bra BB8_44;
// BB#5: // %partial_inner_only197
// in Loop: Header=BB8_3 Depth=1
setp.gt.s32 %p0, %r0, 0;
mov.u32 %r8, 0;
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
cvt.rn.f32.s32 %f5, %r6;
fma.rn.f32 %f5, %f2, %f5, %f1;
selp.b32 %r7, %r4, 0, %p0;
setp.lt.s32 %p1, %r7, 0;
mov.u32 %r10, %r4;
mov.u32 %r9, %r8;
mov.u32 %r7, %r8;
mov.f32 %f7, %f4;
mov.f32 %f6, %f5;
@%p1 bra BB8_41;
bra.uni BB8_6;
BB8_41: // %for_loop.i
// Parent Loop BB8_3 Depth=1
// => This Inner Loop Header: Depth=2
selp.b32 %r11, %r10, 0, %p0;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p1, %f9, 0f40800000;
selp.b32 %r12, %r10, 0, %p1;
or.b32 %r9, %r12, %r9;
selp.b32 %r12, %r9, 0, %p0;
shr.u32 %r12, %r12, 31;
shr.u32 %r11, %r11, 31;
setp.eq.s32 %p1, %r12, %r11;
@%p1 bra BB8_42;
bra.uni BB8_39;
BB8_42: // in Loop: Header=BB8_41 Depth=2
mov.u32 %r10, %r8;
bra.uni BB8_40;
BB8_39: // %not_all_continued_or_breaked.i
// in Loop: Header=BB8_41 Depth=2
mul.f32 %f9, %f6, %f6;
not.b32 %r11, %r9;
and.b32 %r10, %r10, %r11;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f4, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f5;
mov.f32 %f7, %f8;
BB8_40: // %for_step.i
// in Loop: Header=BB8_41 Depth=2
setp.ne.s32 %p1, %r10, 0;
selp.u32 %r11, 1, 0, %p1;
add.s32 %r7, %r7, %r11;
setp.lt.s32 %p1, %r7, %r2;
selp.b32 %r10, %r10, 0, %p1;
selp.b32 %r11, %r10, 0, %p0;
setp.gt.s32 %p1, %r11, -1;
@%p1 bra BB8_7;
bra.uni BB8_41;
BB8_6: // in Loop: Header=BB8_3 Depth=1
mov.u32 %r7, %r8;
BB8_7: // %mandel___vyfvyfvyi.exit
// in Loop: Header=BB8_3 Depth=1
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB8_44;
// BB#8: // %pl_dolane.i
// in Loop: Header=BB8_3 Depth=1
mul.lo.s32 %r8, %r6, %r0;
shl.b32 %r8, %r8, 2;
cvt.s64.s32 %rl1, %r8;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r7;
BB8_44: // %foreach_reset128
// in Loop: Header=BB8_3 Depth=1
add.s32 %r6, %r6, 1;
add.s32 %r3, %r3, %r5;
setp.eq.s32 %p0, %r6, %r1;
@%p0 bra BB8_45;
bra.uni BB8_3;
BB8_9: // %for_test.preheader
setp.lt.s32 %p0, %r1, 1;
@%p0 bra BB8_45;
// BB#10: // %outer_not_in_extras.preheader.lr.ph
setp.gt.s32 %p0, %r2, 0;
mov.u32 %r3, 0;
selp.b32 %r4, -1, 0, %p0;
shl.b32 %r5, %r0, 2;
mov.u32 %r6, %r3;
BB8_11: // %outer_not_in_extras.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_23 Depth 2
// Child Loop BB8_20 Depth 2
// Child Loop BB8_19 Depth 2
// Child Loop BB8_14 Depth 3
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB8_28;
// BB#12: // %foreach_full_body.lr.ph
// in Loop: Header=BB8_11 Depth=1
setp.lt.s32 %p0, %r4, 0;
mov.u32 %r7, %r0;
mov.u32 %r8, %r3;
@%p0 bra BB8_13;
bra.uni BB8_20;
BB8_13: // in Loop: Header=BB8_11 Depth=1
mov.u64 %rl1, 0;
cvt.rn.f32.s32 %f4, %r6;
fma.rn.f32 %f4, %f2, %f4, %f1;
mul.lo.s32 %r7, %r6, %r0;
BB8_19: // %for_loop.i281.lr.ph.us
// Parent Loop BB8_11 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_14 Depth 3
cvt.u32.u64 %r8, %rl1;
cvt.rn.f32.s32 %f5, %r8;
fma.rn.f32 %f5, %f3, %f5, %f0;
mov.u32 %r10, 0;
mov.u32 %r12, %r4;
mov.u32 %r11, %r10;
mov.u32 %r9, %r10;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB8_14: // %for_loop.i281.us
// Parent Loop BB8_11 Depth=1
// Parent Loop BB8_19 Depth=2
// => This Inner Loop Header: Depth=3
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r13, %r12, 0, %p0;
or.b32 %r11, %r13, %r11;
shr.u32 %r13, %r11, 31;
shr.u32 %r14, %r12, 31;
setp.eq.s32 %p0, %r13, %r14;
@%p0 bra BB8_15;
bra.uni BB8_16;
BB8_15: // in Loop: Header=BB8_14 Depth=3
mov.u32 %r12, %r10;
bra.uni BB8_17;
BB8_16: // %not_all_continued_or_breaked.i295.us
// in Loop: Header=BB8_14 Depth=3
mul.f32 %f9, %f6, %f6;
not.b32 %r13, %r11;
and.b32 %r12, %r12, %r13;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB8_17: // %for_step.i264.us
// in Loop: Header=BB8_14 Depth=3
setp.ne.s32 %p0, %r12, 0;
selp.u32 %r13, 1, 0, %p0;
add.s32 %r9, %r9, %r13;
setp.lt.s32 %p0, %r9, %r2;
selp.b32 %r12, %r12, 0, %p0;
setp.lt.s32 %p0, %r12, 0;
@%p0 bra BB8_14;
// BB#18: // %mandel___vyfvyfvyi.exit296.us
// in Loop: Header=BB8_19 Depth=2
add.s32 %r8, %r8, %r7;
shl.b32 %r8, %r8, 2;
cvt.s64.s32 %rl2, %r8;
add.s64 %rl2, %rl2, %rl0;
st.u32 [%rl2], %r9;
add.s64 %rl1, %rl1, 1;
cvt.u32.u64 %r8, %rl1;
setp.eq.s32 %p0, %r8, %r0;
@%p0 bra BB8_27;
bra.uni BB8_19;
BB8_20: // %mandel___vyfvyfvyi.exit296
// Parent Loop BB8_11 Depth=1
// => This Inner Loop Header: Depth=2
cvt.s64.s32 %rl1, %r8;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
add.s32 %r8, %r8, 4;
add.s32 %r7, %r7, -1;
setp.eq.s32 %p0, %r7, 0;
@%p0 bra BB8_27;
bra.uni BB8_20;
BB8_28: // %partial_inner_all_outer
// in Loop: Header=BB8_11 Depth=1
@%p0 bra BB8_27;
// BB#29: // %partial_inner_only
// in Loop: Header=BB8_11 Depth=1
setp.gt.s32 %p0, %r0, 0;
mov.u32 %r8, 0;
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
cvt.rn.f32.s32 %f5, %r6;
fma.rn.f32 %f5, %f2, %f5, %f1;
selp.b32 %r7, %r4, 0, %p0;
setp.lt.s32 %p1, %r7, 0;
mov.u32 %r10, %r4;
mov.u32 %r9, %r8;
mov.u32 %r7, %r8;
mov.f32 %f7, %f4;
mov.f32 %f6, %f5;
@%p1 bra BB8_23;
bra.uni BB8_30;
BB8_23: // %for_loop.i332
// Parent Loop BB8_11 Depth=1
// => This Inner Loop Header: Depth=2
selp.b32 %r11, %r10, 0, %p0;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p1, %f9, 0f40800000;
selp.b32 %r12, %r10, 0, %p1;
or.b32 %r9, %r12, %r9;
selp.b32 %r12, %r9, 0, %p0;
shr.u32 %r12, %r12, 31;
shr.u32 %r11, %r11, 31;
setp.eq.s32 %p1, %r12, %r11;
@%p1 bra BB8_24;
bra.uni BB8_21;
BB8_24: // in Loop: Header=BB8_23 Depth=2
mov.u32 %r10, %r8;
bra.uni BB8_22;
BB8_21: // %not_all_continued_or_breaked.i346
// in Loop: Header=BB8_23 Depth=2
mul.f32 %f9, %f6, %f6;
not.b32 %r11, %r9;
and.b32 %r10, %r10, %r11;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f4, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f5;
mov.f32 %f7, %f8;
BB8_22: // %for_step.i313
// in Loop: Header=BB8_23 Depth=2
setp.ne.s32 %p1, %r10, 0;
selp.u32 %r11, 1, 0, %p1;
add.s32 %r7, %r7, %r11;
setp.lt.s32 %p1, %r7, %r2;
selp.b32 %r10, %r10, 0, %p1;
selp.b32 %r11, %r10, 0, %p0;
setp.gt.s32 %p1, %r11, -1;
@%p1 bra BB8_25;
bra.uni BB8_23;
BB8_30: // in Loop: Header=BB8_11 Depth=1
mov.u32 %r7, %r8;
BB8_25: // %mandel___vyfvyfvyi.exit347
// in Loop: Header=BB8_11 Depth=1
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB8_27;
// BB#26: // %pl_dolane.i452
// in Loop: Header=BB8_11 Depth=1
mul.lo.s32 %r8, %r6, %r0;
shl.b32 %r8, %r8, 2;
cvt.s64.s32 %rl1, %r8;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r7;
BB8_27: // %foreach_reset
// in Loop: Header=BB8_11 Depth=1
add.s32 %r6, %r6, 1;
add.s32 %r3, %r3, %r5;
setp.eq.s32 %p0, %r6, %r1;
@%p0 bra BB8_45;
bra.uni BB8_11;
BB8_45: // %for_exit
ret;
}
// .globl mandelbrot_ispc
.func mandelbrot_ispc(
.param .b32 mandelbrot_ispc_param_0,
.param .b32 mandelbrot_ispc_param_1,
.param .b32 mandelbrot_ispc_param_2,
.param .b32 mandelbrot_ispc_param_3,
.param .b32 mandelbrot_ispc_param_4,
.param .b32 mandelbrot_ispc_param_5,
.param .b32 mandelbrot_ispc_param_6,
.param .b64 mandelbrot_ispc_param_7
) // @mandelbrot_ispc
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
ld.param.u32 %r0, [mandelbrot_ispc_param_5];
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB9_18;
// BB#1: // %outer_not_in_extras.preheader.lr.ph
ld.param.f32 %f0, [mandelbrot_ispc_param_0];
ld.param.f32 %f1, [mandelbrot_ispc_param_1];
ld.param.f32 %f3, [mandelbrot_ispc_param_2];
ld.param.f32 %f2, [mandelbrot_ispc_param_3];
ld.param.u32 %r1, [mandelbrot_ispc_param_4];
ld.param.u32 %r2, [mandelbrot_ispc_param_6];
ld.param.u64 %rl0, [mandelbrot_ispc_param_7];
sub.f32 %f3, %f3, %f0;
cvt.rn.f32.s32 %f4, %r1;
sub.f32 %f2, %f2, %f1;
cvt.rn.f32.s32 %f5, %r0;
div.rn.f32 %f2, %f2, %f5;
div.rn.f32 %f3, %f3, %f4;
setp.gt.s32 %p0, %r2, 0;
mov.u32 %r3, 0;
selp.b32 %r4, -1, 0, %p0;
BB9_2: // %outer_not_in_extras.preheader
// =>This Loop Header: Depth=1
// Child Loop BB9_13 Depth 2
// Child Loop BB9_4 Depth 2
// Child Loop BB9_9 Depth 3
setp.lt.s32 %p0, %r1, 1;
@%p0 bra BB9_19;
// BB#3: // %foreach_full_body.lr.ph
// in Loop: Header=BB9_2 Depth=1
mov.u64 %rl1, 0;
cvt.rn.f32.s32 %f4, %r3;
fma.rn.f32 %f4, %f2, %f4, %f1;
mul.lo.s32 %r5, %r3, %r1;
BB9_4: // %foreach_full_body
// Parent Loop BB9_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB9_9 Depth 3
setp.lt.s32 %p0, %r4, 0;
cvt.u32.u64 %r6, %rl1;
cvt.rn.f32.s32 %f5, %r6;
fma.rn.f32 %f5, %f3, %f5, %f0;
mov.u32 %r8, 0;
mov.u32 %r10, %r4;
mov.u32 %r9, %r8;
mov.u32 %r7, %r8;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
@%p0 bra BB9_9;
bra.uni BB9_5;
BB9_9: // %for_loop.i281
// Parent Loop BB9_2 Depth=1
// Parent Loop BB9_4 Depth=2
// => This Inner Loop Header: Depth=3
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r11, %r10, 0, %p0;
or.b32 %r9, %r11, %r9;
shr.u32 %r11, %r9, 31;
shr.u32 %r12, %r10, 31;
setp.eq.s32 %p0, %r11, %r12;
@%p0 bra BB9_10;
bra.uni BB9_7;
BB9_10: // in Loop: Header=BB9_9 Depth=3
mov.u32 %r10, %r8;
bra.uni BB9_8;
BB9_7: // %not_all_continued_or_breaked.i295
// in Loop: Header=BB9_9 Depth=3
mul.f32 %f9, %f6, %f6;
not.b32 %r11, %r9;
and.b32 %r10, %r10, %r11;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB9_8: // %for_step.i264
// in Loop: Header=BB9_9 Depth=3
setp.ne.s32 %p0, %r10, 0;
selp.u32 %r11, 1, 0, %p0;
add.s32 %r7, %r7, %r11;
setp.lt.s32 %p0, %r7, %r2;
selp.b32 %r10, %r10, 0, %p0;
setp.gt.s32 %p0, %r10, -1;
@%p0 bra BB9_6;
bra.uni BB9_9;
BB9_5: // in Loop: Header=BB9_4 Depth=2
mov.u32 %r7, %r8;
BB9_6: // %mandel___vyfvyfvyi.exit296
// in Loop: Header=BB9_4 Depth=2
add.s32 %r6, %r6, %r5;
shl.b32 %r6, %r6, 2;
cvt.s64.s32 %rl2, %r6;
add.s64 %rl2, %rl2, %rl0;
st.u32 [%rl2], %r7;
add.s64 %rl1, %rl1, 1;
cvt.u32.u64 %r6, %rl1;
setp.eq.s32 %p0, %r6, %r1;
@%p0 bra BB9_17;
bra.uni BB9_4;
BB9_19: // %partial_inner_all_outer
// in Loop: Header=BB9_2 Depth=1
@%p0 bra BB9_17;
// BB#20: // %partial_inner_only
// in Loop: Header=BB9_2 Depth=1
setp.gt.s32 %p0, %r1, 0;
mov.u32 %r6, 0;
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
cvt.rn.f32.s32 %f5, %r3;
fma.rn.f32 %f5, %f2, %f5, %f1;
selp.b32 %r5, %r4, 0, %p0;
setp.lt.s32 %p1, %r5, 0;
mov.u32 %r8, %r4;
mov.u32 %r7, %r6;
mov.u32 %r5, %r6;
mov.f32 %f7, %f4;
mov.f32 %f6, %f5;
@%p1 bra BB9_13;
bra.uni BB9_21;
BB9_13: // %for_loop.i332
// Parent Loop BB9_2 Depth=1
// => This Inner Loop Header: Depth=2
selp.b32 %r9, %r8, 0, %p0;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p1, %f9, 0f40800000;
selp.b32 %r10, %r8, 0, %p1;
or.b32 %r7, %r10, %r7;
selp.b32 %r10, %r7, 0, %p0;
shr.u32 %r10, %r10, 31;
shr.u32 %r9, %r9, 31;
setp.eq.s32 %p1, %r10, %r9;
@%p1 bra BB9_14;
bra.uni BB9_11;
BB9_14: // in Loop: Header=BB9_13 Depth=2
mov.u32 %r8, %r6;
bra.uni BB9_12;
BB9_11: // %not_all_continued_or_breaked.i346
// in Loop: Header=BB9_13 Depth=2
mul.f32 %f9, %f6, %f6;
not.b32 %r9, %r7;
and.b32 %r8, %r8, %r9;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f4, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f5;
mov.f32 %f7, %f8;
BB9_12: // %for_step.i313
// in Loop: Header=BB9_13 Depth=2
setp.ne.s32 %p1, %r8, 0;
selp.u32 %r9, 1, 0, %p1;
add.s32 %r5, %r5, %r9;
setp.lt.s32 %p1, %r5, %r2;
selp.b32 %r8, %r8, 0, %p1;
selp.b32 %r9, %r8, 0, %p0;
setp.gt.s32 %p1, %r9, -1;
@%p1 bra BB9_15;
bra.uni BB9_13;
BB9_21: // in Loop: Header=BB9_2 Depth=1
mov.u32 %r5, %r6;
BB9_15: // %mandel___vyfvyfvyi.exit347
// in Loop: Header=BB9_2 Depth=1
setp.lt.s32 %p0, %r1, 1;
@%p0 bra BB9_17;
// BB#16: // %pl_dolane.i
// in Loop: Header=BB9_2 Depth=1
mul.lo.s32 %r6, %r3, %r1;
shl.b32 %r6, %r6, 2;
cvt.s64.s32 %rl1, %r6;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r5;
BB9_17: // %foreach_reset
// in Loop: Header=BB9_2 Depth=1
add.s32 %r3, %r3, 1;
setp.eq.s32 %p0, %r3, %r0;
@%p0 bra BB9_18;
bra.uni BB9_2;
BB9_18: // %for_exit
ret;
}

Binary file not shown.

Binary file not shown.

View File

@@ -2,7 +2,7 @@
EXAMPLE=mandelbrot_tasks
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
ISPC_SRC=mandelbrot_tasks.ispc
ISPC_IA_TARGETS=avx1-i32x16
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon
include ../common.mk

View File

@@ -2,7 +2,7 @@
EXAMPLE=rt
CPP_SRC=rt.cpp rt_serial.cpp
ISPC_SRC=rt.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8
ISPC_ARM_TARGETS=neon
include ../common.mk

Binary file not shown.

Binary file not shown.

View File

@@ -1,370 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef _DRVAPI_ERROR_STRING_H_
#define _DRVAPI_ERROR_STRING_H_
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Error Code string definitions here
typedef struct
{
char const *error_string;
int error_id;
} s_CudaErrorStr;
/**
* Error codes
*/
static s_CudaErrorStr sCudaDrvErrorString[] =
{
/**
* The API call returned with no errors. In the case of query calls, this
* can also mean that the operation being queried is complete (see
* ::cuEventQuery() and ::cuStreamQuery()).
*/
{ "CUDA_SUCCESS", 0 },
/**
* This indicates that one or more of the parameters passed to the API call
* is not within an acceptable range of values.
*/
{ "CUDA_ERROR_INVALID_VALUE", 1 },
/**
* The API call failed because it was unable to allocate enough memory to
* perform the requested operation.
*/
{ "CUDA_ERROR_OUT_OF_MEMORY", 2 },
/**
* This indicates that the CUDA driver has not been initialized with
* ::cuInit() or that initialization has failed.
*/
{ "CUDA_ERROR_NOT_INITIALIZED", 3 },
/**
* This indicates that the CUDA driver is in the process of shutting down.
*/
{ "CUDA_ERROR_DEINITIALIZED", 4 },
/**
* This indicates profiling APIs are called while application is running
* in visual profiler mode.
*/
{ "CUDA_ERROR_PROFILER_DISABLED", 5 },
/**
* This indicates profiling has not been initialized for this context.
* Call cuProfilerInitialize() to resolve this.
*/
{ "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 },
/**
* This indicates profiler has already been started and probably
* cuProfilerStart() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 },
/**
* This indicates profiler has already been stopped and probably
* cuProfilerStop() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 },
/**
* This indicates that no CUDA-capable devices were detected by the installed
* CUDA driver.
*/
{ "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 },
/**
* This indicates that the device ordinal supplied by the user does not
* correspond to a valid CUDA device.
*/
{ "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 },
/**
* This indicates that the device kernel image is invalid. This can also
* indicate an invalid CUDA module.
*/
{ "CUDA_ERROR_INVALID_IMAGE", 200 },
/**
* This most frequently indicates that there is no context bound to the
* current thread. This can also be returned if the context passed to an
* API call is not a valid handle (such as a context that has had
* ::cuCtxDestroy() invoked on it). This can also be returned if a user
* mixes different API versions (i.e. 3010 context with 3020 API calls).
* See ::cuCtxGetApiVersion() for more details.
*/
{ "CUDA_ERROR_INVALID_CONTEXT", 201 },
/**
* This indicated that the context being supplied as a parameter to the
* API call was already the active context.
* \deprecated
* This error return is deprecated as of CUDA 3.2. It is no longer an
* error to attempt to push the active context via ::cuCtxPushCurrent().
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 },
/**
* This indicates that a map or register operation has failed.
*/
{ "CUDA_ERROR_MAP_FAILED", 205 },
/**
* This indicates that an unmap or unregister operation has failed.
*/
{ "CUDA_ERROR_UNMAP_FAILED", 206 },
/**
* This indicates that the specified array is currently mapped and thus
* cannot be destroyed.
*/
{ "CUDA_ERROR_ARRAY_IS_MAPPED", 207 },
/**
* This indicates that the resource is already mapped.
*/
{ "CUDA_ERROR_ALREADY_MAPPED", 208 },
/**
* This indicates that there is no kernel image available that is suitable
* for the device. This can occur when a user specifies code generation
* options for a particular CUDA source file that do not include the
* corresponding device configuration.
*/
{ "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 },
/**
* This indicates that a resource has already been acquired.
*/
{ "CUDA_ERROR_ALREADY_ACQUIRED", 210 },
/**
* This indicates that a resource is not mapped.
*/
{ "CUDA_ERROR_NOT_MAPPED", 211 },
/**
* This indicates that a mapped resource is not available for access as an
* array.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 },
/**
* This indicates that a mapped resource is not available for access as a
* pointer.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 },
/**
* This indicates that an uncorrectable ECC error was detected during
* execution.
*/
{ "CUDA_ERROR_ECC_UNCORRECTABLE", 214 },
/**
* This indicates that the ::CUlimit passed to the API call is not
* supported by the active device.
*/
{ "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 },
/**
* This indicates that the ::CUcontext passed to the API call can
* only be bound to a single CPU thread at a time but is already
* bound to a CPU thread.
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 },
/**
* This indicates that peer access is not supported across the given
* devices.
*/
{ "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217},
/**
* This indicates that the device kernel source is invalid.
*/
{ "CUDA_ERROR_INVALID_SOURCE", 300 },
/**
* This indicates that the file specified was not found.
*/
{ "CUDA_ERROR_FILE_NOT_FOUND", 301 },
/**
* This indicates that a link to a shared object failed to resolve.
*/
{ "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 },
/**
* This indicates that initialization of a shared object failed.
*/
{ "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 },
/**
* This indicates that an OS call failed.
*/
{ "CUDA_ERROR_OPERATING_SYSTEM", 304 },
/**
* This indicates that a resource handle passed to the API call was not
* valid. Resource handles are opaque types like ::CUstream and ::CUevent.
*/
{ "CUDA_ERROR_INVALID_HANDLE", 400 },
/**
* This indicates that a named symbol was not found. Examples of symbols
* are global/constant variable names, texture names }, and surface names.
*/
{ "CUDA_ERROR_NOT_FOUND", 500 },
/**
* This indicates that asynchronous operations issued previously have not
* completed yet. This result is not actually an error, but must be indicated
* differently than ::CUDA_SUCCESS (which indicates completion). Calls that
* may return this value include ::cuEventQuery() and ::cuStreamQuery().
*/
{ "CUDA_ERROR_NOT_READY", 600 },
/**
* An exception occurred on the device while executing a kernel. Common
* causes include dereferencing an invalid device pointer and accessing
* out of bounds shared memory. The context cannot be used }, so it must
* be destroyed (and a new one should be created). All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_FAILED", 700 },
/**
* This indicates that a launch did not occur because it did not have
* appropriate resources. This error usually indicates that the user has
* attempted to pass too many arguments to the device kernel, or the
* kernel launch specifies too many threads for the kernel's register
* count. Passing arguments of the wrong size (i.e. a 64-bit pointer
* when a 32-bit int is expected) is equivalent to passing too many
* arguments and can also result in this error.
*/
{ "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 },
/**
* This indicates that the device kernel took too long to execute. This can
* only occur if timeouts are enabled - see the device attribute
* ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
* context cannot be used (and must be destroyed similar to
* ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
* this context are invalid and must be reconstructed if the program is to
* continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_TIMEOUT", 702 },
/**
* This error indicates a kernel launch that uses an incompatible texturing
* mode.
*/
{ "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 },
/**
* This error indicates that a call to ::cuCtxEnablePeerAccess() is
* trying to re-enable peer access to a context which has already
* had peer access to it enabled.
*/
{ "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 },
/**
* This error indicates that ::cuCtxDisablePeerAccess() is
* trying to disable peer access which has not been enabled yet
* via ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 },
/**
* This error indicates that the primary context for the specified device
* has already been initialized.
*/
{ "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 },
/**
* This error indicates that the context current to the calling thread
* has been destroyed using ::cuCtxDestroy }, or is a primary context which
* has not yet been initialized.
*/
{ "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 },
/**
* A device-side assert triggered during kernel execution. The context
* cannot be used anymore, and must be destroyed. All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_ASSERT", 710 },
/**
* This error indicates that the hardware resources required to enable
* peer access have been exhausted for one or more of the devices
* passed to ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_TOO_MANY_PEERS", 711 },
/**
* This error indicates that the memory range passed to ::cuMemHostRegister()
* has already been registered.
*/
{ "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 },
/**
* This error indicates that the pointer passed to ::cuMemHostUnregister()
* does not correspond to any currently registered memory region.
*/
{ "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 },
/**
* This error indicates that the attempted operation is not permitted.
*/
{ "CUDA_ERROR_NOT_PERMITTED", 800 },
/**
* This error indicates that the attempted operation is not supported
* on the current system or device.
*/
{ "CUDA_ERROR_NOT_SUPPORTED", 801 },
/**
* This indicates that an unknown internal error has occurred.
*/
{ "CUDA_ERROR_UNKNOWN", 999 },
{ NULL, -1 }
};
// This is just a linear search through the array, since the error_id values do
// not always occur consecutively.
const char * getCudaDrvErrorString(CUresult error_id)
{
int index = 0;
while (sCudaDrvErrorString[index].error_id != error_id &&
sCudaDrvErrorString[index].error_id != -1)
{
index++;
}
if (sCudaDrvErrorString[index].error_id == error_id)
return (const char *)sCudaDrvErrorString[index].error_string;
else
return (const char *)"CUDA_ERROR not found!";
}
#endif
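For context, a minimal usage sketch for the getCudaDrvErrorString() helper above; the include name, the checkCudaErrors wrapper, and the call sites in main() are illustrative assumptions, while cuInit and cuDeviceGet are standard driver-API entry points:

// Hedged sketch: report CUDA driver-API failures via getCudaDrvErrorString().
#include <cuda.h>                        // driver API: CUresult, cuInit, cuDeviceGet
#include <cstdio>
#include <cstdlib>
#include "drvapi_error_string.h"         // assumed filename of the header shown above

static void checkCudaErrors(CUresult err, const char *what) {
    if (err != CUDA_SUCCESS) {
        fprintf(stderr, "%s failed: %s (error %d)\n",
                what, getCudaDrvErrorString(err), (int)err);
        exit(1);
    }
}

int main() {
    checkCudaErrors(cuInit(0), "cuInit");              // initialize the driver API
    CUdevice dev;
    checkCudaErrors(cuDeviceGet(&dev, 0), "cuDeviceGet");
    return 0;
}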

View File

View File

@@ -1,5 +0,0 @@
I have been working with the sort example, attempting to use ISPC_USE_OMP for tasking and adding sort_paralle.cpp, which uses __gnu_parallel::sort so that we compare apples with apples; however, clang has no OpenMP support.
The reason to use ISPC_USE_OMP is to control thread affinity on multi-socket systems. With the pthread-based tasking system it is messy to control thread affinity, and without it the performance of bandwidth-bound workloads may suffer.
I used the sort example as a starting point; a minimal sketch of the intended baseline follows below.
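A hedged sketch of that GCC-only baseline, assuming libstdc++ parallel mode with OpenMP (the sort_parallel helper and the data setup are illustrative, not from the deleted example); thread affinity would then be pinned from the environment, e.g. OMP_PROC_BIND=close OMP_PLACES=cores:

// Hedged sketch (requires g++ -fopenmp): sort with __gnu_parallel::sort so the
// C++ baseline uses all cores, apples-to-apples with the ISPC tasking version.
#include <parallel/algorithm>   // __gnu_parallel::sort (libstdc++ parallel mode)
#include <vector>
#include <cstdlib>

void sort_parallel(std::vector<int> &keys) {
    __gnu_parallel::sort(keys.begin(), keys.end());
}

int main() {
    std::vector<int> keys(1 << 24);
    for (size_t i = 0; i < keys.size(); ++i)
        keys[i] = rand();                // illustrative random input
    sort_parallel(keys);
    return 0;
}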

View File

@@ -1,62 +0,0 @@
#define programCount 32
#define programIndex threadIdx.x
#define taskIndex blockIdx.x
__device__ static void
stencil_step( int x0, int x1,
              int y0, int y1,
              int z0, int z1,
              int Nx, int Ny, int Nz,
              const double coef[4], const double vsq[],
              const double Ain[], double Aout[]) {
    const int Nxy = Nx * Ny;
#if 0
    foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
#else
    const double coef0 = coef[0];
    const double coef1 = coef[1];
    const double coef2 = coef[2];
    const double coef3 = coef[3];
    for ( int z = z0; z < z1; z++)
        for ( int y = y0 ; y < y1; y++)
            for ( int xb = x0; xb < x1; xb += programCount)
            {
                const int x = xb + programIndex;
#endif
                int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
                double div =
                    coef0 * A_cur(0, 0, 0) +
                    coef1 * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
                             A_cur(0, +1, 0) + A_cur(0, -1, 0) +
                             A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
                    coef2 * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
                             A_cur(0, +2, 0) + A_cur(0, -2, 0) +
                             A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
                    coef3 * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
                             A_cur(0, +3, 0) + A_cur(0, -3, 0) +
                             A_cur(0, 0, +3) + A_cur(0, 0, -3));
                if (x < x1)
                    A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
                        vsq[index] * div;
            }
}
extern "C"
__global__ void
stencil_step_task( int x0, int x1,
                   int y0, int y1,
                   int z0,
                   int Nx, int Ny, int Nz,
                   const double coef[4], const double vsq[],
                   const double Ain[], double Aout[]) {
    stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
                 Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
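The #defines above map ISPC's programIndex/programCount onto a 32-thread CUDA block and taskIndex onto blockIdx.x, so one block handles one z-slice. A hedged host-side launch sketch (the launch_stencil_step wrapper and the d_* device pointer names are illustrative, not part of the original example):

// One block of programCount (= 32) threads per z-slice "task".
void launch_stencil_step(int x0, int x1, int y0, int y1, int z0, int z1,
                         int Nx, int Ny, int Nz,
                         const double *d_coef, const double *d_vsq,
                         const double *d_Ain, double *d_Aout) {
    int ntasks = z1 - z0;                       // becomes taskIndex = blockIdx.x
    stencil_step_task<<<ntasks, 32>>>(x0, x1, y0, y1, z0,
                                      Nx, Ny, Nz, d_coef, d_vsq, d_Ain, d_Aout);
}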

View File

@@ -1,267 +0,0 @@
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
.version 3.2
.target sm_35
.address_size 64
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112
.file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655
.weak .func (.param .b32 func_retval0) cudaMalloc(
.param .b64 cudaMalloc_param_0,
.param .b64 cudaMalloc_param_1
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
.loc 2 66 3
ret;
}
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
.param .b64 cudaFuncGetAttributes_param_0,
.param .b64 cudaFuncGetAttributes_param_1
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
.loc 2 71 3
ret;
}
.visible .entry stencil_step_task(
.param .u32 stencil_step_task_param_0,
.param .u32 stencil_step_task_param_1,
.param .u32 stencil_step_task_param_2,
.param .u32 stencil_step_task_param_3,
.param .u32 stencil_step_task_param_4,
.param .u32 stencil_step_task_param_5,
.param .u32 stencil_step_task_param_6,
.param .u32 stencil_step_task_param_7,
.param .u64 stencil_step_task_param_8,
.param .u64 stencil_step_task_param_9,
.param .u64 stencil_step_task_param_10,
.param .u64 stencil_step_task_param_11
)
{
.reg .pred %p<8>;
.reg .s32 %r<54>;
.reg .s64 %rd<36>;
.reg .f64 %fd<48>;
ld.param.u32 %r19, [stencil_step_task_param_0];
ld.param.u32 %r20, [stencil_step_task_param_1];
ld.param.u32 %r21, [stencil_step_task_param_2];
ld.param.u32 %r22, [stencil_step_task_param_3];
ld.param.u32 %r23, [stencil_step_task_param_4];
ld.param.u32 %r24, [stencil_step_task_param_5];
ld.param.u32 %r25, [stencil_step_task_param_6];
ld.param.u64 %rd4, [stencil_step_task_param_8];
ld.param.u64 %rd1, [stencil_step_task_param_9];
ld.param.u64 %rd2, [stencil_step_task_param_10];
ld.param.u64 %rd3, [stencil_step_task_param_11];
cvta.to.global.u64 %rd5, %rd4;
.loc 1 59 1
mov.u32 %r26, %ctaid.x;
add.s32 %r51, %r26, %r23;
add.s32 %r27, %r51, 1;
.loc 1 18 1
ld.global.f64 %fd1, [%rd5];
.loc 1 19 1
ld.global.f64 %fd2, [%rd5+8];
.loc 1 20 1
ld.global.f64 %fd3, [%rd5+16];
.loc 1 21 1
ld.global.f64 %fd4, [%rd5+24];
.loc 1 22 1
setp.ge.s32 %p1, %r51, %r27;
@%p1 bra BB2_11;
mul.lo.s32 %r28, %r25, %r24;
shl.b32 %r29, %r28, 1;
neg.s32 %r30, %r29;
shl.b32 %r2, %r30, 3;
cvta.to.global.u64 %rd6, %rd2;
cvta.to.global.u64 %rd31, %rd3;
cvta.to.global.u64 %rd32, %rd1;
BB2_2:
.loc 1 23 1
setp.ge.s32 %p2, %r21, %r22;
@%p2 bra BB2_10;
mov.u32 %r52, %r21;
BB2_4:
.loc 1 24 1
mov.u32 %r4, %r52;
setp.ge.s32 %p3, %r19, %r20;
@%p3 bra BB2_9;
.loc 1 29 1
mul.lo.s32 %r32, %r51, %r28;
mad.lo.s32 %r5, %r4, %r24, %r32;
.loc 1 32 1
add.s32 %r6, %r24, %r5;
add.s32 %r7, %r5, %r28;
shl.b32 %r33, %r24, 1;
add.s32 %r8, %r5, %r33;
mad.lo.s32 %r9, %r24, -2, %r5;
add.s32 %r10, %r5, %r29;
mad.lo.s32 %r11, %r28, -2, %r5;
add.s32 %r12, %r24, %r8;
mad.lo.s32 %r13, %r28, 3, %r5;
mov.u32 %r53, %r19;
BB2_6:
.loc 1 26 1
mov.u32 %r14, %r53;
mov.u32 %r35, %tid.x;
add.s32 %r36, %r35, %r14;
.loc 1 29 1
add.s32 %r15, %r36, %r5;
mul.wide.s32 %rd7, %r15, 8;
add.s64 %rd8, %rd6, %rd7;
.loc 1 32 1
ld.global.f64 %fd5, [%rd8];
ld.global.f64 %fd7, [%rd8+-8];
ld.global.f64 %fd8, [%rd8+8];
add.f64 %fd9, %fd8, %fd7;
add.s32 %r37, %r6, %r36;
mul.wide.s32 %rd9, %r37, 8;
add.s64 %rd10, %rd6, %rd9;
.loc 1 32 1
ld.global.f64 %fd10, [%rd10];
add.f64 %fd11, %fd9, %fd10;
.loc 1 22 1
neg.s32 %r39, %r33;
shl.b32 %r40, %r39, 3;
cvt.s64.s32 %rd11, %r40;
add.s64 %rd12, %rd10, %rd11;
.loc 1 32 1
ld.global.f64 %fd12, [%rd12];
add.f64 %fd13, %fd11, %fd12;
add.s32 %r41, %r7, %r36;
mul.wide.s32 %rd13, %r41, 8;
add.s64 %rd14, %rd6, %rd13;
.loc 1 32 1
ld.global.f64 %fd14, [%rd14];
add.f64 %fd15, %fd13, %fd14;
cvt.s64.s32 %rd15, %r2;
add.s64 %rd16, %rd14, %rd15;
.loc 1 32 1
ld.global.f64 %fd16, [%rd16];
add.f64 %fd17, %fd15, %fd16;
mul.f64 %fd18, %fd2, %fd17;
fma.rn.f64 %fd19, %fd1, %fd5, %fd18;
ld.global.f64 %fd20, [%rd8+-16];
ld.global.f64 %fd21, [%rd8+16];
add.f64 %fd22, %fd21, %fd20;
add.s32 %r42, %r8, %r36;
mul.wide.s32 %rd17, %r42, 8;
add.s64 %rd18, %rd6, %rd17;
.loc 1 32 1
ld.global.f64 %fd23, [%rd18];
add.f64 %fd24, %fd22, %fd23;
add.s32 %r43, %r9, %r36;
mul.wide.s32 %rd19, %r43, 8;
add.s64 %rd20, %rd6, %rd19;
.loc 1 32 1
ld.global.f64 %fd25, [%rd20];
add.f64 %fd26, %fd24, %fd25;
add.s32 %r44, %r10, %r36;
mul.wide.s32 %rd21, %r44, 8;
add.s64 %rd22, %rd6, %rd21;
.loc 1 32 1
ld.global.f64 %fd27, [%rd22];
add.f64 %fd28, %fd26, %fd27;
add.s32 %r45, %r11, %r36;
mul.wide.s32 %rd23, %r45, 8;
add.s64 %rd24, %rd6, %rd23;
.loc 1 32 1
ld.global.f64 %fd29, [%rd24];
add.f64 %fd30, %fd28, %fd29;
fma.rn.f64 %fd31, %fd3, %fd30, %fd19;
ld.global.f64 %fd32, [%rd8+-24];
ld.global.f64 %fd33, [%rd8+24];
add.f64 %fd34, %fd33, %fd32;
add.s32 %r46, %r12, %r36;
mul.wide.s32 %rd25, %r46, 8;
add.s64 %rd26, %rd6, %rd25;
.loc 1 32 1
ld.global.f64 %fd35, [%rd26];
add.f64 %fd36, %fd34, %fd35;
add.s64 %rd27, %rd12, %rd11;
.loc 1 32 1
ld.global.f64 %fd37, [%rd27];
add.f64 %fd38, %fd36, %fd37;
add.s32 %r47, %r13, %r36;
mul.wide.s32 %rd28, %r47, 8;
add.s64 %rd29, %rd6, %rd28;
.loc 1 32 1
ld.global.f64 %fd39, [%rd29];
add.f64 %fd40, %fd38, %fd39;
add.s64 %rd30, %rd16, %rd15;
.loc 1 32 1
ld.global.f64 %fd41, [%rd30];
add.f64 %fd42, %fd40, %fd41;
fma.rn.f64 %fd6, %fd4, %fd42, %fd31;
.loc 1 44 1
setp.ge.s32 %p4, %r36, %r20;
@%p4 bra BB2_8;
mul.wide.s32 %rd33, %r15, 8;
add.s64 %rd34, %rd31, %rd33;
.loc 1 45 1
ld.global.f64 %fd43, [%rd34];
add.f64 %fd44, %fd5, %fd5;
sub.f64 %fd45, %fd44, %fd43;
add.s64 %rd35, %rd32, %rd33;
.loc 1 45 1
ld.global.f64 %fd46, [%rd35];
fma.rn.f64 %fd47, %fd46, %fd6, %fd45;
st.global.f64 [%rd34], %fd47;
BB2_8:
.loc 1 24 19
add.s32 %r16, %r14, 32;
.loc 1 24 1
setp.lt.s32 %p5, %r16, %r20;
mov.u32 %r53, %r16;
@%p5 bra BB2_6;
BB2_9:
.loc 1 23 18
add.s32 %r17, %r4, 1;
.loc 1 23 1
setp.lt.s32 %p6, %r17, %r22;
mov.u32 %r52, %r17;
@%p6 bra BB2_4;
BB2_10:
.loc 1 22 18
add.s32 %r51, %r51, 1;
.loc 1 59 1
add.s32 %r49, %r23, %r26;
add.s32 %r50, %r49, 1;
.loc 1 22 1
setp.lt.s32 %p7, %r51, %r50;
@%p7 bra BB2_2;
BB2_11:
.loc 1 61 2
ret;
}

View File

@@ -1,224 +0,0 @@
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
.version 3.2
.target sm_35
.address_size 64
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
ret;
}
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
.param .b64 cudaFuncGetAttributes_param_0,
.param .b64 cudaFuncGetAttributes_param_1
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
ret;
}
.visible .entry stencil_step_task(
.param .u32 stencil_step_task_param_0,
.param .u32 stencil_step_task_param_1,
.param .u32 stencil_step_task_param_2,
.param .u32 stencil_step_task_param_3,
.param .u32 stencil_step_task_param_4,
.param .u32 stencil_step_task_param_5,
.param .u32 stencil_step_task_param_6,
.param .u32 stencil_step_task_param_7,
.param .u64 stencil_step_task_param_8,
.param .u64 stencil_step_task_param_9,
.param .u64 stencil_step_task_param_10,
.param .u64 stencil_step_task_param_11
)
{
.reg .pred %p<8>;
.reg .s32 %r<54>;
.reg .s64 %rd<36>;
.reg .f64 %fd<48>;
ld.param.u32 %r19, [stencil_step_task_param_0];
ld.param.u32 %r20, [stencil_step_task_param_1];
ld.param.u32 %r21, [stencil_step_task_param_2];
ld.param.u32 %r22, [stencil_step_task_param_3];
ld.param.u32 %r23, [stencil_step_task_param_4];
ld.param.u32 %r24, [stencil_step_task_param_5];
ld.param.u32 %r25, [stencil_step_task_param_6];
ld.param.u64 %rd4, [stencil_step_task_param_8];
ld.param.u64 %rd1, [stencil_step_task_param_9];
ld.param.u64 %rd2, [stencil_step_task_param_10];
ld.param.u64 %rd3, [stencil_step_task_param_11];
cvta.to.global.u64 %rd5, %rd4;
mov.u32 %r26, %ctaid.x;
add.s32 %r51, %r26, %r23;
add.s32 %r27, %r51, 1;
ld.global.f64 %fd1, [%rd5];
ld.global.f64 %fd2, [%rd5+8];
ld.global.f64 %fd3, [%rd5+16];
ld.global.f64 %fd4, [%rd5+24];
setp.ge.s32 %p1, %r51, %r27;
@%p1 bra BB2_11;
mul.lo.s32 %r28, %r25, %r24;
shl.b32 %r29, %r28, 1;
neg.s32 %r30, %r29;
shl.b32 %r2, %r30, 3;
cvta.to.global.u64 %rd6, %rd2;
cvta.to.global.u64 %rd31, %rd3;
cvta.to.global.u64 %rd32, %rd1;
BB2_2:
setp.ge.s32 %p2, %r21, %r22;
@%p2 bra BB2_10;
mov.u32 %r52, %r21;
BB2_4:
mov.u32 %r4, %r52;
setp.ge.s32 %p3, %r19, %r20;
@%p3 bra BB2_9;
mul.lo.s32 %r32, %r51, %r28;
mad.lo.s32 %r5, %r4, %r24, %r32;
add.s32 %r6, %r24, %r5;
add.s32 %r7, %r5, %r28;
shl.b32 %r33, %r24, 1;
add.s32 %r8, %r5, %r33;
mad.lo.s32 %r9, %r24, -2, %r5;
add.s32 %r10, %r5, %r29;
mad.lo.s32 %r11, %r28, -2, %r5;
add.s32 %r12, %r24, %r8;
mad.lo.s32 %r13, %r28, 3, %r5;
mov.u32 %r53, %r19;
BB2_6:
mov.u32 %r14, %r53;
mov.u32 %r35, %tid.x;
add.s32 %r36, %r35, %r14;
add.s32 %r15, %r36, %r5;
mul.wide.s32 %rd7, %r15, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd5, [%rd8];
ld.global.f64 %fd7, [%rd8+-8];
ld.global.f64 %fd8, [%rd8+8];
add.f64 %fd9, %fd8, %fd7;
add.s32 %r37, %r6, %r36;
mul.wide.s32 %rd9, %r37, 8;
add.s64 %rd10, %rd6, %rd9;
ld.global.f64 %fd10, [%rd10];
add.f64 %fd11, %fd9, %fd10;
neg.s32 %r39, %r33;
shl.b32 %r40, %r39, 3;
cvt.s64.s32 %rd11, %r40;
add.s64 %rd12, %rd10, %rd11;
ld.global.f64 %fd12, [%rd12];
add.f64 %fd13, %fd11, %fd12;
add.s32 %r41, %r7, %r36;
mul.wide.s32 %rd13, %r41, 8;
add.s64 %rd14, %rd6, %rd13;
ld.global.f64 %fd14, [%rd14];
add.f64 %fd15, %fd13, %fd14;
cvt.s64.s32 %rd15, %r2;
add.s64 %rd16, %rd14, %rd15;
ld.global.f64 %fd16, [%rd16];
add.f64 %fd17, %fd15, %fd16;
mul.f64 %fd18, %fd2, %fd17;
fma.rn.f64 %fd19, %fd1, %fd5, %fd18;
ld.global.f64 %fd20, [%rd8+-16];
ld.global.f64 %fd21, [%rd8+16];
add.f64 %fd22, %fd21, %fd20;
add.s32 %r42, %r8, %r36;
mul.wide.s32 %rd17, %r42, 8;
add.s64 %rd18, %rd6, %rd17;
ld.global.f64 %fd23, [%rd18];
add.f64 %fd24, %fd22, %fd23;
add.s32 %r43, %r9, %r36;
mul.wide.s32 %rd19, %r43, 8;
add.s64 %rd20, %rd6, %rd19;
ld.global.f64 %fd25, [%rd20];
add.f64 %fd26, %fd24, %fd25;
add.s32 %r44, %r10, %r36;
mul.wide.s32 %rd21, %r44, 8;
add.s64 %rd22, %rd6, %rd21;
ld.global.f64 %fd27, [%rd22];
add.f64 %fd28, %fd26, %fd27;
add.s32 %r45, %r11, %r36;
mul.wide.s32 %rd23, %r45, 8;
add.s64 %rd24, %rd6, %rd23;
ld.global.f64 %fd29, [%rd24];
add.f64 %fd30, %fd28, %fd29;
fma.rn.f64 %fd31, %fd3, %fd30, %fd19;
ld.global.f64 %fd32, [%rd8+-24];
ld.global.f64 %fd33, [%rd8+24];
add.f64 %fd34, %fd33, %fd32;
add.s32 %r46, %r12, %r36;
mul.wide.s32 %rd25, %r46, 8;
add.s64 %rd26, %rd6, %rd25;
ld.global.f64 %fd35, [%rd26];
add.f64 %fd36, %fd34, %fd35;
add.s64 %rd27, %rd12, %rd11;
ld.global.f64 %fd37, [%rd27];
add.f64 %fd38, %fd36, %fd37;
add.s32 %r47, %r13, %r36;
mul.wide.s32 %rd28, %r47, 8;
add.s64 %rd29, %rd6, %rd28;
ld.global.f64 %fd39, [%rd29];
add.f64 %fd40, %fd38, %fd39;
add.s64 %rd30, %rd16, %rd15;
ld.global.f64 %fd41, [%rd30];
add.f64 %fd42, %fd40, %fd41;
fma.rn.f64 %fd6, %fd4, %fd42, %fd31;
setp.ge.s32 %p4, %r36, %r20;
@%p4 bra BB2_8;
mul.wide.s32 %rd33, %r15, 8;
add.s64 %rd34, %rd31, %rd33;
ld.global.f64 %fd43, [%rd34];
add.f64 %fd44, %fd5, %fd5;
sub.f64 %fd45, %fd44, %fd43;
add.s64 %rd35, %rd32, %rd33;
ld.global.f64 %fd46, [%rd35];
fma.rn.f64 %fd47, %fd46, %fd6, %fd45;
st.global.f64 [%rd34], %fd47;
BB2_8:
add.s32 %r16, %r14, 32;
setp.lt.s32 %p5, %r16, %r20;
mov.u32 %r53, %r16;
@%p5 bra BB2_6;
BB2_9:
add.s32 %r17, %r4, 1;
setp.lt.s32 %p6, %r17, %r22;
mov.u32 %r52, %r17;
@%p6 bra BB2_4;
BB2_10:
add.s32 %r51, %r51, 1;
add.s32 %r49, %r23, %r26;
add.s32 %r50, %r49, 1;
setp.lt.s32 %p7, %r51, %r50;
@%p7 bra BB2_2;
BB2_11:
ret;
}

View File

@@ -1,247 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_20, texmode_independent
.address_size 64
// .globl stencil_step_task
// @stencil_step_task
.entry stencil_step_task(
.param .u32 stencil_step_task_param_0,
.param .u32 stencil_step_task_param_1,
.param .u32 stencil_step_task_param_2,
.param .u32 stencil_step_task_param_3,
.param .u32 stencil_step_task_param_4,
.param .u32 stencil_step_task_param_5,
.param .u32 stencil_step_task_param_6,
.param .u32 stencil_step_task_param_7,
.param .u64 .ptr .align 8 stencil_step_task_param_8,
.param .u64 .ptr .align 8 stencil_step_task_param_9,
.param .u64 .ptr .align 8 stencil_step_task_param_10,
.param .u64 .ptr .align 8 stencil_step_task_param_11
)
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
mov.u32 %r12, %ctaid.x;
ld.param.u32 %r13, [stencil_step_task_param_4];
add.s32 %r16, %r12, %r13;
add.s32 %r0, %r16, 1;
setp.ge.s32 %p0, %r16, %r0;
@%p0 bra BB0_11;
// BB#1: // %for_test28.i.preheader.lr.ph
ld.param.u32 %r0, [stencil_step_task_param_0];
ld.param.u32 %r1, [stencil_step_task_param_1];
ld.param.u32 %r2, [stencil_step_task_param_2];
ld.param.u32 %r3, [stencil_step_task_param_3];
ld.param.u32 %r4, [stencil_step_task_param_5];
ld.param.u32 %r5, [stencil_step_task_param_6];
mul.lo.s32 %r5, %r5, %r4;
ld.param.u64 %rl3, [stencil_step_task_param_8];
ld.f64 %fl0, [%rl3];
ld.f64 %fl1, [%rl3+8];
ld.param.u64 %rl0, [stencil_step_task_param_9];
ld.f64 %fl2, [%rl3+16];
ld.param.u64 %rl1, [stencil_step_task_param_10];
ld.param.u64 %rl2, [stencil_step_task_param_11];
ld.f64 %fl3, [%rl3+24];
shl.b32 %r6, %r4, 1;
mul.lo.s32 %r7, %r4, 3;
mul.lo.s32 %r8, %r4, -3;
shl.b32 %r9, %r5, 1;
mul.lo.s32 %r10, %r5, 3;
mul.lo.s32 %r11, %r5, -3;
add.s32 %r12, %r12, %r13;
neg.s32 %r13, %r9;
neg.s32 %r14, %r6;
mov.u32 %r32, WARP_SZ;
BB0_2: // %for_test28.i.preheader
// =>This Loop Header: Depth=1
// Child Loop BB0_9 Depth 2
// Child Loop BB0_5 Depth 3
mov.u32 %r15, %r16;
setp.ge.s32 %p0, %r2, %r3;
@%p0 bra BB0_10;
// BB#3: // %for_test35.i.preheader.lr.ph
// in Loop: Header=BB0_2 Depth=1
setp.lt.s32 %p0, %r0, %r1;
@%p0 bra BB0_4;
bra.uni BB0_10;
BB0_4: // in Loop: Header=BB0_2 Depth=1
mul.lo.s32 %r16, %r15, %r5;
mov.u32 %r17, %r2;
BB0_9: // %for_loop37.i.lr.ph.us
// Parent Loop BB0_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_5 Depth 3
mad.lo.s32 %r18, %r17, %r4, %r16;
add.s32 %r19, %r18, %r4;
add.s32 %r20, %r18, %r6;
sub.s32 %r21, %r18, %r4;
add.s32 %r22, %r18, %r7;
add.s32 %r23, %r18, %r14;
add.s32 %r24, %r18, %r5;
add.s32 %r25, %r18, %r8;
add.s32 %r26, %r18, %r9;
sub.s32 %r27, %r18, %r5;
add.s32 %r28, %r18, %r10;
add.s32 %r29, %r18, %r13;
add.s32 %r30, %r18, %r11;
mov.u32 %r31, %r0;
BB0_5: // %for_loop37.i.us
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_9 Depth=2
// => This Inner Loop Header: Depth=3
mov.u32 %r33, %tid.x;
add.s32 %r34, %r32, -1;
and.b32 %r33, %r34, %r33;
add.s32 %r33, %r33, %r31;
setp.ge.s32 %p0, %r33, %r1;
@%p0 bra BB0_7;
// BB#6: // %pl_dolane.i.us
// in Loop: Header=BB0_5 Depth=3
add.s32 %r34, %r18, %r33;
shl.b32 %r34, %r34, 3;
add.s32 %r35, %r34, -8;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl4, [%rl3];
add.s32 %r35, %r34, 8;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl5, [%rl3];
add.s32 %r35, %r34, -16;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl6, [%rl3];
add.s32 %r35, %r34, 16;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl9, [%rl3];
add.s32 %r35, %r19, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl8, [%rl3];
add.s32 %r35, %r34, -24;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl7, [%rl3];
add.s32 %r35, %r34, 24;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl10, [%rl3];
add.s32 %r35, %r20, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl13, [%rl3];
add.s32 %r35, %r21, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl12, [%rl3];
add.s32 %r35, %r22, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl11, [%rl3];
add.s32 %r35, %r23, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl16, [%rl3];
add.s32 %r35, %r24, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl15, [%rl3];
add.s32 %r35, %r25, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl14, [%rl3];
add.s32 %r35, %r26, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl19, [%rl3];
add.s32 %r35, %r27, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl18, [%rl3];
add.s32 %r35, %r28, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl17, [%rl3];
add.s32 %r35, %r29, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl24, [%rl3];
cvt.s64.s32 %rl4, %r34;
add.s64 %rl3, %rl4, %rl1;
ld.f64 %fl21, [%rl3];
add.s32 %r33, %r30, %r33;
shl.b32 %r33, %r33, 3;
cvt.s64.s32 %rl3, %r33;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl20, [%rl3];
add.s64 %rl3, %rl4, %rl2;
ld.f64 %fl23, [%rl3];
add.s64 %rl4, %rl4, %rl0;
ld.f64 %fl22, [%rl4];
add.f64 %fl25, %fl21, %fl21;
sub.f64 %fl23, %fl25, %fl23;
add.f64 %fl6, %fl6, %fl9;
add.f64 %fl6, %fl6, %fl13;
add.f64 %fl6, %fl6, %fl16;
add.f64 %fl6, %fl6, %fl19;
add.f64 %fl6, %fl6, %fl24;
add.f64 %fl4, %fl4, %fl5;
add.f64 %fl4, %fl4, %fl8;
add.f64 %fl4, %fl4, %fl12;
add.f64 %fl4, %fl4, %fl15;
add.f64 %fl4, %fl4, %fl18;
mul.f64 %fl5, %fl0, %fl21;
fma.rn.f64 %fl4, %fl1, %fl4, %fl5;
fma.rn.f64 %fl4, %fl2, %fl6, %fl4;
add.f64 %fl5, %fl7, %fl10;
add.f64 %fl5, %fl5, %fl11;
add.f64 %fl5, %fl5, %fl14;
add.f64 %fl5, %fl5, %fl17;
add.f64 %fl5, %fl5, %fl20;
fma.rn.f64 %fl4, %fl3, %fl5, %fl4;
fma.rn.f64 %fl4, %fl4, %fl22, %fl23;
st.f64 [%rl3], %fl4;
BB0_7: // %safe_if_after_true.i.us
// in Loop: Header=BB0_5 Depth=3
add.s32 %r31, %r32, %r31;
setp.lt.s32 %p0, %r31, %r1;
@%p0 bra BB0_5;
// BB#8: // %for_exit38.i.us
// in Loop: Header=BB0_9 Depth=2
add.s32 %r17, %r17, 1;
setp.eq.s32 %p0, %r17, %r3;
@%p0 bra BB0_10;
bra.uni BB0_9;
BB0_10: // %for_exit31.i
// in Loop: Header=BB0_2 Depth=1
add.s32 %r16, %r15, 1;
setp.ne.s32 %p0, %r15, %r12;
@%p0 bra BB0_2;
BB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit
ret;
}

View File

@@ -1,317 +0,0 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <stdio.h>
#include <algorithm>
#include <math.h>
#include "../timing.h"
#include "stencil_ispc.h"
using namespace ispc;
#include <cassert>
#include <iostream>
#include <cuda.h>
#include "drvapi_error_string.h"
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
void __checkCudaErrors(CUresult err, const char *file, const int line) {
if(CUDA_SUCCESS != err) {
std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
<< getCudaDrvErrorString(err) << "\" from file <" << file
<< ", line " << line << "\n";
exit(-1);
}
}
/**********************/
/* Basic CUDA Driver API helpers */
CUcontext context;
void createContext(const int deviceId = 0)
{
CUdevice device;
int devCount;
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGetCount(&devCount));
assert(devCount > 0);
checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
char name[128];
checkCudaErrors(cuDeviceGetName(name, 128, device));
std::cout << "Using CUDA Device [0]: " << name << "\n";
int devMajor, devMinor;
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
std::cout << "Device Compute Capability: "
<< devMajor << "." << devMinor << "\n";
if (devMajor < 2) {
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
exit(1);
}
// Create driver context
checkCudaErrors(cuCtxCreate(&context, 0, device));
}
void destroyContext()
{
checkCudaErrors(cuCtxDestroy(context));
}
CUmodule loadModule(const char * module)
{
CUmodule cudaModule;
checkCudaErrors(cuModuleLoadData(&cudaModule, module));
return cudaModule;
}
void unloadModule(CUmodule &cudaModule)
{
checkCudaErrors(cuModuleUnload(cudaModule));
}
CUfunction getFunction(CUmodule &cudaModule, const char * function)
{
CUfunction cudaFunction;
checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
return cudaFunction;
}
CUdeviceptr deviceMalloc(const size_t size)
{
CUdeviceptr d_buf;
checkCudaErrors(cuMemAlloc(&d_buf, size));
return d_buf;
}
void deviceFree(CUdeviceptr d_buf)
{
checkCudaErrors(cuMemFree(d_buf));
}
void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
{
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
}
void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
{
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
}
#define deviceLaunch(func,nbx,nby,nbz,params) \
checkCudaErrors( \
cuLaunchKernel( \
(func), \
(nbx), (nby), (nbz), \
32, 1, 1, \
0, NULL, (params), NULL \
));
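// Illustrative only (added; not part of the original example): a minimal sketch of how
// these helpers might be combined, assuming a hypothetical PTX image `ptxImage`, a
// kernel named "my_kernel" that takes a single device-pointer argument, and host data
// `h_buf` of `n` doubles:
//
//   CUmodule    module = loadModule(ptxImage);
//   CUfunction  kernel = getFunction(module, "my_kernel");
//   CUdeviceptr d_buf  = deviceMalloc(n * sizeof(double));
//   memcpyH2D(d_buf, h_buf, n * sizeof(double));
//   void *args[] = { &d_buf };
//   deviceLaunch(kernel, numBlocks, 1, 1, args);
//   memcpyD2H(h_buf, d_buf, n * sizeof(double));
//   deviceFree(d_buf);
//   unloadModule(module);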
typedef CUdeviceptr devicePtr;
/**************/
extern "C"
{
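// Added descriptive note: these appear to be the host-side hooks invoked when ISPC
// tasks are offloaded to the GPU. CUDALaunch loads the supplied PTX module, resolves
// the kernel by name, and launches it on a countx x county x countz grid of 32-thread
// blocks; CUDASync/ISPCSync synchronize on the default stream; CUDAAlloc/CUDAFree are
// no-ops in this example.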
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
{
return NULL;
}
void CUDALaunch(
void **handlePtr,
const char * module_name,
const char * module,
const char * func_name,
void **func_args,
int countx, int county, int countz)
{
assert(module_name != NULL);
assert(module != NULL);
assert(func_name != NULL);
assert(func_args != NULL);
CUmodule cudaModule = loadModule(module);
CUfunction cudaFunction = getFunction(cudaModule, func_name);
deviceLaunch(cudaFunction, countx, county, countz, func_args);
unloadModule(cudaModule);
}
void CUDASync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void ISPCSync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void CUDAFree(void *handle)
{
}
}
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
int y0, int y1, int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[4],
const double vsq[],
double Aeven[], double Aodd[]);
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
int offset = 0;
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
A[1][offset] = 0;
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
}
}
int main() {
int Nx = 256, Ny = 256, Nz = 256;
int width = 4;
double *Aserial[2], *Aispc[2];
Aserial[0] = new double [Nx * Ny * Nz];
Aserial[1] = new double [Nx * Ny * Nz];
Aispc[0] = new double [Nx * Ny * Nz];
Aispc[1] = new double [Nx * Ny * Nz];
double *vsq = new double [Nx * Ny * Nz];
double coeff[4] = { 0.5, -.25, .125, -.0625 };
/*******************/
createContext();
/*******************/
const size_t bufsize = sizeof(double)*Nx*Ny*Nz;
devicePtr d_Aispc0 = deviceMalloc(bufsize);
devicePtr d_Aispc1 = deviceMalloc(bufsize);
devicePtr d_vsq = deviceMalloc(bufsize);
devicePtr d_coeff = deviceMalloc(4*sizeof(double));
InitData(Nx, Ny, Nz, Aispc, vsq);
//
// Run the stencil computation using the ispc implementation on one core;
// report the minimum time of three runs.
//
double minTimeISPC = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aispc[0], Aispc[1]);
double dt = get_elapsed_mcycles();
minTimeISPC = std::min(minTimeISPC, dt);
}
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
InitData(Nx, Ny, Nz, Aispc, vsq);
memcpyH2D(d_Aispc0, Aispc[0], bufsize);
memcpyH2D(d_Aispc1, Aispc[1], bufsize);
memcpyH2D(d_vsq, vsq, bufsize);
memcpyH2D(d_coeff, coeff, 4*sizeof(double));
//
// Run the stencil computation using the ispc implementation with tasks,
// operating on the device buffers; report the minimum time of three runs.
//
double minTimeISPCTasks = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq,
(double*)d_Aispc0, (double*)d_Aispc1);
double dt = get_elapsed_mcycles();
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
}
memcpyD2H(Aispc[1], d_Aispc1, bufsize);
//memcpyD2H(Aispc[1], d_vsq, bufsize);
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
InitData(Nx, Ny, Nz, Aserial, vsq);
//
// And run the serial implementation 3 times, again reporting the
// minimum time.
//
double minTimeSerial = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aserial[0], Aserial[1]);
double dt = get_elapsed_mcycles();
minTimeSerial = std::min(minTimeSerial, dt);
}
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
// Check for agreement
int offset = 0;
int nerr = 0;
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
double error = fabs((Aserial[1][offset] - Aispc[1][offset]) /
Aserial[1][offset]);
if (error > 1e-3)
{
if (nerr < 100)
printf("Error @ (%d,%d,%d): ispc = %g, serial = %g error= %g\n",
x, y, z, Aispc[1][offset], Aserial[1][offset], error);
nerr++;
}
}
fprintf(stderr, " nerr= %d frac= %g \n", nerr, 1.0*nerr/(1.0*Nx*Ny*Nz));
/*******************/
destroyContext();
/*******************/
return 0;
}

Binary file not shown.

View File

@@ -1,87 +0,0 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
static void
stencil_step(int x0, int x1,
int y0, int y1,
int z0, int z1,
int Nx, int Ny, int Nz,
const float coef[4], const float vsq[],
const float Ain[], float Aout[]) {
int Nxy = Nx * Ny;
#pragma omp parallel for
for (int z = z0; z < z1; ++z) {
for (int y = y0; y < y1; ++y) {
for (int x = x0; x < x1; ++x) {
int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
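// Descriptive comment (added): div below is a centered finite-difference term built
// from the center point and the +/-1, +/-2, +/-3 neighbors along each axis, weighted
// by coef[0..3]; the update then advances the field with
// A_next = 2*A_cur - A_next + vsq*div.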
float div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
A_cur(0, 0, +3) + A_cur(0, 0, -3));
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
vsq[index] * div;
}
}
}
}
void loop_stencil_parallel(int t0, int t1,
int x0, int x1,
int y0, int y1,
int z0, int z1,
int Nx, int Ny, int Nz,
const float coef[4],
const float vsq[],
float Aeven[], float Aodd[])
{
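// Descriptive comment (added): Aeven and Aodd act as ping-pong buffers across time
// steps; even steps read Aeven and write Aodd, odd steps read Aodd and write Aeven.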
for (int t = t0; t < t1; ++t) {
if ((t & 1) == 0)
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
Aeven, Aodd);
else
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
Aodd, Aeven);
}
}