White space and copyright fixes in examples.

2014-07-08 20:08:34 +04:00
parent 8894156df5
commit d8e2fdf913
30 changed files with 563 additions and 563 deletions
--- a/examples/portable/aobench/ao.cpp
+++ b/examples/portable/aobench/ao.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #ifdef _MSC_VER
@@ -141,12 +141,12 @@ int main(int argc, char **argv)
    }

    // Report results and save image
-    printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n", 
+    printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
           minTimeISPCTasks, width, height);
-    savePPM("ao-ispc-tasks.ppm", width, height); 
+    savePPM("ao-ispc-tasks.ppm", width, height);

    delete img;
    delete fimg;
-        
+
    return 0;
 }
--- a/examples/portable/aobench/ao.cu
+++ b/examples/portable/aobench/ao.cu
@@ -1,6 +1,6 @@
 // -*- mode: c++ -*-
 /*
-   Copyright (c) 2010-2011, Intel Corporation
+   Copyright (c) 2010-2014, Intel Corporation
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,7 @@ met:
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /*
   Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
@@ -37,7 +37,7 @@ met:

 #include "cuda_helpers.cuh"

-#define NAO_SAMPLES		8
+#define NAO_SAMPLES        8
 //#define M_PI 3.1415926535f

 #define vec Float3
@@ -109,7 +109,7 @@ static inline unsigned int random(RNGState * state)

    b  = ((state->z1 << 6) ^ state->z1) >> 13;
    state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
-    b  = ((state->z2 << 2) ^ state->z2) >> 27; 
+    b  = ((state->z2 << 2) ^ state->z2) >> 27;
    state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
    b  = ((state->z3 << 13) ^ state->z3) >> 21;
    state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
@@ -128,7 +128,7 @@ static inline float frandom(RNGState * state)
 }

 __device__
-static inline void seed_rng(RNGState * state, 
+static inline void seed_rng(RNGState * state,
                            unsigned int seed) {
    state->z1 = seed;
    state->z2 = seed ^ 0xbeeff00d;
@@ -143,7 +143,7 @@ struct Isect {
  float      t;
  vec        p;
  vec        n;
-  int        hit; 
+  int        hit;
 };

 struct Sphere {
@@ -190,7 +190,7 @@ ray_plane_intersect(Isect &isect,const  Ray &ray, const  Plane &plane) {
  float v = dot(ray.dir, plane.n);

 #if 0
-  if (abs(v) < 1.0f-17) 
+  if (abs(v) < 1.0f-17)
    return;
  else {
    float t = -(dot(ray.org, plane.n) + d) / v;
@@ -238,7 +238,7 @@ ray_sphere_intersect(Isect &isect,const  Ray &ray, const Sphere &sphere) {
    }
  }
 #else
-    if (D <= 0.0f) 
+    if (D <= 0.0f)
      return;

    float t = -B - sqrt(D);
@@ -319,8 +319,8 @@ ambient_occlusion(Isect &isect,  const Plane &plane, const  Sphere spheres[3],
      occIsect.hit = 0;

      for ( int snum = 0; snum < 3; ++snum)
-        ray_sphere_intersect(occIsect, ray, spheres[snum]); 
-      ray_plane_intersect (occIsect, ray, plane); 
+        ray_sphere_intersect(occIsect, ray, spheres[snum]);
+      ray_plane_intersect (occIsect, ray, plane);

      if (occIsect.hit) occlusion += 1.0f;
    }
@@ -337,10 +337,10 @@ ambient_occlusion(Isect &isect,  const Plane &plane, const  Sphere spheres[3],
 __device__
 static inline void ao_tiles(
     int x0,  int x1,
-     int y0,  int y1, 
+     int y0,  int y1,
     int w,  int h,
-     int nsubsamples, 
-     float image[]) 
+     int nsubsamples,
+     float image[])
 {
  const  Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
  const  Sphere spheres[3] = {
@@ -411,8 +411,8 @@ static inline void ao_tiles(

 extern "C"
 __global__
-void ao_task( int width,  int height, 
-     int nsubsamples,  float image[]) 
+void ao_task( int width,  int height,
+     int nsubsamples,  float image[])
 {
  if (taskIndex0 >= taskCount0) return;
  if (taskIndex1 >= taskCount1) return;
@@ -428,8 +428,8 @@ void ao_task( int width,  int height,
 extern "C"
 __global__
 void ao_ispc_tasks___export(
-    int w, int h, int nsubsamples, 
-    float image[]) 
+    int w, int h, int nsubsamples,
+    float image[])
 {
  const int ntilex = (w+TILEX-1)/TILEX;
  const int ntiley = (h+TILEY-1)/TILEY;
@@ -439,8 +439,8 @@ void ao_ispc_tasks___export(

 extern "C"
 __host__ void ao_ispc_tasks(
-    int w, int h, int nsubsamples, 
-    float image[]) 
+    int w, int h, int nsubsamples,
+    float image[])
 {
  ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image);
  cudaDeviceSynchronize();
--- a/examples/portable/aobench/ao.ispc
+++ b/examples/portable/aobench/ao.ispc
@@ -1,6 +1,6 @@
 // -*- mode: c++ -*-
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -29,13 +29,13 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /*
  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
 */

-#define NAO_SAMPLES		8
+#define NAO_SAMPLES        8
 #define M_PI 3.1415926535f

 typedef float<3> vec;
@@ -50,7 +50,7 @@ struct Isect {
    float      t;
    vec        p;
    vec        n;
-    int        hit; 
+    int        hit;
 };

 struct Sphere {
@@ -94,7 +94,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, const Plane &plane) {
    float v = dot(ray.dir, plane.n);

 #if 0
-    cif (abs(v) < 1.0e-17) 
+    cif (abs(v) < 1.0e-17)
        return;
    else {
        float t = -(dot(ray.org, plane.n) + d) / v;
@@ -141,7 +141,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, const Sphere &sphere) {
        }
    }
 #else
-    cif (D <=0.0f) 
+    cif (D <=0.0f)
      return;

    float t = -B - sqrt(D);
@@ -220,8 +220,8 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
            occIsect.hit = 0;

            for (uniform int snum = 0; snum < 3; ++snum)
-                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
-            ray_plane_intersect (occIsect, ray, plane); 
+                ray_sphere_intersect(occIsect, ray, spheres[snum]);
+            ray_plane_intersect (occIsect, ray, plane);

            if (occIsect.hit) occlusion += 1.0;
        }
@@ -233,10 +233,10 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],

 static  inline void ao_tiles(
    uniform int x0, uniform int x1,
-    uniform int y0, uniform int y1, 
+    uniform int y0, uniform int y1,
    uniform int w, uniform int h,
-    uniform int nsubsamples, 
-    uniform float image[]) 
+    uniform int nsubsamples,
+    uniform float image[])
 {
  const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
  const Sphere spheres[3] = {
@@ -306,7 +306,7 @@ static  inline void ao_tiles(
 #define TILEX max(64,programCount*2)
 #define TILEY 4

-export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, 
+export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
  const uniform int x0 = 0;
  const uniform int x1 = w;
@@ -315,8 +315,8 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
  ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image);
 }

-void task ao_task(uniform int width, uniform int height, 
-    uniform int nsubsamples, uniform float image[]) 
+void task ao_task(uniform int width, uniform int height,
+    uniform int nsubsamples, uniform float image[])
 {
  if (taskIndex0 >= taskCount0) return;
  if (taskIndex1 >= taskCount1) return;
@@ -330,8 +330,8 @@ void task ao_task(uniform int width, uniform int height,
 }


-export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
-    uniform float image[]) 
+export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
+    uniform float image[])
 {
  const uniform int ntilex = (w+TILEX-1)/TILEX;
  const uniform int ntiley = (h+TILEY-1)/TILEY;
--- a/examples/portable/deferred/common.cpp
+++ b/examples/portable/deferred/common.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #ifdef _MSC_VER
@@ -145,13 +145,13 @@ CreateInputDataFromFile(const char *path) {
    }

    // Load data chunk and update pointers
-    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize, 
+    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
                                             ALIGNMENT_BYTES);
    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
        fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
        return NULL;
    }
-    
+
    input->arrays.zBuffer =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
    input->arrays.normalEncoded_x =
@@ -199,21 +199,21 @@ void WriteFrame(const char *filename, const InputData *input,
                const Framebuffer &framebuffer) {
    // Deswizzle and copy to RGBA output
    // Doesn't need to be fast... only happens once
-    size_t imageBytes = 3 * input->header.framebufferWidth * 
+    size_t imageBytes = 3 * input->header.framebufferWidth *
        input->header.framebufferHeight;
    uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
    memset(framebufferAOS, 0, imageBytes);

-    for (int i = 0; i < input->header.framebufferWidth * 
+    for (int i = 0; i < input->header.framebufferWidth *
                        input->header.framebufferHeight; ++i) {
        framebufferAOS[3 * i + 0] = framebuffer.r[i];
        framebufferAOS[3 * i + 1] = framebuffer.g[i];
        framebufferAOS[3 * i + 2] = framebuffer.b[i];
    }
-    
+
    // Write out simple PPM file
    FILE *out = fopen(filename, "wb");
-    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth, 
+    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
            input->header.framebufferHeight);
    fwrite(framebufferAOS, imageBytes, 1, out);
    fclose(out);
--- a/examples/portable/deferred/deferred.h
+++ b/examples/portable/deferred/deferred.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #ifndef DEFERRED_H
--- a/examples/portable/deferred/dynamic_c.cpp
+++ b/examples/portable/deferred/dynamic_c.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #include "deferred.h"
@@ -147,7 +147,7 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
        float minZ, maxZ;
        ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
                       tileY * tileHeight, tileY * tileHeight + tileHeight,
-                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, 
+                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
                       cameraNear, cameraFar, &minZ, &maxZ);
        minZArray[tileX] = minZ;
        maxZArray[tileX] = maxZ;
@@ -167,7 +167,7 @@ public:
    {
        mNumTilesX = gBufferWidth / mTileWidth;
        mNumTilesY = gBufferHeight / mTileHeight;
-        
+
        // Allocate arrays
        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
@@ -213,7 +213,7 @@ public:
                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    if (srcX + 1 < srcTilesX) {
-                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + 
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
@@ -243,7 +243,7 @@ public:
            lAlignedFree(mMaxZArrays[i]);
        }
        lAlignedFree(mMinZArrays);
-        lAlignedFree(mMaxZArrays); 
+        lAlignedFree(mMaxZArrays);
    }

    int Levels() const { return mLevels; }
@@ -277,9 +277,9 @@ private:
 static MinMaxZTree *gMinMaxZTree = 0;

 void InitDynamicC(InputData *input) {
-    gMinMaxZTree = 
+    gMinMaxZTree =
        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
-                        input->header.framebufferWidth, 
+                        input->header.framebufferWidth,
                        input->header.framebufferHeight);
 }

@@ -311,7 +311,7 @@ SplitTileMinMax(
 {
    float gBufferScale_x = 0.5f * (float)gBufferWidth;
    float gBufferScale_y = 0.5f * (float)gBufferHeight;
-        
+
    float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
                                   (cameraProj_22 * gBufferScale_y) };
    float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
@@ -319,7 +319,7 @@ SplitTileMinMax(

    for (int i = 0; i < 2; ++i) {
        // Normalize
-        float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
+        float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                 frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
@@ -340,23 +340,23 @@ SplitTileMinMax(
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
-        
+
        // Test lights again against subtile z bounds
        bool inFrustum[4];
        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
-        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && 
+        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
-        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && 
+        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
-        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && 
+        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);

-        float dx = light_positionView_z * frustumPlanes_z[0] + 
+        float dx = light_positionView_z * frustumPlanes_z[0] +
            light_positionView_x * frustumPlanes_xy[0];
        float dy = light_positionView_z * frustumPlanes_z[1] +
            light_positionView_y * frustumPlanes_xy[1];
-        
+
        if (fabsf(dx) > light_attenuationEnd) {
            bool positiveX = dx > 0.0f;
            inFrustum[0] = inFrustum[0] &&  positiveX;    // 00 subtile
@@ -423,13 +423,13 @@ half_to_float_fast(uint16_t h) {
    uint32_t hm = h & (int32_t)0x03FFu;  // Pick off mantissa bits

    // sign
-    uint32_t xs = ((uint32_t) hs) << 16; 
+    uint32_t xs = ((uint32_t) hs) << 16;
    // Exponent: unbias the halfp, then bias the single
-    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127; 
+    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
    // Exponent
    uint32_t xe = (uint32_t) (xes << 23);
    // Mantissa
-    uint32_t xm = ((uint32_t) hm) << 13; 
+    uint32_t xm = ((uint32_t) hm) << 13;

    uint32_t bits = (xs | xe | xm);
    float *fp = reinterpret_cast<float *>(&bits);
@@ -470,13 +470,13 @@ ShadeTileC(
    } else {
        float twoOverGBufferWidth = 2.0f / gBufferWidth;
        float twoOverGBufferHeight = 2.0f / gBufferHeight;
-        
+
        for (int32_t y = tileStartY; y < tileEndY; ++y) {
            float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);

            for (int32_t x = tileStartX; x < tileEndX; ++x) {
                int32_t gBufferOffset = y * gBufferWidth + x;
-                
+
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;
@@ -485,70 +485,70 @@ ShadeTileC(

                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
-                float positionScreen_x = (0.5f + (float)(x)) * 
+                float positionScreen_x = (0.5f + (float)(x)) *
                    twoOverGBufferWidth - 1.0f;

                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
-                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                surface_positionView_x = positionScreen_x * surface_positionView_z /
                    cameraProj_11;
-                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                surface_positionView_y = positionScreen_y * surface_positionView_z /
                    cameraProj_22;
-                
+
                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
-                normalize3(surface_positionView_x, surface_positionView_y, 
+                normalize3(surface_positionView_x, surface_positionView_y,
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);

                // Reconstruct normal from G-buffer
                float surface_normal_x, surface_normal_y, surface_normal_z;
                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
-                    
+
                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrtf(4.0f * f - 1.0f);
-                    
+
                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;

                // Load other G-buffer parameters
-                float surface_specularAmount = 
+                float surface_specularAmount =
                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
-                float surface_specularPower  = 
+                float surface_specularPower  =
                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
-                
+
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
-                for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
                     ++tileLightIndex) {
                    int32_t lightIndex = tileLightIndices[tileLightIndex];
-                                        
+
                    // Gather light data relevant to initial culling
-                    float light_positionView_x = 
+                    float light_positionView_x =
                        inputData.lightPositionView_x[lightIndex];
-                    float light_positionView_y = 
+                    float light_positionView_y =
                        inputData.lightPositionView_y[lightIndex];
-                    float light_positionView_z = 
+                    float light_positionView_z =
                        inputData.lightPositionView_z[lightIndex];
-                    float light_attenuationEnd = 
+                    float light_attenuationEnd =
                        inputData.lightAttenuationEnd[lightIndex];
-                    
+
                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;

                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
-                    
+
                    // Clip at end of attenuation
                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;

-                    if (distanceToLight2 < light_attenutaionEnd2) {                    
+                    if (distanceToLight2 < light_attenutaionEnd2) {
                        float distanceToLight = sqrtf(distanceToLight2);

                        float distanceToLightRcp = 1.f / distanceToLight;
@@ -557,12 +557,12 @@ ShadeTileC(
                        L_z *= distanceToLightRcp;

                        // Start computing brdf
-                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                        float NdotL = dot3(surface_normal_x, surface_normal_y,
                                           surface_normal_z, L_x, L_y, L_z);
-                    
+
                        // Clip back facing
                        if (NdotL > 0.0f) {
-                            float light_attenuationBegin = 
+                            float light_attenuationBegin =
                                inputData.lightAttenuationBegin[lightIndex];

                            // Light distance attenuation (linstep)
@@ -574,19 +574,19 @@ ShadeTileC(
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
-                    
-                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+
+                            float NdotH = dot3(surface_normal_x, surface_normal_y,
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = std::max(NdotH, 0.0f);

                            float specular = powf(NdotH, surface_specularPower);
-                            float specularNorm = (surface_specularPower + 2.0f) * 
+                            float specularNorm = (surface_specularPower + 2.0f) *
                                (1.0f / 8.0f);
-                            float specularContrib = surface_specularAmount * 
+                            float specularContrib = surface_specularAmount *
                                specularNorm * specular;

                            float k = attenuation * NdotL * (1.0f + specularContrib);
-                    
+
                            float light_color_x = inputData.lightColor_x[lightIndex];
                            float light_color_y = inputData.lightColor_y[lightIndex];
                            float light_color_z = inputData.lightColor_z[lightIndex];
@@ -607,7 +607,7 @@ ShadeTileC(
                lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
                lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
                lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
-                
+
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -618,11 +618,11 @@ ShadeTileC(


 void
-ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
-                        int *lightIndices, int numLights, 
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
+                        int *lightIndices, int numLights,
                        Framebuffer *framebuffer) {
    const MinMaxZTree *minMaxZTree = gMinMaxZTree;
-    
+
    // If we few enough lights or this is the base case (last level), shade
    // this full tile directly
    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
@@ -632,18 +632,18 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
        int startY = tileY * height;
        int endX = std::min(input->header.framebufferWidth, startX + width);
        int endY = std::min(input->header.framebufferHeight, startY + height);
-        
+
        // Skip entirely offscreen tiles
        if (endX > startX && endY > startY) {
            ShadeTileC(startX, endX, startY, endY,
                       input->header.framebufferWidth, input->header.framebufferHeight,
                       input->arrays,
-                       input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                       input->header.cameraProj[0][0], input->header.cameraProj[1][1],
                       input->header.cameraProj[2][2], input->header.cameraProj[3][2],
-                       lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                       lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
                       framebuffer->r, framebuffer->g, framebuffer->b);
        }
-    } 
+    }
    else {
        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
        // Move down a level in the tree
@@ -666,9 +666,9 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,

        // NOTE: Order is 00, 10, 01, 11
        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
-        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
                         input->header.cameraFar, input->header.cameraFar};
-        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
                         input->header.cameraNear, input->header.cameraNear};

        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
@@ -688,7 +688,7 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,

        // Cull lights into subtile lists
 #ifdef ISPC_IS_WINDOWS
-        __declspec(align(ALIGNMENT_BYTES)) 
+        __declspec(align(ALIGNMENT_BYTES))
 #endif
            int subtileLightIndices[4][MAX_LIGHTS]
 #ifndef ISPC_IS_WINDOWS
@@ -697,15 +697,15 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
 ;
        int subtileNumLights[4];
        SplitTileMinMax(midX, midY, minZ, maxZ,
-            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.framebufferWidth, input->header.framebufferHeight,
            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
-            lightIndices, numLights, input->arrays.lightPositionView_x, 
-            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            lightIndices, numLights, input->arrays.lightPositionView_x,
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
            input->arrays.lightAttenuationEnd,
            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
-        
+
        // Recurse into subtiles
-        ShadeDynamicTileRecurse(input, level, tileX    , tileY, 
+        ShadeDynamicTileRecurse(input, level, tileX    , tileY,
                                subtileLightIndices[0], subtileNumLights[0],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
@@ -744,7 +744,7 @@ IntersectLightsWithTileMinMax(
 {
    float gBufferScale_x = 0.5f * (float)gBufferWidth;
    float gBufferScale_y = 0.5f * (float)gBufferHeight;
-        
+
    float frustumPlanes_xy[4];
    float frustumPlanes_z[4];

@@ -753,14 +753,14 @@ IntersectLightsWithTileMinMax(
                                    (cameraProj_11 * gBufferScale_x),
                                    (cameraProj_22 * gBufferScale_y),
                                    -(cameraProj_22 * gBufferScale_y) };
-    
+
    float frustumPlanes_z_v[4] = {  tileEndX - gBufferScale_x,
                                    -tileStartX + gBufferScale_x,
                                    tileEndY - gBufferScale_y,
                                    -tileStartY + gBufferScale_y };

    for (int i = 0; i < 4; ++i) {
-        float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] + 
+        float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] +
                                 frustumPlanes_z_v[i] * frustumPlanes_z_v[i]);
        frustumPlanes_xy_v[i] *= norm;
        frustumPlanes_z_v[i] *= norm;
@@ -781,29 +781,29 @@ IntersectLightsWithTileMinMax(

        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-        
-        if (!inFrustum) 
+
+        if (!inFrustum)
            continue;

        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];

-        d = light_positionView_z * frustumPlanes_z[0] + 
+        d = light_positionView_z * frustumPlanes_z[0] +
            light_positionView_x * frustumPlanes_xy[0];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-        d = light_positionView_z * frustumPlanes_z[1] + 
+        d = light_positionView_z * frustumPlanes_z[1] +
            light_positionView_x * frustumPlanes_xy[1];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-        d = light_positionView_z * frustumPlanes_z[2] + 
+        d = light_positionView_z * frustumPlanes_z[2] +
            light_positionView_y * frustumPlanes_xy[2];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-        d = light_positionView_z * frustumPlanes_z[3] + 
+        d = light_positionView_z * frustumPlanes_z[3] +
            light_positionView_y * frustumPlanes_xy[3];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-        
+
        // Pack and store intersecting lights
        if (inFrustum)
            tileLightIndices[tileNumLights++] = lightIndex;
@@ -831,7 +831,7 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,

    // This is a root tile, so first do a full 6-plane cull
 #ifdef ISPC_IS_WINDOWS
-    __declspec(align(ALIGNMENT_BYTES)) 
+    __declspec(align(ALIGNMENT_BYTES))
 #endif
        int lightIndices[MAX_LIGHTS]
 #ifndef ISPC_IS_WINDOWS
@@ -842,12 +842,12 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
        startX, endX, startY, endY,    minZ, maxZ,
        input->header.framebufferWidth, input->header.framebufferHeight,
        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
-        MAX_LIGHTS, input->arrays.lightPositionView_x, 
-        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+        MAX_LIGHTS, input->arrays.lightPositionView_x,
+        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
        input->arrays.lightAttenuationEnd, lightIndices);

    // Now kick off the recursive process for this tile
-    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, 
+    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
                            numLights, framebuffer);
 }

@@ -856,10 +856,10 @@ void
 DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
 {
    MinMaxZTree *minMaxZTree = gMinMaxZTree;
-        
+
    // Update min/max Z tree
    minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
-        input->header.cameraProj[2][2], input->header.cameraProj[3][2], 
+        input->header.cameraProj[2][2], input->header.cameraProj[3][2],
        input->header.cameraNear, input->header.cameraFar);

    int rootLevel = minMaxZTree->Levels() - 1;
--- a/examples/portable/deferred/dynamic_cilk.cpp
+++ b/examples/portable/deferred/dynamic_cilk.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #ifdef __cilk
@@ -104,7 +104,7 @@ public:
    {
        mNumTilesX = gBufferWidth / mTileWidth;
        mNumTilesY = gBufferHeight / mTileHeight;
-        
+
        // Allocate arrays
        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
@@ -155,7 +155,7 @@ public:
                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    if (srcX + 1 < srcTilesX) {
-                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + 
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
@@ -185,7 +185,7 @@ public:
            lAlignedFree(mMaxZArrays[i]);
        }
        lAlignedFree(mMinZArrays);
-        lAlignedFree(mMaxZArrays); 
+        lAlignedFree(mMaxZArrays);
    }

    int Levels() const { return mLevels; }
@@ -219,19 +219,19 @@ private:
 static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;

 void InitDynamicCilk(InputData *input) {
-    gMinMaxZTreeCilk = 
+    gMinMaxZTreeCilk =
        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
-                            input->header.framebufferWidth, 
+                            input->header.framebufferWidth,
                            input->header.framebufferHeight);
 }


 static void
-ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
-                        int *lightIndices, int numLights, 
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
+                        int *lightIndices, int numLights,
                        Framebuffer *framebuffer) {
    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
-    
+
    // If we few enough lights or this is the base case (last level), shade
    // this full tile directly
    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
@@ -241,19 +241,19 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
        int startY = tileY * height;
        int endX = std::min(input->header.framebufferWidth, startX + width);
        int endY = std::min(input->header.framebufferHeight, startY + height);
-        
+
        // Skip entirely offscreen tiles
        if (endX > startX && endY > startY) {
            ispc::ShadeTile(
                startX, endX, startY, endY,
                input->header.framebufferWidth, input->header.framebufferHeight,
                &input->arrays,
-                input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                input->header.cameraProj[0][0], input->header.cameraProj[1][1],
                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
-                lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
                framebuffer->r, framebuffer->g, framebuffer->b);
        }
-    } 
+    }
    else {
        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
        // Move down a level in the tree
@@ -276,9 +276,9 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,

        // NOTE: Order is 00, 10, 01, 11
        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
-        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
                         input->header.cameraFar, input->header.cameraFar};
-        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
                         input->header.cameraNear, input->header.cameraNear};

        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
@@ -298,7 +298,7 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,

        // Cull lights into subtile lists
 #ifdef ISPC_IS_WINDOWS
-        __declspec(align(ALIGNMENT_BYTES)) 
+        __declspec(align(ALIGNMENT_BYTES))
 #endif
            int subtileLightIndices[4][MAX_LIGHTS]
 #ifndef ISPC_IS_WINDOWS
@@ -307,15 +307,15 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
 ;
        int subtileNumLights[4];
        ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
-            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.framebufferWidth, input->header.framebufferHeight,
            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
-            lightIndices, numLights, input->arrays.lightPositionView_x, 
-            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            lightIndices, numLights, input->arrays.lightPositionView_x,
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
            input->arrays.lightAttenuationEnd,
            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
-        
+
        // Recurse into subtiles
-        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY, 
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY,
                                            subtileLightIndices[0], subtileNumLights[0],
                                            framebuffer);
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
@@ -349,7 +349,7 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,

    // This is a root tile, so first do a full 6-plane cull
 #ifdef ISPC_IS_WINDOWS
-    __declspec(align(ALIGNMENT_BYTES)) 
+    __declspec(align(ALIGNMENT_BYTES))
 #endif
        int lightIndices[MAX_LIGHTS]
 #ifndef ISPC_IS_WINDOWS
@@ -360,12 +360,12 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
        startX, endX, startY, endY,    minZ, maxZ,
        input->header.framebufferWidth, input->header.framebufferHeight,
        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
-        MAX_LIGHTS, input->arrays.lightPositionView_x, 
-        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+        MAX_LIGHTS, input->arrays.lightPositionView_x,
+        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
        input->arrays.lightAttenuationEnd, lightIndices);

    // Now kick off the recursive process for this tile
-    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, 
+    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
                            numLights, framebuffer);
 }

@@ -374,10 +374,10 @@ void
 DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
 {
    MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
-        
+
    // Update min/max Z tree
    minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
-        input->header.cameraProj[2][2], input->header.cameraProj[3][2], 
+        input->header.cameraProj[2][2], input->header.cameraProj[3][2],
        input->header.cameraNear, input->header.cameraFar);

    // Launch the "root" tiles.  Ideally these should at least fill the
--- a/examples/portable/deferred/kernels.cu
+++ b/examples/portable/deferred/kernels.cu
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


@@ -46,7 +46,7 @@
 #define int16 short
 #define int8 char

-__device__ static inline float clamp(float v, float low, float high) 
+__device__ static inline float clamp(float v, float low, float high)
 {
      return min(max(v, low), high);
 }
@@ -122,8 +122,8 @@ struct Uniform
    const int2 idx = get_chunk(i);
    return __shfl(data[idx.x], idx.y);
  }
-  
-  __device__ inline void set(const bool active, const int i, T value) 
+
+  __device__ inline void set(const bool active, const int i, T value)
  {
    const int2 idx = get_chunk(i);
    const int chunkIdx = idx.x;
@@ -160,9 +160,9 @@ struct Uniform
  {
    return data[i];
  }
-  
+
  __device__ inline T* get_ptr(const int i) {return &data[i]; }
-  __device__ inline void set(const bool active, const int i, T value) 
+  __device__ inline void set(const bool active, const int i, T value)
  {
    if (active)
      data[i] = value;
@@ -185,8 +185,8 @@ struct Uniform
  {
    return shdata[i];
  }
-  
-  __device__ inline void set(const bool active, const int i, T value) 
+
+  __device__ inline void set(const bool active, const int i, T value)
  {
    if (active)
      shdata[i] = value;
@@ -264,7 +264,7 @@ static __device__ __forceinline__ int2 warpBinExclusiveScan(const bool p)
  const int b = __ballot(p);
  return make_int2(__popc(b), __popc(b & lanemask_lt()));
 }
-  __device__ static inline 
+  __device__ static inline
 int packed_store_active(bool active, int* ptr, int value)
 {
  const int2 res = warpBinExclusiveScan(active);
@@ -358,7 +358,7 @@ IntersectLightsWithTileMinMax(
 {
     float gBufferScale_x = 0.5f * (float)gBufferWidth;
     float gBufferScale_y = 0.5f * (float)gBufferHeight;
-        
+
     float frustumPlanes_xy[4] = {
        -(cameraProj_11 * gBufferScale_x),
         (cameraProj_11 * gBufferScale_x),
@@ -371,7 +371,7 @@ IntersectLightsWithTileMinMax(
        -tileStartY + gBufferScale_y };

    for ( int i = 0; i < 4; ++i) {
-         float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
+         float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
@@ -393,32 +393,32 @@ IntersectLightsWithTileMinMax(

        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-        
+
        // This seems better than cif(!inFrustum) ccontinue; here since we
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out.  Could also structure all of this as
        // nested if() statements, but this a bit easier to read
-        if (__ballot(inFrustum) > 0) 
+        if (__ballot(inFrustum) > 0)
        {
            float light_positionView_x = light_positionView_x_array[lightIndex];
            float light_positionView_y = light_positionView_y_array[lightIndex];

-            d = light_positionView_z * frustumPlanes_z[0] + 
+            d = light_positionView_z * frustumPlanes_z[0] +
                light_positionView_x * frustumPlanes_xy[0];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-            d = light_positionView_z * frustumPlanes_z[1] + 
+            d = light_positionView_z * frustumPlanes_z[1] +
                light_positionView_x * frustumPlanes_xy[1];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-            d = light_positionView_z * frustumPlanes_z[2] + 
+            d = light_positionView_z * frustumPlanes_z[2] +
                light_positionView_y * frustumPlanes_xy[2];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-            d = light_positionView_z * frustumPlanes_z[3] + 
+            d = light_positionView_z * frustumPlanes_z[3] +
                light_positionView_y * frustumPlanes_xy[3];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-        
+
            // Pack and store intersecting lights
            const bool active = inFrustum && lightIndex < numLights;
 #if 0
@@ -472,7 +472,7 @@ IntersectLightsWithTile(
     int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
-        MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, 
+        MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);

@@ -505,7 +505,7 @@ ShadeTile(
         unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
        for ( int32 y = tileStartY; y < tileEndY; ++y) {
            for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
-            { 
+            {
              const int x = xb + programIndex;
              if (x >= tileEndX) continue;
                int32 framebufferIndex = (y * gBufferWidth + x);
@@ -517,16 +517,16 @@ ShadeTile(
    } else {
         float twoOverGBufferWidth = 2.0f / gBufferWidth;
         float twoOverGBufferHeight = 2.0f / gBufferHeight;
-        
+
        for ( int32 y = tileStartY; y < tileEndY; ++y) {
             float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);

            for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
-            { 
+            {
              const int x = xb + programIndex;
 //              if (x >= tileEndX) break;
                int32 gBufferOffset = y * gBufferWidth + x;
-                
+
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;
@@ -535,19 +535,19 @@ ShadeTile(

                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
-                float positionScreen_x = (0.5f + (float)(x)) * 
+                float positionScreen_x = (0.5f + (float)(x)) *
                    twoOverGBufferWidth - 1.0f;

                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
-                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                surface_positionView_x = positionScreen_x * surface_positionView_z /
                    cameraProj_11;
-                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                surface_positionView_y = positionScreen_y * surface_positionView_z /
                    cameraProj_22;
-                
+
                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
-                normalize3(surface_positionView_x, surface_positionView_y, 
+                normalize3(surface_positionView_x, surface_positionView_y,
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);

                // Reconstruct normal from G-buffer
@@ -556,51 +556,51 @@ ShadeTile(
                float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]);
                asm("// half2float //");
-                    
+
                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrt(4.0f * f - 1.0f);
-                    
+
                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;

                // Load other G-buffer parameters
-                float surface_specularAmount = 
+                float surface_specularAmount =
                    __half2float(inputData.specularAmount[gBufferOffset]);
-                float surface_specularPower  = 
+                float surface_specularPower  =
                    __half2float(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
-                
+
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
-                for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
                     ++tileLightIndex) {
                     int32 lightIndex = tileLightIndices.get(tileLightIndex);
-                                        
+
                    // Gather light data relevant to initial culling
-                     float light_positionView_x = 
+                     float light_positionView_x =
                        __ldg(&inputData.lightPositionView_x[lightIndex]);
-                     float light_positionView_y = 
+                     float light_positionView_y =
                        __ldg(&inputData.lightPositionView_y[lightIndex]);
-                     float light_positionView_z = 
+                     float light_positionView_z =
                        __ldg(&inputData.lightPositionView_z[lightIndex]);
-                     float light_attenuationEnd = 
+                     float light_attenuationEnd =
                        __ldg(&inputData.lightAttenuationEnd[lightIndex]);
-                    
+
                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;

                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
-                    
+
                    // Clip at end of attenuation
                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;

-                    if (distanceToLight2 < light_attenutaionEnd2) {                    
+                    if (distanceToLight2 < light_attenutaionEnd2) {
                        float distanceToLight = sqrt(distanceToLight2);

                        // HLSL "rcp" is allowed to be fairly inaccurate
@@ -610,12 +610,12 @@ ShadeTile(
                        L_z *= distanceToLightRcp;

                        // Start computing brdf
-                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                        float NdotL = dot3(surface_normal_x, surface_normal_y,
                                           surface_normal_z, L_x, L_y, L_z);
-                    
+
                        // Clip back facing
                        if (NdotL > 0.0f) {
-                             float light_attenuationBegin = 
+                             float light_attenuationBegin =
                                inputData.lightAttenuationBegin[lightIndex];

                            // Light distance attenuation (linstep)
@@ -627,19 +627,19 @@ ShadeTile(
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
-                    
-                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+
+                            float NdotH = dot3(surface_normal_x, surface_normal_y,
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = max(NdotH, 0.0f);

                            float specular = pow(NdotH, surface_specularPower);
-                            float specularNorm = (surface_specularPower + 2.0f) * 
+                            float specularNorm = (surface_specularPower + 2.0f) *
                                (1.0f / 8.0f);
-                            float specularContrib = surface_specularAmount * 
+                            float specularContrib = surface_specularAmount *
                                specularNorm * specular;

                            float k = attenuation * NdotL * (1.0f + specularContrib);
-                    
+
                             float light_color_x = inputData.lightColor_x[lightIndex];
                             float light_color_y = inputData.lightColor_y[lightIndex];
                             float light_color_z = inputData.lightColor_z[lightIndex];
@@ -663,7 +663,7 @@ ShadeTile(
                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
-                
+
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -707,8 +707,8 @@ RenderTile( int num_groups_x,  int num_groups_y,
    // Light intersection: figure out which lights illuminate this tile.
     Uniform<int,MAX_LIGHTS> tileLightIndices;  // Light list for the tile
 #if 1
-     int numTileLights = 
-        IntersectLightsWithTile(tile_start_x, tile_end_x, 
+     int numTileLights =
+        IntersectLightsWithTile(tile_start_x, tile_end_x,
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
@@ -716,9 +716,9 @@ RenderTile( int num_groups_x,  int num_groups_y,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
-                                inputData.lightPositionView_x, 
-                                inputData.lightPositionView_y, 
-                                inputData.lightPositionView_z, 
+                                inputData.lightPositionView_x,
+                                inputData.lightPositionView_y,
+                                inputData.lightPositionView_z,
                                inputData.lightAttenuationEnd,
                                tileLightIndices);

@@ -726,7 +726,7 @@ RenderTile( int num_groups_x,  int num_groups_y,
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
-              tileLightIndices, numTileLights, visualizeLightCount, 
+              tileLightIndices, numTileLights, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);
 #endif
 }
@@ -745,9 +745,9 @@ RenderStatic___export( InputHeader inputHeaderPtr[],
  const  InputDataArrays inputData = *inputDataPtr;


-     int num_groups_x = (inputHeader.framebufferWidth + 
+     int num_groups_x = (inputHeader.framebufferWidth +
                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
-     int num_groups_y = (inputHeader.framebufferHeight + 
+     int num_groups_y = (inputHeader.framebufferHeight +
                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
     int num_groups = num_groups_x * num_groups_y;

--- a/examples/portable/deferred/kernels.ispc
+++ b/examples/portable/deferred/kernels.ispc
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #include "deferred.h"
@@ -142,11 +142,11 @@ ComputeZBounds(
    maxZ = reduce_max(laneMaxZ);
 }

-#if 1 
+#if 1
 inline
 #endif
 #ifndef __NVPTX__
-export 
+export
 #endif
 uniform int32
 IntersectLightsWithTileMinMax(
@@ -171,7 +171,7 @@ IntersectLightsWithTileMinMax(
 {
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
-        
+
    uniform_t float frustumPlanes_xy[4] = {
        -(cameraProj_11 * gBufferScale_x),
         (cameraProj_11 * gBufferScale_x),
@@ -184,7 +184,7 @@ IntersectLightsWithTileMinMax(
        -tileStartY + gBufferScale_y };

    for (uniform int i = 0; i < 4; ++i) {
-        uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
+        uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
@@ -202,7 +202,7 @@ IntersectLightsWithTileMinMax(

        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-        
+
        // This seems better than cif(!inFrustum) ccontinue; here since we
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out.  Could also structure all of this as
@@ -211,26 +211,26 @@ IntersectLightsWithTileMinMax(
            float light_positionView_x = light_positionView_x_array[lightIndex];
            float light_positionView_y = light_positionView_y_array[lightIndex];

-            d = light_positionView_z * frustumPlanes_z[0] + 
+            d = light_positionView_z * frustumPlanes_z[0] +
                light_positionView_x * frustumPlanes_xy[0];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-            d = light_positionView_z * frustumPlanes_z[1] + 
+            d = light_positionView_z * frustumPlanes_z[1] +
                light_positionView_x * frustumPlanes_xy[1];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-            d = light_positionView_z * frustumPlanes_z[2] + 
+            d = light_positionView_z * frustumPlanes_z[2] +
                light_positionView_y * frustumPlanes_xy[2];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-            d = light_positionView_z * frustumPlanes_z[3] + 
+            d = light_positionView_z * frustumPlanes_z[3] +
                light_positionView_y * frustumPlanes_xy[3];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-       
-#if 0 
+
+#if 0
            // Pack and store intersecting lights
            cif (inFrustum) {
-                tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], 
+                tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
                                                     lightIndex);
            }
 #else
@@ -245,7 +245,7 @@ IntersectLightsWithTileMinMax(
 }


-#if 1 
+#if 1
 inline
 #endif
 static uniform int32
@@ -277,7 +277,7 @@ IntersectLightsWithTile(
    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
-        MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, 
+        MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);

@@ -285,7 +285,7 @@ IntersectLightsWithTile(
 }


-#if 1 
+#if 1
 inline
 #endif
 #ifndef __NVPTX__
@@ -324,13 +324,13 @@ ShadeTile(
    } else {
        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
-        
+
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);

            foreach (x = tileStartX ... tileEndX) {
                int32 gBufferOffset = y * gBufferWidth + x;
-                
+
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;
@@ -339,70 +339,70 @@ ShadeTile(

                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
-                float positionScreen_x = (0.5f + (float)(x)) * 
+                float positionScreen_x = (0.5f + (float)(x)) *
                    twoOverGBufferWidth - 1.0f;

                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
-                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                surface_positionView_x = positionScreen_x * surface_positionView_z /
                    cameraProj_11;
-                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                surface_positionView_y = positionScreen_y * surface_positionView_z /
                    cameraProj_22;
-                
+
                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
-                normalize3(surface_positionView_x, surface_positionView_y, 
+                normalize3(surface_positionView_x, surface_positionView_y,
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);

                // Reconstruct normal from G-buffer
                float surface_normal_x, surface_normal_y, surface_normal_z;
                float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
-                    
+
                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrt(4.0f * f - 1.0f);
-                    
+
                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;

                // Load other G-buffer parameters
-                float surface_specularAmount = 
+                float surface_specularAmount =
                    half_to_float(inputData.specularAmount[gBufferOffset]);
-                float surface_specularPower  = 
+                float surface_specularPower  =
                    half_to_float(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
-                
+
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
-                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
                     ++tileLightIndex) {
                    uniform int32 lightIndex = tileLightIndices[tileLightIndex];
-                                        
+
                    // Gather light data relevant to initial culling
-                    uniform float light_positionView_x = 
+                    uniform float light_positionView_x =
                        inputData.lightPositionView_x[lightIndex];
-                    uniform float light_positionView_y = 
+                    uniform float light_positionView_y =
                        inputData.lightPositionView_y[lightIndex];
-                    uniform float light_positionView_z = 
+                    uniform float light_positionView_z =
                        inputData.lightPositionView_z[lightIndex];
-                    uniform float light_attenuationEnd = 
+                    uniform float light_attenuationEnd =
                        inputData.lightAttenuationEnd[lightIndex];
-                    
+
                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;

                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
-                    
+
                    // Clip at end of attenuation
                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;

-                    cif (distanceToLight2 < light_attenutaionEnd2) {                    
+                    cif (distanceToLight2 < light_attenutaionEnd2) {
                        float distanceToLight = sqrt(distanceToLight2);

                        // HLSL "rcp" is allowed to be fairly inaccurate
@@ -412,12 +412,12 @@ ShadeTile(
                        L_z *= distanceToLightRcp;

                        // Start computing brdf
-                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                        float NdotL = dot3(surface_normal_x, surface_normal_y,
                                           surface_normal_z, L_x, L_y, L_z);
-                    
+
                        // Clip back facing
                        cif (NdotL > 0.0f) {
-                            uniform float light_attenuationBegin = 
+                            uniform float light_attenuationBegin =
                                inputData.lightAttenuationBegin[lightIndex];

                            // Light distance attenuation (linstep)
@@ -429,19 +429,19 @@ ShadeTile(
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
-                    
-                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+
+                            float NdotH = dot3(surface_normal_x, surface_normal_y,
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = max(NdotH, 0.0f);

                            float specular = pow(NdotH, surface_specularPower);
-                            float specularNorm = (surface_specularPower + 2.0f) * 
+                            float specularNorm = (surface_specularPower + 2.0f) *
                                (1.0f / 8.0f);
-                            float specularContrib = surface_specularAmount * 
+                            float specularContrib = surface_specularAmount *
                                specularNorm * specular;

                            float k = attenuation * NdotL * (1.0f + specularContrib);
-                    
+
                            uniform float light_color_x = inputData.lightColor_x[lightIndex];
                            uniform float light_color_y = inputData.lightColor_y[lightIndex];
                            uniform float light_color_z = inputData.lightColor_z[lightIndex];
@@ -465,7 +465,7 @@ ShadeTile(
                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
-                
+
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -512,8 +512,8 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
 #else /* shared memory doesn't full work... why? */
    uniform int tileLightIndices[MAX_LIGHTS];  // Light list for the tile
 #endif
-    uniform int numTileLights = 
-        IntersectLightsWithTile(tile_start_x, tile_end_x, 
+    uniform int numTileLights =
+        IntersectLightsWithTile(tile_start_x, tile_end_x,
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
@@ -521,9 +521,9 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
-                                inputData.lightPositionView_x, 
-                                inputData.lightPositionView_y, 
-                                inputData.lightPositionView_z, 
+                                inputData.lightPositionView_x,
+                                inputData.lightPositionView_y,
+                                inputData.lightPositionView_z,
                                inputData.lightAttenuationEnd,
                                tileLightIndices);

@@ -531,7 +531,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
-              tileLightIndices, numTileLights, visualizeLightCount, 
+              tileLightIndices, numTileLights, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);
 #ifdef MALLOC
    delete tileLightIndices;
@@ -551,9 +551,9 @@ RenderStatic(uniform InputHeader inputHeaderPtr[],
    uniform InputHeader inputHeader = *inputHeaderPtr;
    uniform InputDataArrays inputData = *inputDataPtr;

-    uniform int num_groups_x = (inputHeader.framebufferWidth + 
+    uniform int num_groups_x = (inputHeader.framebufferWidth +
                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
-    uniform int num_groups_y = (inputHeader.framebufferHeight + 
+    uniform int num_groups_y = (inputHeader.framebufferHeight +
                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    uniform int num_groups = num_groups_x * num_groups_y;

@@ -627,16 +627,16 @@ SplitTileMinMax(
 {
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
-        
+
    uniform_t float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
                                           (cameraProj_22 * gBufferScale_y) };
    uniform_t float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
                                         tileMidY - gBufferScale_y };

    // Normalize
-    uniform_t float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] + 
+    uniform_t float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
-                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] + 
+                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
    frustumPlanes_xy[0] *= norm[0];
    frustumPlanes_xy[1] *= norm[1];
@@ -658,23 +658,23 @@ SplitTileMinMax(
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
-        
+
        // Test lights again subtile z bounds
        bool inFrustum[4];
        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
-        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && 
+        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
-        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && 
+        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
-        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && 
+        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);

-        float dx = light_positionView_z * frustumPlanes_z[0] + 
+        float dx = light_positionView_z * frustumPlanes_z[0] +
            light_positionView_x * frustumPlanes_xy[0];
        float dy = light_positionView_z * frustumPlanes_z[1] +
            light_positionView_y * frustumPlanes_xy[1];
-        
+
        cif (abs(dx) > light_attenuationEnd) {
            bool positiveX = dx > 0.0f;
            inFrustum[0] = inFrustum[0] &&  positiveX;    // 00 subtile
@@ -693,20 +693,20 @@ SplitTileMinMax(
        // Pack and store intersecting lights
        // TODO: Experiment with a loop here instead
        cif (inFrustum[0])
-            subtileLightOffset[0] += 
+            subtileLightOffset[0] +=
            packed_store_active(&subtileIndices[subtileLightOffset[0]],
                                lightIndex);
        cif (inFrustum[1])
-            subtileLightOffset[1] += 
+            subtileLightOffset[1] +=
            packed_store_active(&subtileIndices[subtileLightOffset[1]],
                                lightIndex);
        cif (inFrustum[2])
-            subtileLightOffset[2] += 
-            packed_store_active(&subtileIndices[subtileLightOffset[2]], 
+            subtileLightOffset[2] +=
+            packed_store_active(&subtileIndices[subtileLightOffset[2]],
                                lightIndex);
        cif (inFrustum[3])
-            subtileLightOffset[3] += 
-            packed_store_active(&subtileIndices[subtileLightOffset[3]], 
+            subtileLightOffset[3] +=
+            packed_store_active(&subtileIndices[subtileLightOffset[3]],
                                lightIndex);
    }

--- a/examples/portable/deferred/main.cpp
+++ b/examples/portable/deferred/main.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #ifdef _MSC_VER
--- a/examples/portable/inplaceTraspose/inplaceTranspose.ispc
+++ b/examples/portable/inplaceTraspose/inplaceTranspose.ispc
@@ -5,8 +5,8 @@ uniform int gcd(uniform int a, uniform int b)
 {
  while ( a != 0 )
  {
-    uniform int c = a; 
-    a = b%a;  
+    uniform int c = a;
+    a = b%a;
    b = c;
  }
  return b;
@@ -32,7 +32,7 @@ int __sj(const int i, const uniform int j, const uniform int m, const uniform in


 #if 0
-static inline 
+static inline
 void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
 {
  const uniform int tmpSize = max(m,n) * programCount;
@@ -90,7 +90,7 @@ static uniform int * uniform joverb = NULL;
 static uniform int * uniform iovera = NULL;
 static uniform int a,b,c;

-static 
+static
 void transpose_init(const uniform int m, const uniform int n, const uniform int nTask)
 {
  joverb = uniform new uniform int[n];
@@ -105,14 +105,14 @@ void transpose_init(const uniform int m, const uniform int n, const uniform int
    iovera[i] = i/a;
 }

-static 
+static
 void transpose_finalize()
 {
  delete iovera;
  delete joverb;
 }

-task 
+task
 void transpose_step1(uniform T A[], const uniform int m, const uniform int n)
 {
  const uniform int n_per_task = (n + taskCount - 1)/taskCount;
@@ -140,7 +140,7 @@ void transpose_step2(uniform T A[], const uniform int m, const uniform int n)
  const uniform int m_per_task = (m + taskCount - 1)/taskCount;
  const uniform int mibeg = taskIndex * m_per_task;
  const uniform int miend = min(mibeg + m_per_task, m);
-  
+
  uniform  T * uniform tmp = uniform new uniform T[n*programCount];

  uniform T (*uniform tmp2D)[programCount] = (uniform T (*uniform)[programCount])tmp;
@@ -161,7 +161,7 @@ void transpose_step3(uniform T A[], const uniform int m, const uniform int n)
  const uniform int n_per_task = (n + taskCount - 1)/taskCount;
  const uniform int nibeg = taskIndex * n_per_task;
  const uniform int niend = min(nibeg + n_per_task, n);
-  
+
  uniform  T * uniform tmp = uniform new uniform T[m];

  for (uniform int j = nibeg; j < niend; j++)
@@ -176,7 +176,7 @@ void transpose_step3(uniform T A[], const uniform int m, const uniform int n)
  delete tmp;
 }

-export 
+export
 void transpose(uniform T A[], const uniform int m, const uniform int n)
 {
 #if 0
@@ -187,7 +187,7 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)

  launch [nTask] transpose_step1(A, m, n);
  sync;
-  
+
  launch [nTask] transpose_step2(A, m, n);
  sync;

--- a/examples/portable/mergeSort/mergeSort.cpp
+++ b/examples/portable/mergeSort/mergeSort.cpp
@@ -71,7 +71,7 @@ int main (int argc, char *argv[])
    valsGld[i] = valsSrc[i];
  }
  delete keys;
-    
+
  ispcSetMallocHeapLimit(1024*1024*1024);

  ispc::openMergeSort();
@@ -115,7 +115,7 @@ int main (int argc, char *argv[])
  }
  printf("\n---\n");
 #endif
-    
+


  std::sort(keysGld, keysGld + n);
--- a/examples/portable/mergeSort/mergeSort.cu
+++ b/examples/portable/mergeSort/mergeSort.cu
@@ -30,9 +30,9 @@ int nextPowerOfTwo(int x)

 __device__ static inline
 int binarySearchInclusiveRanks(
-    const int val, 
+    const int val,
    uniform int *data,
-    const int L, 
+    const int L,
    int stride)
 {
  if (L == 0)
@@ -52,9 +52,9 @@ int binarySearchInclusiveRanks(

 __device__ static inline
 int binarySearchExclusiveRanks(
-    const int val, 
-    uniform int *data, 
-    const int L, 
+    const int val,
+    uniform int *data,
+    const int L,
    int stride)
 {
  if (L == 0)
@@ -74,9 +74,9 @@ int binarySearchExclusiveRanks(

 __device__ static inline
 int binarySearchInclusive(
-    const Key_t val, 
+    const Key_t val,
    uniform Key_t *data,
-    const int L, 
+    const int L,
    int stride)
 {
  if (L == 0)
@@ -96,9 +96,9 @@ int binarySearchInclusive(

 __device__ static inline
 int binarySearchExclusive(
-    const Key_t val, 
-    uniform Key_t *data, 
-    const int L, 
+    const Key_t val,
+    uniform Key_t *data,
+    const int L,
    int stride)
 {
  if (L == 0)
@@ -118,9 +118,9 @@ int binarySearchExclusive(

 __device__ static inline
 int binarySearchInclusive1(
-    const Key_t val, 
+    const Key_t val,
    Key_t data,
-    const uniform int L, 
+    const uniform int L,
    uniform int stride)
 {
  if (L == 0)
@@ -140,9 +140,9 @@ int binarySearchInclusive1(

 __device__ static inline
 int binarySearchExclusive1(
-    const Key_t val, 
-    Key_t data, 
-    const uniform int L, 
+    const Key_t val,
+    Key_t data,
+    const uniform int L,
    uniform int stride)
 {
  if (L == 0)
@@ -245,7 +245,7 @@ void generateSampleRanksKernel(
  const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
  const uniform int blkBeg =     blkIdx * blkDim;
  const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
-  
+
  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
  {
    const int pos = blk * programCount + programIndex;
@@ -291,8 +291,8 @@ void generateSampleRanks(
    uniform int N)
 {
  uniform int lastSegmentElements = N % (2 * stride);
-  uniform int threadCount = (lastSegmentElements > stride) ? 
-    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
+  uniform int threadCount = (lastSegmentElements > stride) ?
+    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -304,7 +304,7 @@ void generateSampleRanks(
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 2: generate sample ranks and indices
 ////////////////////////////////////////////////////////////////////////////////
-__global__ 
+__global__
 void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
@@ -317,7 +317,7 @@ void mergeRanksAndIndicesKernel(
  const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
  const uniform int blkBeg =     blkIdx * blkDim;
  const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
-  
+
  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
  {
    int pos = blk * programCount + programIndex;
@@ -357,8 +357,8 @@ void mergeRanksAndIndices(
    uniform int N)
 {
  const uniform int lastSegmentElements = N % (2 * stride);
-  const uniform int threadCount = (lastSegmentElements > stride) ? 
-    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
+  const uniform int threadCount = (lastSegmentElements > stride) ?
+    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -457,13 +457,13 @@ void mergeElementaryIntervalsKernel(
      dstB = segmentBase + startDstB + dstPosB;

    // store merge data
-    if (dstA >= 0) 
+    if (dstA >= 0)
    {
 //     int dstA = segmentBase + startSrcA + programIndex;
      dstKey[dstA] = keyA;
      dstVal[dstA] = valA;
    }
-    if (dstB >= 0) 
+    if (dstB >= 0)
    {
 //      int dstB = segmentBase + stride + startSrcB + programIndex;
      dstKey[dstB] = keyB;
@@ -513,7 +513,7 @@ __device__ static uniform int * uniform limitsB;
 __device__ static uniform int nTasks;
 __device__ static uniform int MAX_SAMPLE_COUNT = 0;

-__global__ 
+__global__
 void openMergeSort___export()
 {
  nTasks = 13*32*13;
--- a/examples/portable/mergeSort/mergeSort.ispc
+++ b/examples/portable/mergeSort/mergeSort.ispc
@@ -25,9 +25,9 @@ int nextPowerOfTwo(int x)

 static inline
 int binarySearchInclusiveRanks(
-    const int val, 
+    const int val,
    uniform int *data,
-    const int L, 
+    const int L,
    int stride)
 {
  cif (L == 0)
@@ -47,9 +47,9 @@ int binarySearchInclusiveRanks(

 static inline
 int binarySearchExclusiveRanks(
-    const int val, 
-    uniform int *data, 
-    const int L, 
+    const int val,
+    uniform int *data,
+    const int L,
    int stride)
 {
  cif (L == 0)
@@ -69,9 +69,9 @@ int binarySearchExclusiveRanks(

 static inline
 int binarySearchInclusive(
-    const Key_t val, 
+    const Key_t val,
    uniform Key_t *data,
-    const int L, 
+    const int L,
    int stride)
 {
  cif (L == 0)
@@ -91,9 +91,9 @@ int binarySearchInclusive(

 static inline
 int binarySearchExclusive(
-    const Key_t val, 
-    uniform Key_t *data, 
-    const int L, 
+    const Key_t val,
+    uniform Key_t *data,
+    const int L,
    int stride)
 {
  cif (L == 0)
@@ -113,9 +113,9 @@ int binarySearchExclusive(

 static inline
 int binarySearchInclusive1(
-    const Key_t val, 
+    const Key_t val,
    Key_t data,
-    const uniform int L, 
+    const uniform int L,
    uniform int stride)
 {
  if (L == 0)
@@ -135,9 +135,9 @@ int binarySearchInclusive1(

 static inline
 int binarySearchExclusive1(
-    const Key_t val, 
-    Key_t data, 
-    const uniform int L, 
+    const Key_t val,
+    Key_t data,
+    const uniform int L,
    uniform int stride)
 {
  if (L == 0)
@@ -158,7 +158,7 @@ int binarySearchExclusive1(
 ////////////////////////////////////////////////////////////////////////////////
 // Bottom-level merge sort (binary search-based)
 ////////////////////////////////////////////////////////////////////////////////
-task 
+task
 void mergeSortGangKernel(
    uniform int batchSize,
    uniform Key_t dstKey[],
@@ -189,7 +189,7 @@ void mergeSortGangKernel(
      const int offset = 2 * (programIndex - lPos);
      uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
      uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
-      
+
      Key_t keyA = baseKey[lPos +      0];
      Val_t valA = baseVal[lPos +      0];
      Key_t keyB = baseKey[lPos + stride];
@@ -244,7 +244,7 @@ void generateSampleRanksKernel(
  const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
  const uniform int blockBeg =     blockIdx * blockDim;
  const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
-  
+
  for (uniform int block = blockBeg; block < blockEnd; block++)
  {
    const int pos = block * programCount + programIndex;
@@ -290,8 +290,8 @@ void generateSampleRanks(
    uniform int N)
 {
  uniform int lastSegmentElements = N % (2 * stride);
-  uniform int threadCount = (lastSegmentElements > stride) ? 
-    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
+  uniform int threadCount = (lastSegmentElements > stride) ?
+    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -306,7 +306,7 @@ void generateSampleRanks(
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 2: generate sample ranks and indices
 ////////////////////////////////////////////////////////////////////////////////
-task 
+task
 void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
@@ -319,7 +319,7 @@ void mergeRanksAndIndicesKernel(
  const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
  const uniform int blockBeg =     blockIdx * blockDim;
  const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
-  
+
  for (uniform int block = blockBeg; block < blockEnd; block++)
  {
    int pos = block * programCount + programIndex;
@@ -359,8 +359,8 @@ void mergeRanksAndIndices(
    uniform int N)
 {
  const uniform int lastSegmentElements = N % (2 * stride);
-  const uniform int threadCount = (lastSegmentElements > stride) ? 
-    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
+  const uniform int threadCount = (lastSegmentElements > stride) ?
+    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -462,12 +462,12 @@ void mergeElementaryIntervalsKernel(
    if (programIndex < lenSrcB && dstPosB < lenSrcB)
      dstB = segmentBase + startDstB + dstPosB;

-    if (dstA >= 0) 
+    if (dstA >= 0)
    {
      dstKey[dstA] = keyA;
      dstVal[dstA] = valA;
    }
-    if (dstB >= 0) 
+    if (dstB >= 0)
    {
      dstKey[dstB] = keyB;
      dstVal[dstB] = valB;
@@ -521,7 +521,7 @@ static uniform int * uniform limitsA;
 static uniform int * uniform limitsB;
 static uniform int MAX_SAMPLE_COUNT = 0;

-export 
+export
 void openMergeSort()
 {
  MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
@@ -542,7 +542,7 @@ void closeMergeSort()
  memPool = NULL;
 }

-export 
+export
 void mergeSort(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
@@ -601,7 +601,7 @@ void mergeSort(
        }
 #endif

-        // cpu: 287  gpu: 194 M/s 
+        // cpu: 287  gpu: 194 M/s
        //Merge elementary intervals
        mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
      }
--- a/examples/portable/nbody_hermite4/hermite4.cpp
+++ b/examples/portable/nbody_hermite4/hermite4.cpp
@@ -61,7 +61,7 @@ struct Hermite4
    const real R0 = 1;
    const real mp = 1.0/n;
 #pragma omp parallel for schedule(runtime)
-    for (int i = 0; i < n; i++) 
+    for (int i = 0; i < n; i++)
    {
      real xp, yp, zp, s2 = 2*R0;
      real vx, vy, vz;
@@ -73,7 +73,7 @@ struct Hermite4
        vx = drand48() * 0.1;
        vy = drand48() * 0.1;
        vz = drand48() * 0.1;
-      } 
+      }
      g_posx[i] = xp;
      g_posy[i] = yp;
      g_posz[i] = zp;
@@ -104,7 +104,7 @@ struct Hermite4

  void forces();

-  real step(const real dt) 
+  real step(const real dt)
  {
    const real dt2 = dt*real(1.0/2.0);
    const real dt3 = dt*real(1.0/3.0);
@@ -149,9 +149,9 @@ struct Hermite4
      {
        /* compute snp & crk */

-        const real Amx = g_accx[i] - accx0[i]; 
-        const real Amy = g_accy[i] - accy0[i]; 
-        const real Amz = g_accz[i] - accz0[i]; 
+        const real Amx = g_accx[i] - accx0[i];
+        const real Amy = g_accy[i] - accy0[i];
+        const real Amz = g_accz[i] - accz0[i];

        const real Jmx = h*(g_jrkx[i] - jrkx0[i]);
        const real Jmy = h*(g_jrky[i] - jrky0[i]);
@@ -199,18 +199,18 @@ struct Hermite4
      }
    }

-    if (dt_min == HUGE) 
+    if (dt_min == HUGE)
      return dt;
-    else 
+    else
      return dt_min;
  }

-  void energy(real &Ekin, real &Epot) 
+  void energy(real &Ekin, real &Epot)
  {
    real ekin = 0, epot = 0;

 #pragma omp parallel for reduction(+:ekin,epot)
-    for (int i = 0; i < n; i++) 
+    for (int i = 0; i < n; i++)
    {
      ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f);
      epot += real(0.5f)*g_mass[i] * g_gpot[i];
@@ -241,7 +241,7 @@ struct Hermite4
    real dt = 1.0/131072;
    real Epot, Ekin, Etot = Etot0;
    while (t_global < t_end) {
-      if (iter % ntime == 0) 
+      if (iter % ntime == 0)
        t0 = rtc();

      if (iter >= niter) return;
@@ -302,7 +302,7 @@ void run(const int nbodies, const real eta, const int nstep)
  h4.integrate(nstep);
 }

-int main(int argc, char *argv[]) 
+int main(int argc, char *argv[])
 {
  printf("  Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]);

--- a/examples/portable/nbody_hermite4/hermite4.ispc
+++ b/examples/portable/nbody_hermite4/hermite4.ispc
@@ -12,7 +12,7 @@ struct Predictor
  vec3 pos, vel;
 };

-static inline 
+static inline
 void body_body_force(
    Force &fi,
    const Predictor &pi,
@@ -40,14 +40,14 @@ void body_body_force(
  fi.acc.y += minv_ds3 * dy;
  fi.acc.z += minv_ds3 * dz;
  fi.pot   -= minv_ds;
-  
+
  const real dvx = pj.vel.x - pi.vel.x;
  const real dvy = pj.vel.y - pi.vel.y;
  const real dvz = pj.vel.z - pi.vel.z;
  const real rv  = dx*dvx + dy*dvy + dz*dvz;
-  
+
  const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3);
-  
+
  fi.jrk.x += minv_ds3*dvx + Jij*dx;
  fi.jrk.y += minv_ds3*dvy + Jij*dy;
  fi.jrk.z += minv_ds3*dvz + Jij*dz;
@@ -75,7 +75,7 @@ task void compute_forces_task(
  const uniform int nibeg = taskIndex * nPerTask;
  const uniform int niend = min(n, nibeg + nPerTask);

-  if (nibeg >= n) 
+  if (nibeg >= n)
    return;

  uniform real shdata[7][programCount];
@@ -88,7 +88,7 @@ task void compute_forces_task(
    fi.acc = (real)0.0;
    fi.jrk = (real)0.0;
    fi.pot = (real)0.0;
-   
+
    Predictor pi;
    pi.pos.x = posx[i];
    pi.pos.y = posy[i];
@@ -155,7 +155,7 @@ export void compute_forces(

  launch [nTask]  compute_forces_task(
      n, nPerTask,
-      mass, 
+      mass,
      posx,posy,posz,
      velx,vely,velz,
      accx,accy,accz,
--- a/examples/portable/omp_tasksys.cpp
+++ b/examples/portable/omp_tasksys.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011-2012, Intel Corporation
+  Copyright (c) 2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


-#define DBG(x) 
+#define DBG(x)
 #include <omp.h>
 #include <malloc.h>

@@ -62,15 +62,15 @@ struct TaskInfo {
    event taskEvent;
 #endif
    int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
-    int taskIndex0() const 
+    int taskIndex0() const
    {
      return taskIndex % taskCount3d[0];
    }
-    int taskIndex1() const 
+    int taskIndex1() const
    {
      return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
    }
-    int taskIndex2() const 
+    int taskIndex2() const
    {
      return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
    }
@@ -85,7 +85,7 @@ __attribute__((aligned(32)));
 ;

 // ispc expects these functions to have C linkage / not be mangled
-extern "C" { 
+extern "C" {
    void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz);
    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
    void ISPCSync(void *handle);
@@ -144,10 +144,10 @@ private:
 };


-inline TaskGroupBase::TaskGroupBase() { 
-    nextTaskInfoIndex = 0; 
+inline TaskGroupBase::TaskGroupBase() {
+    nextTaskInfoIndex = 0;

-    curMemBuffer = 0; 
+    curMemBuffer = 0;
    curMemBufferOffset = 0;
    memBuffers[0] = mem;
    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
@@ -171,8 +171,8 @@ inline TaskGroupBase::~TaskGroupBase() {

 inline void
 TaskGroupBase::Reset() {
-    nextTaskInfoIndex = 0; 
-    curMemBuffer = 0; 
+    nextTaskInfoIndex = 0;
+    curMemBuffer = 0;
    curMemBufferOffset = 0;
 }

@@ -253,7 +253,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
 #endif // ISPC_IS_WINDOWS
 }

-static int32_t 
+static int32_t
 lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
 #ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue);
@@ -264,7 +264,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
 #endif // ISPC_IS_WINDOWS
 }

-static inline int32_t 
+static inline int32_t
 lAtomicAdd(volatile int32_t *v, int32_t delta) {
 #ifdef ISPC_IS_WINDOWS
    return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;
@@ -300,11 +300,11 @@ TaskGroup::Launch(int baseIndex, int count) {

    TaskInfo ti = *GetTaskInfo(baseIndex);
 #pragma omp for schedule(runtime)
-    for(int i = 0; i < count; i++) 
+    for(int i = 0; i < count; i++)
    {
        ti.taskIndex = i;

-        // Actually run the task. 
+        // Actually run the task.
        ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(),
            ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(),
            ti.taskCount0(), ti.taskCount1(), ti.taskCount2());
@@ -322,7 +322,7 @@ TaskGroup::Sync() {
 static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];

  static inline TaskGroup *
-AllocTaskGroup() 
+AllocTaskGroup()
 {
  for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
    TaskGroup *tg = freeTaskGroups[i];
@@ -339,7 +339,7 @@ AllocTaskGroup()


  static inline void
-FreeTaskGroup(TaskGroup *tg) 
+FreeTaskGroup(TaskGroup *tg)
 {
  tg->Reset();

@@ -355,7 +355,7 @@ FreeTaskGroup(TaskGroup *tg)
 }

  void
-ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) 
+ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2)
 {
  const int count = count0*count1*count2;
  TaskGroup *taskGroup;
@@ -382,7 +382,7 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1,


  void
-ISPCSync(void *h) 
+ISPCSync(void *h)
 {
  TaskGroup *taskGroup = (TaskGroup *)h;
  if (taskGroup != NULL) {
@@ -393,7 +393,7 @@ ISPCSync(void *h)


  void *
-ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) 
+ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
 {
  TaskGroup *taskGroup;
  if (*taskGroupPtr == NULL) {
--- a/examples/portable/options/options.cpp
+++ b/examples/portable/options/options.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #define NOMINMAX
@@ -96,7 +96,7 @@ int main(int argc, char *argv[]) {
    sum = 0.;
    for (int i = 0; i < nOptions; ++i)
      sum += result[i];
-    printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n", 
+    printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n",
           binomial_tasks, sum / nOptions);

    //
@@ -112,7 +112,7 @@ int main(int argc, char *argv[]) {
            sum += result[i];
        bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
    }
-    printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n", 
+    printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n",
           bs_ispc_tasks, sum / nOptions);


--- a/examples/portable/options/options.cu
+++ b/examples/portable/options/options.cu
@@ -1,6 +1,6 @@
 // -*- mode: c++ -*-
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -29,13 +29,13 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #include "options_defs.h"
 #include "cuda_helpers.cuh"

-__device__ static inline void __range_reduce_log(float input, float * reduced, 
+__device__ static inline void __range_reduce_log(float input, float * reduced,
                                      int * exponent) {
    int int_version = __float_as_int(input); //intbits(input);
    // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
@@ -195,9 +195,9 @@ CND(float X) {
    return w;
 }

-__global__ 
+__global__
 void bs_task( float Sa[],  float Xa[],  float Ta[],
-    float ra[],  float va[], 
+    float ra[],  float va[],
    float result[],  int count) {
  if (taskIndex >= taskCount) return;
     int first = taskIndex * (count/taskCount);
@@ -218,7 +218,7 @@ void bs_task( float Sa[],  float Xa[],  float Ta[],
 extern "C"
 __global__ void
 black_scholes_ispc_tasks___export( float Sa[],  float Xa[],  float Ta[],
-                          float ra[],  float va[], 
+                          float ra[],  float va[],
                          float result[],  int count) {
  int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
  launch(nTasks,1,1,bs_task)
@@ -228,7 +228,7 @@ black_scholes_ispc_tasks___export( float Sa[],  float Xa[],  float Ta[],
 extern "C"
 __host__ void
 black_scholes_ispc_tasks( float Sa[],  float Xa[],  float Ta[],
-                          float ra[],  float va[], 
+                          float ra[],  float va[],
                          float result[],  int count) {
  black_scholes_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count);
  cudaDeviceSynchronize();
@@ -243,8 +243,8 @@ struct loop
  __device__ static void op1(float V[], const float u, const float X, const float S)
  {
    const int j = NBEG;
-    float upow = powf(u, (float)(2*j-BINOMIAL_NUM)); 
-    V[j] = max(0.0f, X - S * upow);  
+    float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
+    V[j] = max(0.0f, X - S * upow);
    loop<j+STEP,NEND,STEP>::op1(V,u,X,S);
  }
  __device__ static void op2(float V[], const float Pu, const float disc)
@@ -257,9 +257,9 @@ struct loop
  }
 };

-template<int NEND, int STEP> 
+template<int NEND, int STEP>
 struct loop<NEND,NEND,STEP>
-{ 
+{
  __device__ static void op1(float V[], const float u, const float X, const float S) {}
  __device__ static void op2(float V[], const float Pu, const float disc) {}
 };
@@ -295,10 +295,10 @@ binomial_put(float S, float X, float T, float r, float v)


 __global__ void
-binomial_task( float Sa[],  float Xa[], 
-               float Ta[],  float ra[], 
-               float va[],  float result[], 
-               int count) 
+binomial_task( float Sa[],  float Xa[],
+               float Ta[],  float ra[],
+               float va[],  float result[],
+               int count)
 {
  int first = taskIndex * (count/taskCount);
  int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
@@ -313,9 +313,9 @@ binomial_task( float Sa[],  float Xa[],


 extern "C" __global__ void
-binomial_put_ispc_tasks___export( float Sa[],  float Xa[], 
-                         float Ta[],  float ra[], 
-                         float va[],  float result[], 
+binomial_put_ispc_tasks___export( float Sa[],  float Xa[],
+                         float Ta[],  float ra[],
+                         float va[],  float result[],
                         int count) {
  int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
  launch(nTasks,1,1,binomial_task)
@@ -325,7 +325,7 @@ binomial_put_ispc_tasks___export( float Sa[],  float Xa[],
 extern "C"
 __host__ void
 binomial_put_ispc_tasks( float Sa[],  float Xa[],  float Ta[],
-                          float ra[],  float va[], 
+                          float ra[],  float va[],
                          float result[],  int count) {

  cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
--- a/examples/portable/options/options.ispc
+++ b/examples/portable/options/options.ispc
@@ -1,6 +1,6 @@
 // -*- mode: c++ -*-
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #include "options_defs.h"
@@ -57,7 +57,7 @@ CND(float X) {

 task void
 bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
-        uniform float ra[], uniform float va[], 
+        uniform float ra[], uniform float va[],
        uniform float result[], uniform int count) {
    uniform int first = taskIndex * (count/taskCount);
    uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
@@ -74,7 +74,7 @@ bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],

 export void
 black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
-                         uniform float ra[], uniform float va[], 
+                         uniform float ra[], uniform float va[],
                         uniform float result[], uniform int count) {
    uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
@@ -85,7 +85,7 @@ black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float T

 export void
 black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
-                   uniform float ra[], uniform float va[], 
+                   uniform float ra[], uniform float va[],
                   uniform float result[], uniform int count) {
    foreach (i = 0 ... count) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
@@ -135,7 +135,7 @@ binomial_put(float S, float X, float T, float r, float v) {
        V[j] = max(0., X - S * upow); }
 #define OP10(k) \
    OP(k+0); OP(k+1); OP(k+2); OP(k+3); OP(k+4) \
-    OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9); 
+    OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9);
    OP10(0)
    OP10(10)
    OP10(20)
@@ -176,8 +176,8 @@ binomial_put(float S, float X, float T, float r, float v) {


 export void
-binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], 
-                  uniform float ra[], uniform float va[], 
+binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
+                  uniform float ra[], uniform float va[],
                  uniform float result[], uniform int count) {
    foreach (i = 0 ... count) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
@@ -187,9 +187,9 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],


 task void
-binomial_task(uniform float Sa[], uniform float Xa[], 
-              uniform float Ta[], uniform float ra[], 
-              uniform float va[], uniform float result[], 
+binomial_task(uniform float Sa[], uniform float Xa[],
+              uniform float Ta[], uniform float ra[],
+              uniform float va[], uniform float result[],
              uniform int count) {
    uniform int first = taskIndex * (count/taskCount);
    uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
@@ -202,9 +202,9 @@ binomial_task(uniform float Sa[], uniform float Xa[],


 export void
-binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[], 
-                        uniform float Ta[], uniform float ra[], 
-                        uniform float va[], uniform float result[], 
+binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
+                        uniform float Ta[], uniform float ra[],
+                        uniform float va[], uniform float result[],
                        uniform int count) {
    uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
--- a/examples/portable/options/options_defs.h
+++ b/examples/portable/options/options_defs.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #ifndef OPTIONS_DEFS_H
--- a/examples/portable/radixSort/radixSort.cpp
+++ b/examples/portable/radixSort/radixSort.cpp
@@ -42,7 +42,7 @@ int main (int argc, char *argv[])
  Key *keys = new Key [n];
  Key *keys_orig = new Key [n];
  unsigned int *keys_gold = new unsigned int [n];
-  
+
  srand48(rtc()*65536);

  int sortBits = 32;
@@ -63,7 +63,7 @@ int main (int argc, char *argv[])
    keys_gold[i] = keys[i].key;
    keys_orig[i] = keys[i];
  }
-    
+
  ispcSetMallocHeapLimit(1024*1024*1024);

  ispc::radixSort_alloc(n);
--- a/examples/portable/radixSort/radixSort.cu
+++ b/examples/portable/radixSort/radixSort.cu
@@ -9,7 +9,7 @@ typedef long long Key;
 __forceinline__ __device__ int atomic_add_global(int* ptr, int value)
 {
  return atomicAdd(ptr, value);
-} 
+}

 static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset)
 {
@@ -92,7 +92,7 @@ void sortPass(

  const  int mask = (1 << NUMBITS) - 1;

-  /* copy digit offset from Gmem to Lmem */ 
+  /* copy digit offset from Gmem to Lmem */
 #if 1
  __shared__ int digitOffsets_sh[NUMDIGITS*4];
  volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
@@ -191,7 +191,7 @@ void completeScanGlobal(
  }
 }

-__device__ static 
+__device__ static
 inline void radixExclusiveScan(
    const  int numBlocks,
    int excScanPtr[],
@@ -242,11 +242,11 @@ void radixSort_alloc___export(const  int n)
  nPrefixSum    = NUMDIGITS*numBlocks;


-  const  int nalloc = 
+  const  int nalloc =
    nSharedCounts +
    nCountsGlobal +
    nExcScan +
-    nCountsBlock + 
+    nCountsBlock +
    nPartialSum +
    nPrefixSum;

@@ -261,7 +261,7 @@ void radixSort_alloc___export(const  int n)
  prefixSum    = partialSum   + nPartialSum;
 }

-extern "C" 
+extern "C"
 void radixSort_alloc(const  int n)
 {
  radixSort_alloc___export<<<1,32>>>(n);
@@ -269,7 +269,7 @@ void radixSort_alloc(const  int n)
 }


-__device__  static 
+__device__  static
 void radixSort_freeBufKeys()
 {
  if (numElementsBuf > 0)
@@ -344,9 +344,9 @@ __global__ void radixSort___export(
    /* sorting */
    launch (numBlocks,1,1,
      sortPass)(
-          bufKeys, 
-          keys, 
-          bit, 
+          bufKeys,
+          keys,
+          bit,
          numElements,
          excScan);
    sync;
--- a/examples/portable/radixSort/radixSort.ispc
+++ b/examples/portable/radixSort/radixSort.ispc
@@ -63,7 +63,7 @@ void sortPass(

  const uniform int mask = (1 << NUMBITS) - 1;

-  /* copy digit offset from Gmem to Lmem */ 
+  /* copy digit offset from Gmem to Lmem */
 #if 1
  uniform int digitOffsets[NUMDIGITS];
  foreach (digit = 0 ... NUMDIGITS)
@@ -95,7 +95,7 @@ void partialScanLocal(
  const uniform int  blockDim = (numBlocks+taskCount-1)/taskCount;
  const uniform int      bbeg = blockIdx * blockDim;
  const uniform int      bend = min(bbeg + blockDim, numBlocks);
-    
+
  uniform int (* uniform  countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll;
  uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
  uniform int (* uniform   partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
@@ -142,7 +142,7 @@ void completeScanGlobal(
  const uniform int  blockDim = (numBlocks+taskCount-1)/taskCount;
  const uniform int      bbeg = blockIdx * blockDim;
  const uniform int      bend = min(bbeg  + blockDim, numBlocks);
-  
+
  uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
  uniform int (* uniform   carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll;

@@ -154,7 +154,7 @@ void completeScanGlobal(
  }
 }

-static 
+static
 inline void radixExclusiveScan(
    const uniform int numBlocks,
    uniform int excScanPtr[],
@@ -207,11 +207,11 @@ export void radixSort_alloc(const uniform int n)
  nPrefixSum    = NUMDIGITS*numBlocks;


-  const uniform int nalloc = 
+  const uniform int nalloc =
    nSharedCounts +
    nCountsGlobal +
    nExcScan +
-    nCountsBlock + 
+    nCountsBlock +
    nPartialSum +
    nPrefixSum;

@@ -225,7 +225,7 @@ export void radixSort_alloc(const uniform int n)
  prefixSum    = partialSum   + nPartialSum;
 }

-static 
+static
 void radixSort_freeBufKeys()
 {
  if (numElementsBuf > 0)
@@ -283,16 +283,16 @@ export void radixSort(
      excScan[digit] = scan + carry;
      carry += broadcast(scan+value, programCount-1);
    }
-          
+
    /* computing offsets for each digit */
    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);

    /* sorting */
-    launch [numBlocks] 
+    launch [numBlocks]
      sortPass(
-          bufKeys, 
-          keys, 
-          bit, 
+          bufKeys,
+          keys,
+          bit,
          numElements,
          excScan);
    sync;
--- a/examples/portable/rt/rt.cpp
+++ b/examples/portable/rt/rt.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #ifdef _MSC_VER
@@ -83,7 +83,7 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
            fputc(g, f);
            fputc(b, f);
        }
-    }            
+    }
    fclose(f);
    printf("Wrote image file %s\n", filename);
 }
@@ -116,7 +116,7 @@ int main(int argc, char *argv[]) {
    if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) {  \
        fprintf(stderr, "Unexpected EOF reading scene file\n"); \
        return 1;                                               \
-    } else /* eat ; */                                                     
+    } else /* eat ; */

    //
    // Read the camera specification information from the camera file
@@ -145,7 +145,7 @@ int main(int argc, char *argv[]) {
    READ(raster2camera[0][0], 16);

    //
-    // Read in the serialized BVH 
+    // Read in the serialized BVH
    //
    sprintf(fnbuf, "%s.bvh", filename);
    f = fopen(fnbuf, "rb");
@@ -178,7 +178,7 @@ int main(int argc, char *argv[]) {
        READ(nodes[i].pad, 1);
    }

-    // And then read the triangles 
+    // And then read the triangles
    uint nTris;
    READ(nTris, 1);
    Triangle *triangles = new Triangle[nTris];
@@ -204,7 +204,7 @@ int main(int argc, char *argv[]) {
    // the first interseciton
    int *id = new int[width*height];
    float *image = new float[width*height];
-    
+
    ispc_memset(id, 0, width*height*sizeof(int));
    ispc_memset(image, 0, width*height*sizeof(float));

@@ -220,7 +220,7 @@ int main(int argc, char *argv[]) {
        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
        minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
    }
-    printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n", 
+    printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n",
           minTimeISPCtasks, width, height);

    writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
--- a/examples/portable/rt/rt.cu
+++ b/examples/portable/rt/rt.cu
@@ -96,7 +96,7 @@ static inline float Dot(const float3 a, const float3 b) {

 __device__
 inline
-static void generateRay( const float raster2camera[4][4], 
+static void generateRay( const float raster2camera[4][4],
                         const float camera2world[4][4],
                        float x, float y, Ray &ray) {
    ray.mint = 0.f;
@@ -113,11 +113,11 @@ static void generateRay( const float raster2camera[4][4],
    camy /= camw;
    camz /= camw;

-    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + 
+    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
        camera2world[0][2] * camz;
-    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + 
+    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
        camera2world[1][2] * camz;
-    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + 
+    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
        camera2world[2][2] * camz;

    ray.origin.x = camera2world[0][3] / camera2world[3][3];
@@ -139,7 +139,7 @@ static void generateRay( const float raster2camera[4][4],

 __device__
 inline
-static bool BBoxIntersect(const  float bounds[2][3], 
+static bool BBoxIntersect(const  float bounds[2][3],
                          const Ray &ray) {
     float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
     float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -172,7 +172,7 @@ static bool BBoxIntersect(const  float bounds[2][3],
    }
    t0 = max(tNear.z, t0);
    t1 = min(tFar.z, t1);
-    
+
    return (t0 <= t1);
 }

@@ -220,7 +220,7 @@ static bool TriIntersect(const  Triangle &tri, Ray &ray) {

 __device__
 inline
-bool BVHIntersect(const  LinearBVHNode nodes[], 
+bool BVHIntersect(const  LinearBVHNode nodes[],
                  const  Triangle tris[], Ray &r,
                   int todo[]) {
    Ray ray = r;
@@ -240,7 +240,7 @@ bool BVHIntersect(const  LinearBVHNode nodes[],
                    if (TriIntersect(tris[primitivesOffset+i], ray))
                        hit = true;
                }
-                if (todoOffset == 0) 
+                if (todoOffset == 0)
                    break;
                nodeNum = todo[--todoOffset];
            }
@@ -275,10 +275,10 @@ bool BVHIntersect(const  LinearBVHNode nodes[],
 __device__
 inline
 static void raytrace_tile( int x0,  int x1,
-                           int y0,  int y1, 
+                           int y0,  int y1,
                           int width,  int height,
                           int baseWidth,  int baseHeight,
-                          const  float raster2camera[4][4], 
+                          const  float raster2camera[4][4],
                          const  float camera2world[4][4],
                           float image[],  int id[],
                          const  LinearBVHNode nodes[],
@@ -317,7 +317,7 @@ static void raytrace_tile( int x0,  int x1,
 __global__
 void raytrace_tile_task( int width,  int height,
                              int baseWidth,  int baseHeight,
-                             const  float raster2camera[4][4], 
+                             const  float raster2camera[4][4],
                             const  float camera2world[4][4],
                              float image[],  int id[],
                             const  LinearBVHNode nodes[],
@@ -328,8 +328,8 @@ void raytrace_tile_task( int width,  int height,
     int x1 = min(x0 + dx, width);
     int y0 = (taskIndex / xBuckets) * dy;
     int y1 = min(y0 + dy, height);
-                             
-    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
+
+    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
 }
@@ -337,7 +337,7 @@ void raytrace_tile_task( int width,  int height,

 extern "C" __global__ void raytrace_ispc_tasks___export( int width,  int height,
                                 int baseWidth,  int baseHeight,
-                                const  float raster2camera[4][4], 
+                                const  float raster2camera[4][4],
                                const  float camera2world[4][4],
                                 float image[],  int id[],
                                const  LinearBVHNode nodes[],
@@ -347,8 +347,8 @@ extern "C" __global__ void raytrace_ispc_tasks___export( int width,  int height,
     int yBuckets = (height + (dy-1)) / dy;
     int nTasks = xBuckets * yBuckets;
     launch(nTasks,1,1,raytrace_tile_task)
-       (width, height, baseWidth, baseHeight, 
-        raster2camera, camera2world, 
+       (width, height, baseWidth, baseHeight,
+        raster2camera, camera2world,
        image, id, nodes, triangles);
     cudaDeviceSynchronize();
 }
@@ -357,7 +357,7 @@ extern "C" __global__ void raytrace_ispc_tasks___export( int width,  int height,

 extern "C" __host__ void raytrace_ispc_tasks( int width,  int height,
    int baseWidth,  int baseHeight,
-    const  float raster2camera[4][4], 
+    const  float raster2camera[4][4],
    const  float camera2world[4][4],
    float image[],  int id[],
    const  LinearBVHNode nodes[],
--- a/examples/portable/rt/rt.ispc
+++ b/examples/portable/rt/rt.ispc
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #if 1
@@ -90,7 +90,7 @@ static inline float Dot(const float3 a, const float3 b) {
 #if 1
 inline
 #endif
-static void generateRay(uniform const float raster2camera[4][4], 
+static void generateRay(uniform const float raster2camera[4][4],
                        uniform const float camera2world[4][4],
                        float x, float y, Ray &ray) {
    ray.mint = 0.f;
@@ -107,11 +107,11 @@ static void generateRay(uniform const float raster2camera[4][4],
    camy /= camw;
    camz /= camw;

-    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + 
+    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
        camera2world[0][2] * camz;
-    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + 
+    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
        camera2world[1][2] * camz;
-    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + 
+    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
        camera2world[2][2] * camz;

    ray.origin.x = camera2world[0][3] / camera2world[3][3];
@@ -129,7 +129,7 @@ static void generateRay(uniform const float raster2camera[4][4],
 #if 1
 inline
 #endif
-static bool_t BBoxIntersect(const uniform float bounds[2][3], 
+static bool_t BBoxIntersect(const uniform float bounds[2][3],
                          const Ray &ray) {
    const uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
    const uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -162,7 +162,7 @@ static bool_t BBoxIntersect(const uniform float bounds[2][3],
    }
    t0 = max(tNear.z, t0);
    t1 = min(tFar.z, t1);
-    
+
    return (t0 <= t1);
 }

@@ -215,7 +215,7 @@ static bool_t TriIntersect(const uniform_t Triangle tri, Ray &ray) {
 inline
 #endif
 bool_t
-BVHIntersect(const uniform LinearBVHNode nodes[], 
+BVHIntersect(const uniform LinearBVHNode nodes[],
                  const uniform Triangle tris[], Ray &r) {
    Ray ray = r;
    bool_t hit = false;
@@ -235,7 +235,7 @@ BVHIntersect(const uniform LinearBVHNode nodes[],
                    if (TriIntersect(tris[primitivesOffset+i], ray))
                        hit = true;
                }
-                if (todoOffset == 0) 
+                if (todoOffset == 0)
                    break;
                nodeNum = todo[--todoOffset];
            }
@@ -276,10 +276,10 @@ BVHIntersect(const uniform LinearBVHNode nodes[],
 inline
 #endif
 static void raytrace_tile(uniform int x0, uniform int x1,
-                          uniform int y0, uniform int y1, 
+                          uniform int y0, uniform int y1,
                          uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
-                          const uniform float raster2camera[4][4], 
+                          const uniform float raster2camera[4][4],
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const uniform LinearBVHNode nodes[],
@@ -302,7 +302,7 @@ static void raytrace_tile(uniform int x0, uniform int x1,

 export void raytrace_ispc(uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
-                          const uniform float raster2camera[4][4], 
+                          const uniform float raster2camera[4][4],
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const uniform LinearBVHNode nodes[],
@@ -315,7 +315,7 @@ export void raytrace_ispc(uniform int width, uniform int height,

 task void raytrace_tile_task(uniform int width, uniform int height,
                             uniform int baseWidth, uniform int baseHeight,
-                             const uniform float raster2camera[4][4], 
+                             const uniform float raster2camera[4][4],
                             const uniform float camera2world[4][4],
                             uniform float image[], uniform int id[],
                             const uniform LinearBVHNode nodes[],
@@ -326,8 +326,8 @@ task void raytrace_tile_task(uniform int width, uniform int height,
    const uniform int x1 = min(x0 + dx, width);
    const uniform int y0 = (taskIndex / xBuckets) * dy;
    const uniform int y1 = min(y0 + dy, height);
-                             
-    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
+
+    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
 }
@@ -335,7 +335,7 @@ task void raytrace_tile_task(uniform int width, uniform int height,

 export void raytrace_ispc_tasks(uniform int width, uniform int height,
                                uniform int baseWidth, uniform int baseHeight,
-                                const uniform float raster2camera[4][4], 
+                                const uniform float raster2camera[4][4],
                                const uniform float camera2world[4][4],
                                uniform float image[], uniform int id[],
                                const uniform LinearBVHNode nodes[],
@@ -344,8 +344,8 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
    const uniform int xBuckets = (width + (dx-1)) / dx;
    const uniform int yBuckets = (height + (dy-1)) / dy;
    const uniform int nTasks = xBuckets * yBuckets;
-    launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight, 
-                                      raster2camera, camera2world, 
+    launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
+                                      raster2camera, camera2world,
                                      image, id, nodes, triangles);
 }

--- a/examples/portable/volume_rendering/volume.cpp
+++ b/examples/portable/volume_rendering/volume.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #ifdef _MSC_VER
--- a/examples/portable/volume_rendering/volume.cu
+++ b/examples/portable/volume_rendering/volume.cu
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #include "cuda_helpers.cuh"
-__device__ static inline float clamp(float v, float low, float high) 
+__device__ static inline float clamp(float v, float low, float high)
 {
      return min(max(v, low), high);
 }
@@ -90,7 +90,7 @@ struct Ray {


 __device__ static void
-generateRay(const float raster2camera[4][4], 
+generateRay(const float raster2camera[4][4],
            const float camera2world[4][4],
            float x, float y, Ray &ray) {
    // transform raster coordinate (x, y, 0) to camera space
@@ -149,7 +149,7 @@ IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
    }
    t0 = max(tNear.z, t0);
    t1 = min(tFar.z, t1);
-    
+
    if (t0 <= t1) {
        hit0 = t0;
        hit1 = t1;
@@ -165,7 +165,7 @@ __device__ static inline float Lerp(float t, float a, float b) {
 }


-__device__ static inline float D(int x, int y, int z, int nVoxels[3], 
+__device__ static inline float D(int x, int y, int z, int nVoxels[3],
                      float density[]) {
    x = clamp(x, 0, nVoxels[0]-1);
    y = clamp(y, 0, nVoxels[1]-1);
@@ -180,9 +180,9 @@ __device__ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
 }


-__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
+__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
                     float density[], int nVoxels[3]) {
-    if (!Inside(Pobj, pMin, pMax)) 
+    if (!Inside(Pobj, pMin, pMax))
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
    float3 vox = Offset(Pobj, pMin, pMax);
@@ -193,13 +193,13 @@ __device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;

    // Trilinearly interpolate density values to compute local density
-    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),     
+    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
                     D(vx+1, vy, vz, nVoxels, density));
-    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),   
+    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
                     D(vx+1, vy+1, vz, nVoxels, density));
-    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),   
+    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
                     D(vx+1, vy, vz+1, nVoxels, density));
-    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), 
+    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
                     D(vx+1, vy+1, vz+1, nVoxels, density));
    float d0 = Lerp(dy, d00, d10);
    float d1 = Lerp(dy, d01, d11);
@@ -213,7 +213,7 @@ __device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
   array. */
 __device__ static inline float
 transmittance(float3 p0, float3 p1, float3 pMin,
-              float3 pMax, float sigma_t, 
+              float3 pMax, float sigma_t,
              float density[], int nVoxels[3]) {
    float rayT0, rayT1;
    Ray ray;
@@ -253,7 +253,7 @@ distanceSquared(float3 a, float3 b) {
 }


-__device__ static inline float 
+__device__ static inline float
 raymarch(float density[], int nVoxels[3], Ray ray) {
    float rayT0, rayT1;
    float3 pMin = {.3f, -.2f, .3f}, pMax = {1.8f, 2.3f, 1.8f};
@@ -281,7 +281,7 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
-    while (t < rayT1) 
+    while (t < rayT1)
    {
        float d = Density(pos, pMin, pMax, density, nVoxels);

@@ -291,7 +291,7 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
            break;

        // direct lighting
-        float Li = lightIntensity / distanceSquared(lightPos, pos) * 
+        float Li = lightIntensity / distanceSquared(lightPos, pos) *
            transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
                          density, nVoxels);
        L += stepDist * atten * d * sigma_s * (Li + Le);
@@ -314,20 +314,20 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
 */
 __device__ static void
 volume_tile(int x0, int y0, int x1,
-            int y1, float density[], int nVoxels[3], 
+            int y1, float density[], int nVoxels[3],
            const float raster2camera[4][4],
-            const float camera2world[4][4], 
+            const float camera2world[4][4],
            int width, int height, float image[]) {
    // Work on 4x4=16 pixel big tiles of the image.  This function thus
    // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
    // by 4.
    for (int y = y0; y < y1; y += 8) {
        for (int x = x0; x < x1; x += 8) {
-              for (int ob = 0; ob < 64; ob += programCount)            
+              for (int ob = 0; ob < 64; ob += programCount)
              {
                const int o = ob + programIndex;

-          
+
                // These two arrays encode the mapping from [0,15] to
                // offsets within the 4x4 pixel block so that we render
                // each pixel inside the block
@@ -360,9 +360,9 @@ volume_tile(int x0, int y0, int x1,


 __global__ void
-volume_task(float density[], int _nVoxels[3], 
+volume_task(float density[], int _nVoxels[3],
            const float _raster2camera[4][4],
-            const float _camera2world[4][4], 
+            const float _camera2world[4][4],
            int width, int height, float image[]) {
  if (taskIndex0 >= taskCount0) return;

@@ -389,7 +389,7 @@ volume_task(float density[], int _nVoxels[3],
  raster2camera[3][1] = _raster2camera[3][1];
  raster2camera[3][2] = _raster2camera[3][2];
  raster2camera[3][3] = _raster2camera[3][3];
-  
+
  float camera2world[4][4];
  camera2world[0][0] = _camera2world[0][0];
  camera2world[0][1] = _camera2world[0][1];
@@ -430,24 +430,24 @@ volume_task(float density[], int _nVoxels[3],

 extern "C"
 __global__ void
-volume_ispc_tasks___export( float density[],  int nVoxels[3], 
+volume_ispc_tasks___export( float density[],  int nVoxels[3],
    const  float raster2camera[4][4],
-    const  float camera2world[4][4], 
+    const  float camera2world[4][4],
    int width,  int height,  float image[]) {
  // Launch tasks to work on (dx,dy)-sized tiles of the image
  int dx = 8, dy = 8;
  int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
  launch(nTasks,1,1,volume_task)
-    (density, nVoxels, raster2camera, camera2world, 
+    (density, nVoxels, raster2camera, camera2world,
     width, height, image);
  cudaDeviceSynchronize();
 }

 extern "C"
 __host__ void
-volume_ispc_tasks( float density[],  int nVoxels[3], 
+volume_ispc_tasks( float density[],  int nVoxels[3],
    const  float raster2camera[4][4],
-    const  float camera2world[4][4], 
+    const  float camera2world[4][4],
    int width,  int height,  float image[]) {
  volume_ispc_tasks___export<<<1,32>>>(density, nVoxels, raster2camera, camera2world, width, height,image);
  cudaDeviceSynchronize();
--- a/examples/portable/volume_rendering/volume.ispc
+++ b/examples/portable/volume_rendering/volume.ispc
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


@@ -41,7 +41,7 @@ struct Ray {


 static inline void
-generateRay(const uniform float raster2camera[4][4], 
+generateRay(const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4],
            float x, float y, Ray &ray) {
    // transform raster coordinate (x, y, 0) to camera space
@@ -100,7 +100,7 @@ IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
    }
    t0 = max(tNear.z, t0);
    t1 = min(tFar.z, t1);
-    
+
    if (t0 <= t1) {
        hit0 = t0;
        hit1 = t1;
@@ -116,7 +116,7 @@ static inline float Lerp(float t, float a, float b) {
 }


-static inline float D(int x, int y, int z, uniform int nVoxels[3], 
+static inline float D(int x, int y, int z, uniform int nVoxels[3],
                      uniform float density[]) {
    x = clamp(x, 0, nVoxels[0]-1);
    y = clamp(y, 0, nVoxels[1]-1);
@@ -131,9 +131,9 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
 }


-static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
+static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
                     uniform float density[], uniform int nVoxels[3]) {
-    if (!Inside(Pobj, pMin, pMax)) 
+    if (!Inside(Pobj, pMin, pMax))
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
    float3 vox = Offset(Pobj, pMin, pMax);
@@ -144,13 +144,13 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;

    // Trilinearly interpolate density values to compute local density
-    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),     
+    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
                     D(vx+1, vy, vz, nVoxels, density));
-    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),   
+    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
                     D(vx+1, vy+1, vz, nVoxels, density));
-    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),   
+    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
                     D(vx+1, vy, vz+1, nVoxels, density));
-    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), 
+    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
                     D(vx+1, vy+1, vz+1, nVoxels, density));
    float d0 = Lerp(dy, d00, d10);
    float d1 = Lerp(dy, d01, d11);
@@ -164,7 +164,7 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
   array. */
 static inline float
 transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
-              uniform float3 pMax, uniform float sigma_t, 
+              uniform float3 pMax, uniform float sigma_t,
              uniform float density[], uniform int nVoxels[3]) {
    float rayT0, rayT1;
    Ray ray;
@@ -204,7 +204,7 @@ distanceSquared(float3 a, float3 b) {
 }


-static inline float 
+static inline float
 raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
    float rayT0, rayT1;
    const uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
@@ -232,7 +232,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
-    while (t < rayT1) 
+    while (t < rayT1)
    {
        float d = Density(pos, pMin, pMax, density, nVoxels);

@@ -242,7 +242,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
            break;

        // direct lighting
-        float Li = lightIntensity / distanceSquared(lightPos, pos) * 
+        float Li = lightIntensity / distanceSquared(lightPos, pos) *
            transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
                          density, nVoxels);
        L += stepDist * atten * d * sigma_s * (Li + Le);
@@ -265,16 +265,16 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
 */
 static inline void
 volume_tile(uniform int x0, uniform int y0, uniform int x1,
-            uniform int y1, uniform float density[], uniform int nVoxels[3], 
+            uniform int y1, uniform float density[], uniform int nVoxels[3],
            const uniform float raster2camera[4][4],
-            const uniform float camera2world[4][4], 
+            const uniform float camera2world[4][4],
            uniform int width, uniform int height, uniform float image[]) {
  // Work on 4x4=16 pixel big tiles of the image.  This function thus
  // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
  // by 4.
 #if 0
-  for (uniform int y = y0; y < y1; y += 8) 
-    for (uniform int x = x0; x < x1; x += 8) 
+  for (uniform int y = y0; y < y1; y += 8)
+    for (uniform int x = x0; x < x1; x += 8)
      foreach (o = 0 ... 64)
      {
        // These two arrays encode the mapping from [0,15] to
@@ -304,7 +304,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
          image[offset] = raymarch(density, nVoxels, ray);
      }
 #else
-  foreach_tiled (y = y0 ... y1, x = x0 ... x1) 
+  foreach_tiled (y = y0 ... y1, x = x0 ... x1)
  {
    // Use viewing parameters to compute the corresponding ray
    // for the pixel
@@ -321,10 +321,10 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,


 task void
-volume_task(uniform float density[], uniform int _nVoxels[3], 
+volume_task(uniform float density[], uniform int _nVoxels[3],
            const uniform float _raster2camera[4][4],
-            const uniform float _camera2world[4][4], 
-            uniform int width, uniform int height, uniform float image[]) 
+            const uniform float _camera2world[4][4],
+            uniform int width, uniform int height, uniform float image[])
 {
  if (taskIndex >= taskCount) return;

@@ -351,7 +351,7 @@ volume_task(uniform float density[], uniform int _nVoxels[3],
  raster2camera[3][1] = _raster2camera[3][1];
  raster2camera[3][2] = _raster2camera[3][2];
  raster2camera[3][3] = _raster2camera[3][3];
-  
+
  uniform float camera2world[4][4];
  camera2world[0][0] = _camera2world[0][0];
  camera2world[0][1] = _camera2world[0][1];
@@ -390,24 +390,24 @@ volume_task(uniform float density[], uniform int _nVoxels[3],


 export void
-volume_ispc(uniform float density[], uniform int nVoxels[3], 
+volume_ispc(uniform float density[], uniform int nVoxels[3],
            const uniform float raster2camera[4][4],
-            const uniform float camera2world[4][4], 
+            const uniform float camera2world[4][4],
            uniform int width, uniform int height, uniform float image[]) {
-    volume_tile(0, 0, width, height, density, nVoxels, raster2camera, 
+    volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
                camera2world, width, height,  image);
 }


 export void
-volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], 
+volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
                  const uniform float raster2camera[4][4],
-                  const uniform float camera2world[4][4], 
+                  const uniform float camera2world[4][4],
                  uniform int width, uniform int height, uniform float image[]) {
    // Launch tasks to work on (dx,dy)-sized tiles of the image
    const uniform int dx = 8, dy = 8;
    const uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
-    launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world, 
+    launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
                               width, height, image);
    sync;
 }