From 4b7dbbf43bb51e33810759cda268440b9785a7f6 Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Mon, 18 Nov 2013 12:46:30 +0100
Subject: [PATCH 1/8] added cuda kernel

---
 .../mandelbrot_tasks3d/mandelbrot_tasks3d.cu  | 104 ++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu

diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu
new file mode 100644
index 00000000..e642042a
--- /dev/null
+++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu
@@ -0,0 +1,104 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#define programCount 32
+#define programIndex (threadIdx.x & 31)
+#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
+#define taskCount0 (gridDim.x*4)
+#define taskIndex1 (blockIdx.y)
+#define taskCount1 (gridDim.y)
+#define warpIdx (threadIdx.x >> 5)
+
+
+__device__
+static inline int
+mandel(float c_re, float c_im, int count) {
+    float z_re = c_re, z_im = c_im;
+    int i;
+    for (i = 0; i < count; ++i) {
+        if (z_re * z_re + z_im * z_im > 4.0f)
+            break;
+
+        float new_re = z_re*z_re - z_im*z_im;
+        float new_im = 2.f * z_re * z_im;
+        z_re = c_re + new_re;
+        z_im = c_im + new_im;
+    }
+
+    return i;
+}
+
+
+/* Task to compute the Mandelbrot iterations for a single scanline.
+ */
+__global__ void
+mandelbrot_scanline( float x0,  float dx, 
+                     float y0,  float dy,
+                     int width,  int height, 
+                     int xspan,  int yspan,
+                     int maxIterations,  int output[]) {
+    const  int xstart = taskIndex0 * xspan;
+    const  int xend   = min(xstart  + xspan, width);
+
+    const  int ystart = taskIndex1 * yspan;
+    const  int yend   = min(ystart  + yspan, height);
+   
+  for ( int yi = ystart; yi < yend; yi++)
+    for ( int xi = xstart; xi < xend; xi += programCount)
+    {
+      const float x = x0 + (xi + programIndex) * dx;
+      const float y = y0 +  yi              * dy;
+
+      const int res = mandel(x,y,maxIterations);
+      const int index = yi * width + (xi + programIndex);
+      if (xi + programIndex < xend)
+        output[index] = res;
+    }
+}
+
+extern "C" __global__ void
+mandelbrot_ispc( float x0,  float y0, 
+                 float x1,  float y1,
+                 int width,  int height, 
+                 int maxIterations,  int output[]) {
+     float dx = (x1 - x0) / width;
+     float dy = (y1 - y0) / height;
+    const  int xspan = 64;  /* make sure it is big enough to avoid false-sharing */
+    const  int yspan = 8; 
+
+
+    if (programIndex == 0)
+      mandelbrot_scanline<<<dim3((width+4-1)/xspan/4,height/yspan),128>>>
+        (x0, dx, y0, dy, width, height, xspan, yspan,  maxIterations, output);
+    cudaDeviceSynchronize();
+}

From db4abfe198d0519e570b9ca6d5589f1d08e9d45d Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Mon, 18 Nov 2013 12:58:30 +0100
Subject: [PATCH 2/8] +1

---
 examples_cuda/deferred/kernels.cu       |  54 +++---------------------
 examples_cuda/stencil/.stencil.ispc.swn | Bin 16384 -> 0 bytes
 2 files changed, 5 insertions(+), 49 deletions(-)
 delete mode 100644 examples_cuda/stencil/.stencil.ispc.swn

diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu
index 2388ea22..9914256c 100644
--- a/examples_cuda/deferred/kernels.cu
+++ b/examples_cuda/deferred/kernels.cu
@@ -95,7 +95,7 @@ dot3(float x, float y, float z, float a, float b, float c) {
 }
 
 
-#if 0
+#if 1
 static __shared__ int shdata_full[128];
 template<typename T, int N>
 struct Uniform
@@ -171,7 +171,7 @@ struct Uniform
     shptr[chunk][elem] = value;
   }
 };
-#elif 0
+#elif 1
 template<typename T, int N>
 struct Uniform
 {
@@ -274,36 +274,6 @@ static float reduce_max(float value)
   return value;
 }
 
-#if 0
-__device__ inline
-static int reduce_sum(int value)
-{
-#pragma unroll
-  for (int i = 4; i >=0; i--)
-    value +=  __shfl_xor(value, 1<<i, 32);
-  return value;
-}
-static __device__ __forceinline__ uint shfl_scan_add_step(uint partial, uint up_offset)
-{
-  uint result;
-  asm(
-      "{.reg .u32 r0;"
-      ".reg .pred p;"
-      "shfl.up.b32 r0|p, %1, %2, 0;"
-      "@p add.u32 r0, r0, %3;"
-      "mov.u32 %0, r0;}"
-      : "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
-  return result;
-}
-static __device__ __forceinline__ int inclusive_scan_warp(const int value)
-{
-  uint sum = value;
-#pragma unroll
-  for(int i = 0; i < 5; ++i)
-    sum = shfl_scan_add_step(sum, 1 << i);
-  return sum - value;
-}
-#endif
 
 
 static __device__ __forceinline__ int lanemask_lt()
@@ -451,7 +421,7 @@ IntersectLightsWithTileMinMax(
         // don't actually need to mask the rest of this function - this is
         // just a greedy early-out.  Could also structure all of this as
         // nested if() statements, but this a bit easier to read
-        if (__ballot(inFrustum) > 0) 
+        if (any(inFrustum))
         {
             float light_positionView_x = light_positionView_x_array[lightIndex];
             float light_positionView_y = light_positionView_y_array[lightIndex];
@@ -474,11 +444,7 @@ IntersectLightsWithTileMinMax(
         
             // Pack and store intersecting lights
             const bool active = inFrustum && lightIndex < numLights;
-#if 0
-            if (__ballot(active) > 0)
-              tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex);
-#else
-            if (__ballot(active) > 0)
+            if (any(active))
             {
               const int2 res = warpBinExclusiveScan(active);
               const int idx = tileNumLights + res.y;
@@ -486,7 +452,6 @@ IntersectLightsWithTileMinMax(
               tileLightIndices.set(active, idx, lightIndex);
               tileNumLights += nactive;
             }
-#endif
         }
     }
 
@@ -717,7 +682,7 @@ ShadeTile(
                 lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                 lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
                 
-              if (x >= tileEndX) break;
+        //      if (x >= tileEndX) break;
                 framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                 framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                 framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -744,7 +709,6 @@ RenderTile( int num_groups_x,  int num_groups_y,
   const InputHeader &inputHeader = *inputHeaderPtr;
   const InputDataArrays &inputData = *inputDataPtr;
 
-#if 1
      int32 group_y = taskIndex / num_groups_x;
      int32 group_x = taskIndex % num_groups_x;
 
@@ -759,17 +723,11 @@ RenderTile( int num_groups_x,  int num_groups_y,
      float cameraProj_11 = inputHeader.cameraProj[1][1];
      float cameraProj_22 = inputHeader.cameraProj[2][2];
      float cameraProj_32 = inputHeader.cameraProj[3][2];
-#endif
 
     // Light intersection: figure out which lights illuminate this tile.
      Uniform<int,MAX_LIGHTS> tileLightIndices;  // Light list for the tile
 
-#if 0
-     tileLightIndices.set(threadIdx.x&1, threadIdx.x, framebuffer_g[blockIdx.x]);
-     framebuffer_r[threadIdx.x] = tileLightIndices.get(threadIdx.x);
-#endif
 
-#if 1
      int numTileLights = 
         IntersectLightsWithTile(tile_start_x, tile_end_x, 
                                 tile_start_y, tile_end_y,
@@ -791,7 +749,6 @@ RenderTile( int num_groups_x,  int num_groups_y,
               cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
               tileLightIndices, numTileLights, visualizeLightCount, 
               framebuffer_r, framebuffer_g, framebuffer_b);
-#endif
 }
 
 
@@ -806,7 +763,6 @@ RenderStatic(InputHeader inputHeaderPtr[],
              unsigned int8 framebuffer_b[]) {
 
   const InputHeader inputHeader = *inputHeaderPtr;
-  const InputDataArrays inputData = *inputDataPtr;
 
     int num_groups_x = (inputHeader.framebufferWidth + 
                                 MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
diff --git a/examples_cuda/stencil/.stencil.ispc.swn b/examples_cuda/stencil/.stencil.ispc.swn
deleted file mode 100644
index ad3f6c7805836bce454e0ea3c69a06f27d7ae0cb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16384
zcmeHOU2No56?PX|3bYh02&obehus#<Br}stg4CZSNY2EYSencX_RJ>f5=GAV&c<4g
zJ=mUPCbZ=R34v4|SSnHe9*__(JRk%Tg%>0qkXj)j0pg(`fdHxei}D0&f$v<~i6@!e
zY`ZTA*~&K_U!Qx<x!*na+V|Ya?)(*fogS$yC^+7!D4%%cQ^s!}s3_mYcd>tMPk4^Q
zw{CS@7Xw!Wm2epJLMyN;o;zLo=~Urd==Y*ICmeLkUEdb++ng9#edq~yx<lKV3nSrm
z9XCB!Sa`bDyfkMZXW;f2xL;XZsLr$O#S8DFcb)ItUbS2*XCP-FXCP-FXCP-FXCP-F
zXCP<b|AK*Na8CI+T5>LF%~o>0>%{p@$@Rm@`H2(vlgs>+GmtZoGmtZoGmtZoGmtZo
zGmtZoGmtZoGmtazKgfV(E6Tg!*XMY_k3Ii%{r}nnit-cSdEmQ12+RN%feXN|-k~VZ
z0iOq2z&*hC?^l%X0b`&7{OvwP`77`j;H$tE@E+iWw=2riz$3ul-=-*E2X=r5fnU5;
zQGN#e6!-yf1Ly&d0k7VxDBl4>paHA{=YjixU%y3Bz6@*vGr&LYQIsD7p9QW0v%tUZ
zMj3De=mNU{0rvoR1OIumqPzyY3j7Ip33w5B0eBwxB+vma0>42*<XNBr%mViUFC&Ta
zOW-BoYk&`Uz@xx9;4a`r@cb;meBWTsryWFH-wPuedQQ&|`s8>K9l91uXo>ocS2(Pe
z=&<JmD48X*r8b@@UQ(vkiiMWwa2}G86<*gpTZ|THCa!h~3`{XL&KeuT*f?u!>n6q$
zp(Pqm7^^16wr*l<lr=Vju~F98=1q(xLQ6E7FgBkU+myzL0uc=ZkICU9L%@fY<W<cd
zdeK5rk++ZUS1EhHN>4sX5OI1`I8re`Au^wc#CncD(=LoV)b@wFt{~O%u5Oo(x;HKL
zhtbvTB7F>&P99U$bbZmg`hkq12_yUAM`7d^x;UyDqhmMZ_4vf~;aM6E=O=3*G|#X(
zK2Zblr6fQ1*}-|R!xR_N7PD8Q6EzSds9{A;SN0g~`;LuaX^WoYi8!$7(TVG=6W5!x
zh+jNCB|UwL$3T2AXATdLZG@5S?5A;~?s#Gp72pQ{6)_4IDjaDleykK67ne~mDL6~V
z%GpG=!t7!ZV=`M(UM$LJPrS*>&n`;C8ToQjqNLI?92VL0LabEQnW!e~#3e`7xO&!!
z*@aVXEfi5Sy~6CmgaOb~PD+mCrC$qKzgS|Tnyhn{U)587Rd3-J<`HyMman8_(ywaP
zFP507Rz5CCswHC;v4J3vEt_WzSpwPYQSc=JfnkP`byy^o2e}L}_*NLPhlRqp2o;Uw
z0z!N-E^cR(aF`wI!1QsuDo7qYh~gScR5(oZv+41X;ZYpx@#KT2j<C9WEGr)!l}d&9
znKQ;ECU?H%r@{!An11<kgx-lm{q&;?je%fe#J0xikcu5Ungb`MuGcj`rW3`4hSxte
zrfS~MteCBQ=7g9PK5H(#-D%MAT%P*kiEw6)gVk(aZf~fDRwC_+)*$^dsVi4>Q(K|t
zWooZ!)M}chzO2{v_7)eJo!T0$sO##gRv~`cQA1m;>#JI$rcu+NCh8a)P=FvdbW?Av
zlHMS-u|*py6m;6D-i`@*HldrWrLL>7krHjJX{?0n)tZgAq1M_ZGTVk;YiGq^tlc!)
zRL~o>dS?ZeSWnAnUe?>@CH7^x1N)k3LQ`8aj83bqHycG-Yi^*e2Gvyfuma!E*4Rau
zZ5rInL{qI}XzN--`fPTV&9>g|v^837HdjnGU~0w{y{5rh9V~Hkps@sc+G+{+S&O!*
zt+(o`vBd@49#i{p2P$>7p3rD!T2O+|ZE7^Rq3e1Z-nE<ZK^fNt2e{ufOj_0?6?Kg+
z;}+Aqq8l0$r8kbh)$|pZQtP0vrPW|k>T0P*Td#HC6ty}A`d33&X;5I+>Z-o3tyGx$
zMpH&ao6I$}UYDUzYqquweRU0dY&104Jh&%s>1d=5cM(d4iq>h8jdjhat>H51T1ho}
zNA`*In;Fv03-SgxLfdQ^++k?bX^q^rq^D2u#ELkSj^Y_=ITo&fffZWbI^q9Xv%aE%
zM(%qVV=1o}R1>e-P=+Jgb3)?ZARkzPpaUn`!>ml<Ky;lR-YEuw;|FvQI1zq?Oal@0
zoiKEKucFA>{JtCo&h9YsgOK{xn05sXLt#_olkjYm3W9<`;P?GV@Yx=AqmXP7IQtOm
z;f7R&`B1>P2;h+*&+4OIk902*F4g>C;0Jie@jddOEtLpa#EHVREw<ks_Jzl9Ly)vQ
zdk(_BRNA*9ShU=bUD$VQnC7-hRkjnxZOF9C^YKO{N-;ey1lSZBYdfqvWaYAM&v#w_
z!11o(6)EhxmeWTI9FOlM%`!w*f5<m?9nT8JB-b<jA#?{KCPqk|)~>%VU~DiBoNIeg
z+PXV&DekTrI_V4=75c-V%jRrBi8GNEw}7U2(ihR=Xj$L4IX)!QCF0PLv4JSDj~R~k
ztmp(wSqcNDXwoCeE8>S)N*rURGlYeuxZ^k~<6vceRM#a*RtPRb5g@d<$+eV2Ds+pq
zFkhW7;{#9YQ*out%_zM7e+=vHKLEV`e;6k|euOptSAZ`8Uj)7YJO?}jw1M{mzr&jU
z8-NXzfq!Dn|1IE~z-NGGfL-7tz<J>JSo{AN_%xsc4*?GXKSv(m2G9pw;344O@aGla
z&%hr6_U~mNKXL|g266^+266^+266^+266^&kAXDVOY#WEUN=oe&CL<A$w9;`8oA6u
zt`|!z5wE_|RUf1_ut*c}I`6=8B3{gpg|~35D7c<$4ZZFj-$w#F<kcYZ(N4IB^&uzJ
z(lt%&02UvjS0ZGbOGpaZHa^9^@Fw%Wr_<t`3g<l%P8@Lxj<Zdind3x$11ZMFVTrH^
zNRHT}tBqys7?mL!ZxwBG!MIw64ldX#%7Rg~4CP#~S=>%_2sez^bg9Oy*X(cX{Djwc
z==2@?%)2y&h~7^~NiHoR3#`P>qXW@V)nuhghlFm%!a41|ec#1GI@8-N{4cZX3fDOl
zWOV~S4AEy;X^|DufpFc&N#;nov#~5g7##S)b>c-hg)SD-((F+_l_pYo5fY-&{3VL$
zgA`RS(d=x*`O|a@1N}29c}27XDdO3JS1ebHb2t^1SYO&3*>jp!Om~yGcc$(#**12W
zq;uKo>GxF4L^}1Rdr5JvsdbK0uxFZjLn+tPUz}6(tmn+^>rLuSOT=bGC%$IwP1~As
zeSg3^S^N*|JN%F9az`ti3;>MVteyyCSP>(ne4-?an{H1b51Sd=+41Md(Kn3b=u7S4
dJSfDQqb0esig`lrr<UZVZn|$=lKaUe<-e5A1RekY


From ee61a265f485f0397bfe63eecaf86a72508419fc Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Mon, 18 Nov 2013 13:01:36 +0100
Subject: [PATCH 3/8] fixed kernel

---
 examples_cuda/deferred/kernels.cu | 148 +++++++++++++-----------------
 1 file changed, 64 insertions(+), 84 deletions(-)

diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu
index 9914256c..2530532a 100644
--- a/examples_cuda/deferred/kernels.cu
+++ b/examples_cuda/deferred/kernels.cu
@@ -95,7 +95,7 @@ dot3(float x, float y, float z, float a, float b, float c) {
 }
 
 
-#if 1
+#if 0
 static __shared__ int shdata_full[128];
 template<typename T, int N>
 struct Uniform
@@ -133,44 +133,6 @@ struct Uniform
     data[chunkIdx] = shdata[programIndex];
   }
 };
-#elif 0
-static __shared__ void* shptr_full[128];
-template<typename T, int N>
-struct Uniform
-{
-  T data[(N+programCount-1)/programCount];
-  T* *shptr;
-
-  __device__ inline Uniform()
-  {
-    shptr = (T**)shptr_full;
-    shptr[threadIdx.x] = data;
-    __syncthreads();
-  }
-
-  __device__ inline int2 get_chunk(const int i) const
-  {
-    const int elem  = i & (programCount - 1);
-    const int chunk = i >> 5;
-    return make_int2(chunk, elem);
-  }
-
-  __device__ inline const T get(const int i) const
-  {
-    const int2 idx = get_chunk(i);
-    const int chunk = idx.x;
-    const int elem  = idx.y;
-    return shptr[chunk][elem];
-  }
-  
-  __device__ inline void set(const bool active, const int i, T value) 
-  {
-    const int2 idx = get_chunk(i);
-    const int chunk = idx.x;
-    const int elem  = idx.y;
-    shptr[chunk][elem] = value;
-  }
-};
 #elif 1
 template<typename T, int N>
 struct Uniform
@@ -181,32 +143,17 @@ struct Uniform
     int32_t ptr[2];
   };
 
-
   __device__ inline Uniform()
   {
-#if 1
     if (programIndex == 0)
-      data = new T[N];
+      data = (T*)malloc(N*sizeof(T));
     ptr[0] = __shfl(ptr[0], 0);
     ptr[1] = __shfl(ptr[1], 0);
-#else
-    __shared__ T *ptr;
-    if (threadIdx.x == 0)
-      ptr = new T[4*N];
-    __syncthreads();
-    data = ptr;
-    data += warpIdx*N;
-#endif
   }
   __device__ inline ~Uniform()
   {
-#if 1
     if (programIndex == 0)
-      delete data;
-#else
-    if (threadIdx.x == 0)
-      delete data;
-#endif
+      free(data);
   }
 
   __device__ inline const T get(const int i) const
@@ -274,6 +221,36 @@ static float reduce_max(float value)
   return value;
 }
 
+#if 0
+__device__ inline
+static int reduce_sum(int value)
+{
+#pragma unroll
+  for (int i = 4; i >=0; i--)
+    value +=  __shfl_xor(value, 1<<i, 32);
+  return value;
+}
+static __device__ __forceinline__ uint shfl_scan_add_step(uint partial, uint up_offset)
+{
+  uint result;
+  asm(
+      "{.reg .u32 r0;"
+      ".reg .pred p;"
+      "shfl.up.b32 r0|p, %1, %2, 0;"
+      "@p add.u32 r0, r0, %3;"
+      "mov.u32 %0, r0;}"
+      : "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
+  return result;
+}
+static __device__ __forceinline__ int inclusive_scan_warp(const int value)
+{
+  uint sum = value;
+#pragma unroll
+  for(int i = 0; i < 5; ++i)
+    sum = shfl_scan_add_step(sum, 1 << i);
+  return sum - value;
+}
+#endif
 
 
 static __device__ __forceinline__ int lanemask_lt()
@@ -421,7 +398,7 @@ IntersectLightsWithTileMinMax(
         // don't actually need to mask the rest of this function - this is
         // just a greedy early-out.  Could also structure all of this as
         // nested if() statements, but this a bit easier to read
-        if (any(inFrustum))
+        if (__ballot(inFrustum) > 0) 
         {
             float light_positionView_x = light_positionView_x_array[lightIndex];
             float light_positionView_y = light_positionView_y_array[lightIndex];
@@ -444,7 +421,11 @@ IntersectLightsWithTileMinMax(
         
             // Pack and store intersecting lights
             const bool active = inFrustum && lightIndex < numLights;
-            if (any(active))
+#if 0
+            if (__ballot(active) > 0)
+              tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex);
+#else
+            if (__ballot(active) > 0)
             {
               const int2 res = warpBinExclusiveScan(active);
               const int idx = tileNumLights + res.y;
@@ -452,6 +433,7 @@ IntersectLightsWithTileMinMax(
               tileLightIndices.set(active, idx, lightIndex);
               tileNumLights += nactive;
             }
+#endif
         }
     }
 
@@ -682,7 +664,6 @@ ShadeTile(
                 lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                 lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
                 
-        //      if (x >= tileEndX) break;
                 framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                 framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                 framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -695,20 +676,19 @@ ShadeTile(
 ///////////////////////////////////////////////////////////////////////////
 // Static decomposition
 
-
 __global__ void
 RenderTile( int num_groups_x,  int num_groups_y,
-           const  InputHeader inputHeaderPtr[],
-           const  InputDataArrays inputDataPtr[],
+           const  InputHeader *inputHeaderPtr,
+           const  InputDataArrays *inputDataPtr,
             int visualizeLightCount,
            // Output
             unsigned int8 framebuffer_r[],
             unsigned int8 framebuffer_g[],
             unsigned int8 framebuffer_b[]) {
   if (taskIndex >= taskCount) return;
-  const InputHeader &inputHeader = *inputHeaderPtr;
-  const InputDataArrays &inputData = *inputDataPtr;
 
+  const  InputHeader inputHeader = *inputHeaderPtr;
+  const  InputDataArrays inputData = *inputDataPtr;
      int32 group_y = taskIndex / num_groups_x;
      int32 group_x = taskIndex % num_groups_x;
 
@@ -726,8 +706,7 @@ RenderTile( int num_groups_x,  int num_groups_y,
 
     // Light intersection: figure out which lights illuminate this tile.
      Uniform<int,MAX_LIGHTS> tileLightIndices;  // Light list for the tile
-
-
+#if 1
      int numTileLights = 
         IntersectLightsWithTile(tile_start_x, tile_end_x, 
                                 tile_start_y, tile_end_y,
@@ -749,33 +728,34 @@ RenderTile( int num_groups_x,  int num_groups_y,
               cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
               tileLightIndices, numTileLights, visualizeLightCount, 
               framebuffer_r, framebuffer_g, framebuffer_b);
+#endif
 }
 
 
-  extern "C"
-__global__ void
-RenderStatic(InputHeader inputHeaderPtr[],
-             InputDataArrays inputDataPtr[],
-             int visualizeLightCount,
+extern "C" __global__ void
+RenderStatic( InputHeader inputHeaderPtr[],
+              InputDataArrays inputDataPtr[],
+              int visualizeLightCount,
              // Output
-             unsigned int8 framebuffer_r[],
-             unsigned int8 framebuffer_g[],
-             unsigned int8 framebuffer_b[]) {
+              unsigned int8 framebuffer_r[],
+              unsigned int8 framebuffer_g[],
+              unsigned int8 framebuffer_b[]) {
 
-  const InputHeader inputHeader = *inputHeaderPtr;
+  const  InputHeader inputHeader = *inputHeaderPtr;
+  const  InputDataArrays inputData = *inputDataPtr;
 
-    int num_groups_x = (inputHeader.framebufferWidth + 
+
+     int num_groups_x = (inputHeader.framebufferWidth + 
                                 MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
-    int num_groups_y = (inputHeader.framebufferHeight + 
+     int num_groups_y = (inputHeader.framebufferHeight + 
                                 MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
-    int num_groups = num_groups_x * num_groups_y;
+     int num_groups = num_groups_x * num_groups_y;
 
     // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
     // by MIN_TILE_HEIGHT pixels.
-    if (programIndex == 0)
-      RenderTile<<<num_groups, 128>>>(num_groups_x, num_groups_y,
-          inputHeaderPtr, inputDataPtr, visualizeLightCount,
-          framebuffer_r, framebuffer_g, framebuffer_b);
-    cudaDeviceSynchronize();
-    cudaDeviceSynchronize();
+     if (programIndex == 0)
+       RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y,
+           inputHeaderPtr, inputDataPtr, visualizeLightCount,
+           framebuffer_r, framebuffer_g, framebuffer_b);
+     cudaDeviceSynchronize();
 }

From 4f9b8ebc7319b44e9e438108e24c7115c973e6f3 Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Mon, 18 Nov 2013 13:11:57 +0100
Subject: [PATCH 4/8] +1

---
 examples_cuda/rt/rt.cu | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/examples_cuda/rt/rt.cu b/examples_cuda/rt/rt.cu
index 2575662d..b7277cf6 100644
--- a/examples_cuda/rt/rt.cu
+++ b/examples_cuda/rt/rt.cu
@@ -319,7 +319,6 @@ static inline void raytrace_tile( int x0,  int x1,
 
 
-extern "C"
 __global__
 void raytrace_tile_task( int width,  int height,
                               int baseWidth,  int baseHeight,
@@ -330,7 +329,7 @@ void raytrace_tile_task( int width,  int height,
                              const  Triangle triangles[]) {
 
   if (taskIndex >= taskCount) return;
-     int dx = 32, dy = 16; // must match dx, dy below
+     int dx = 64, dy = 8; // must match dx, dy below
      int xBuckets = (width + (dx-1)) / dx;
      int x0 = (taskIndex % xBuckets) * dx;
      int x1 = min(x0 + dx, width);
@@ -343,3 +342,20 @@ void raytrace_tile_task( int width,  int height,
 }
 
 
+extern "C" __global__ void raytrace_ispc_tasks( int width,  int height,
+                                 int baseWidth,  int baseHeight,
+                                const  float raster2camera[4][4], 
+                                const  float camera2world[4][4],
+                                 float image[],  int id[],
+                                const  LinearBVHNode nodes[],
+                                const  Triangle triangles[]) {
+     int dx = 64, dy = 8;
+     int xBuckets = (width + (dx-1)) / dx;
+     int yBuckets = (height + (dy-1)) / dy;
+     int nTasks = xBuckets * yBuckets;
+     if (programIndex == 0)
+       raytrace_tile_task<<<(nTasks-1+4)/4,128>>>(width, height, baseWidth, baseHeight, 
+           raster2camera, camera2world, 
+           image, id, nodes, triangles);
+     cudaDeviceSynchronize();
+}

From 64762c5acddba333f2d86c02b4df07479d98525f Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Mon, 18 Nov 2013 13:15:05 +0100
Subject: [PATCH 5/8] +1

---
 examples_cuda/rt/rt.cu | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples_cuda/rt/rt.cu b/examples_cuda/rt/rt.cu
index b7277cf6..46267941 100644
--- a/examples_cuda/rt/rt.cu
+++ b/examples_cuda/rt/rt.cu
@@ -2,6 +2,7 @@
 #define programIndex (threadIdx.x & 31)
 #define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
 #define taskCount (gridDim.x*4)
+#define warpIdx (threadIdx.x >> 5)
 
 #define float3 Float3
 struct Float3
@@ -235,7 +236,12 @@ static inline bool BVHIntersect(const  LinearBVHNode nodes[],
     bool hit = false;
     // Follow ray through BVH nodes to find primitive intersections
      int todoOffset = 0, nodeNum = 0;
+#if 0
+     __shared__ int todoX[64*4];
+     volatile int * todo = &todoX[warpIdx * 64];
+#else
      int todo[64];
+#endif
 
     while (true) {
         // Check ray against BVH node

From cf2116e167e409f0e73cd68c92b36202802f332d Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Mon, 18 Nov 2013 13:16:30 +0100
Subject: [PATCH 6/8] +1

---
 examples_cuda/rt/rt.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples_cuda/rt/rt.cu b/examples_cuda/rt/rt.cu
index 46267941..e0c3855a 100644
--- a/examples_cuda/rt/rt.cu
+++ b/examples_cuda/rt/rt.cu
@@ -345,6 +345,7 @@ void raytrace_tile_task( int width,  int height,
     raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
                   raster2camera, camera2world, image,
                   id, nodes, triangles);
+    cudaDeviceSynchronize();
 }
 
 
From 915dc4be7fdf6c8a4bf311801a1869076082571c Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Mon, 18 Nov 2013 13:24:01 +0100
Subject: [PATCH 7/8] +1

---
 examples_cuda/sort/sort1.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples_cuda/sort/sort1.cu b/examples_cuda/sort/sort1.cu
index 49886fdb..2c94a409 100644
--- a/examples_cuda/sort/sort1.cu
+++ b/examples_cuda/sort/sort1.cu
@@ -247,7 +247,7 @@ void sort_ispc ( int n,  unsigned int code[],  int order[],  int ntasks,
 {
    int num = ntasks;
    int span = n / num;
-#if 0
+#if 1
    int hsize = 256*programCount*num;
    int *  hist =  __new< int>(hsize);
    int64 *  pair =  __new< int64>(n);
@@ -293,7 +293,7 @@ void sort_ispc ( int n,  unsigned int code[],  int order[],  int ntasks,
     unpack<<<nbx(num),128>>> (span, n, pair, code, order);
   sync;
 
-#if ALLOCATED
+#ifdef ALLOCATED
   __delete(g);
   __delete(hist);
   __delete(pair);

From 4bc8c79bd3371b663f1b34289df97843f085c990 Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Mon, 18 Nov 2013 13:28:12 +0100
Subject: [PATCH 8/8] fixed cuda kernel

---
 examples_cuda/rt/rt.cu | 91 +++++++++++++++++++-----------------------
 1 file changed, 41 insertions(+), 50 deletions(-)

diff --git a/examples_cuda/rt/rt.cu b/examples_cuda/rt/rt.cu
index e0c3855a..8decd03a 100644
--- a/examples_cuda/rt/rt.cu
+++ b/examples_cuda/rt/rt.cu
@@ -58,17 +58,12 @@ struct Float3
   }
 };
 
-#if 0
-#define DIRISNEG
-#endif
+#define int8 char
+#define int16 short
 
 struct Ray {
     float3 origin, dir, invDir;
-#ifdef DIRISNEG /* this fails to compile with nvvm */
-    unsigned int dirIsNeg[3];
-#else
     unsigned int dirIsNeg0, dirIsNeg1, dirIsNeg2;
-#endif
     float mint, maxt;
     int hitId;
 };
@@ -79,8 +74,6 @@ struct Triangle {
     int pad[3];
 };
 
-#define int8 char
-#define int16 short
 struct LinearBVHNode {
     float bounds[2][3];
     unsigned int offset;     // num primitives for leaf, second child for interior
@@ -106,7 +99,8 @@ static inline float Dot(const float3 a, const float3 b) {
 }
 
 __device__
-static inline void generateRay( const float raster2camera[4][4], 
+inline
+static void generateRay( const float raster2camera[4][4], 
                          const float camera2world[4][4],
                         float x, float y, Ray &ray) {
     ray.mint = 0.f;
@@ -136,7 +130,7 @@ static inline void generateRay( const float raster2camera[4][4],
 
     ray.invDir = 1.f / ray.dir;
 
-#ifdef DIRISNEG
+#if 0
     ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
     ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
     ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
@@ -147,9 +141,9 @@ static inline void generateRay( const float raster2camera[4][4],
 #endif
 }
 
-
 __device__
-static inline bool BBoxIntersect(const  float bounds[2][3], 
+inline
+static bool BBoxIntersect(const  float bounds[2][3], 
                           const Ray &ray) {
      float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
      float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -187,9 +181,9 @@ static inline bool BBoxIntersect(const  float bounds[2][3],
 }
 
 
-
 __device__
-static inline bool TriIntersect(const  Triangle &tri, Ray &ray) {
+inline
+static bool TriIntersect(const  Triangle &tri, Ray &ray) {
      float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
      float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
      float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
@@ -228,20 +222,15 @@ static inline bool TriIntersect(const  Triangle &tri, Ray &ray) {
     return hit;
 }
 
-
 __device__
-static inline bool BVHIntersect(const  LinearBVHNode nodes[], 
-                  const  Triangle tris[], Ray &r) {
+inline
+bool BVHIntersect(const  LinearBVHNode nodes[], 
+                  const  Triangle tris[], Ray &r,
+                   int todo[]) {
     Ray ray = r;
     bool hit = false;
     // Follow ray through BVH nodes to find primitive intersections
      int todoOffset = 0, nodeNum = 0;
-#if 0
-     __shared__ int todoX[64*4];
-     volatile int * todo = &todoX[warpIdx * 64];
-#else
-     int todo[64];
-#endif
 
     while (true) {
         // Check ray against BVH node
@@ -255,20 +244,17 @@ static inline bool BVHIntersect(const  LinearBVHNode nodes[],
                     if (TriIntersect(tris[primitivesOffset+i], ray))
                         hit = true;
                 }
+                if (todoOffset == 0) 
+                    break;
                 nodeNum = todo[--todoOffset];
             }
             else {
                 // Put far BVH node on _todo_ stack, advance to near node
-#ifdef DIRISNEG
-                const int dirIsNeg = r.dirIsNeg[node.splitAxis];
-#else
                 int dirIsNeg;
                 if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0;
                 if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1;
                 if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2;
-#endif
-                if (dirIsNeg) 
-                {
+                if (dirIsNeg) {
                    todo[todoOffset++] = nodeNum + 1;
                    nodeNum = node.offset;
                 }
@@ -279,10 +265,10 @@ static inline bool BVHIntersect(const  LinearBVHNode nodes[],
             }
         }
         else {
-            nodeNum = todo[--todoOffset];
-        }
             if (todoOffset == 0)
                 break;
+            nodeNum = todo[--todoOffset];
+        }
     }
     r.maxt = ray.maxt;
     r.hitId = ray.hitId;
@@ -290,9 +276,9 @@ static inline bool BVHIntersect(const  LinearBVHNode nodes[],
     return hit;
 }
 
-
 __device__
-static inline void raytrace_tile( int x0,  int x1,
+inline
+static void raytrace_tile( int x0,  int x1,
                            int y0,  int y1, 
                            int width,  int height,
                            int baseWidth,  int baseHeight,
@@ -304,23 +290,30 @@ static inline void raytrace_tile( int x0,  int x1,
      float widthScale = (float)(baseWidth) / (float)(width);
      float heightScale = (float)(baseHeight) / (float)(height);
 
-//    foreach_tiled (y = y0 ... y1, x = x0 ... x1) 
-    for ( int y = y0; y < y1; y++)
-      for ( int xb = x0; xb < x1; xb += programCount)
-    {
-      const int x = xb + programIndex;
-        Ray ray;
-        generateRay(raster2camera, camera2world, x*widthScale,
-                    y*heightScale, ray);
-        BVHIntersect(nodes, triangles, ray);
+#if 0
+   int *  todo =  new  int[64];
+#define ALLOC
+#else
+   int todo[64];
+#endif
 
-        int offset = y * width + x;
+    for (int y = y0 ;y < y1; y++)
+      for (int x = x0 + programIndex; x < x1; x += programCount)
         if (x < x1)
         {
+          Ray ray;
+          generateRay(raster2camera, camera2world, x*widthScale,
+              y*heightScale, ray);
+          BVHIntersect(nodes, triangles, ray, todo);
+
+          int offset = y * width + x;
           image[offset] = ray.maxt;
           id[offset] = ray.hitId;
         }
-    }
+
+#ifdef ALLOC
+  delete todo;
+#endif
 }
 
 
@@ -333,19 +326,16 @@ void raytrace_tile_task( int width,  int height,
                               float image[],  int id[],
                              const  LinearBVHNode nodes[],
                              const  Triangle triangles[]) {
-
-  if (taskIndex >= taskCount) return;
      int dx = 64, dy = 8; // must match dx, dy below
      int xBuckets = (width + (dx-1)) / dx;
      int x0 = (taskIndex % xBuckets) * dx;
      int x1 = min(x0 + dx, width);
      int y0 = (taskIndex / xBuckets) * dy;
      int y1 = min(y0 + dy, height);
-          
+                             
     raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
                   raster2camera, camera2world, image,
                   id, nodes, triangles);
-    cudaDeviceSynchronize();
 }
 
 
@@ -361,8 +351,9 @@ extern "C" __global__ void raytrace_ispc_tasks( int width,  int height,
      int yBuckets = (height + (dy-1)) / dy;
      int nTasks = xBuckets * yBuckets;
      if (programIndex == 0)
-       raytrace_tile_task<<<(nTasks-1+4)/4,128>>>(width, height, baseWidth, baseHeight, 
+       raytrace_tile_task<<<(nTasks+4-1)/4,128>>>(width, height, baseWidth, baseHeight, 
            raster2camera, camera2world, 
            image, id, nodes, triangles);
      cudaDeviceSynchronize();
 }
+