Merge branch 'sm35_foreach' of github.com:egaburov/ispc into sm35_foreach

2013-11-18 21:59:09 +01:00
parent e4ee1692fb 4bc8c79bd3
commit e8fc32a7dc
5 changed files with 190 additions and 136 deletions
--- a/examples_cuda/deferred/kernels.cu
+++ b/examples_cuda/deferred/kernels.cu
@@ -133,45 +133,7 @@ struct Uniform
    data[chunkIdx] = shdata[programIndex];
  }
 };
-#elif 0
-static __shared__ void* shptr_full[128];
-template<typename T, int N>
-struct Uniform
-{
-  T data[(N+programCount-1)/programCount];
-  T* *shptr;
-
-  __device__ inline Uniform()
-  {
-    shptr = (T**)shptr_full;
-    shptr[threadIdx.x] = data;
-    __syncthreads();
-  }
-
-  __device__ inline int2 get_chunk(const int i) const
-  {
-    const int elem  = i & (programCount - 1);
-    const int chunk = i >> 5;
-    return make_int2(chunk, elem);
-  }
-
-  __device__ inline const T get(const int i) const
-  {
-    const int2 idx = get_chunk(i);
-    const int chunk = idx.x;
-    const int elem  = idx.y;
-    return shptr[chunk][elem];
-  }
-  
-  __device__ inline void set(const bool active, const int i, T value) 
-  {
-    const int2 idx = get_chunk(i);
-    const int chunk = idx.x;
-    const int elem  = idx.y;
-    shptr[chunk][elem] = value;
-  }
-};
-#elif 0
+#elif 1
 template<typename T, int N>
 struct Uniform
 {
@@ -181,32 +143,17 @@ struct Uniform
    int32_t ptr[2];
  };

-
  __device__ inline Uniform()
  {
-#if 1
    if (programIndex == 0)
-      data = new T[N];
+      data = (T*)malloc(N*sizeof(T));
    ptr[0] = __shfl(ptr[0], 0);
    ptr[1] = __shfl(ptr[1], 0);
-#else
-    __shared__ T *ptr;
-    if (threadIdx.x == 0)
-      ptr = new T[4*N];
-    __syncthreads();
-    data = ptr;
-    data += warpIdx*N;
-#endif
  }
  __device__ inline ~Uniform()
  {
-#if 1
    if (programIndex == 0)
-      delete data;
-#else
-    if (threadIdx.x == 0)
-      delete data;
-#endif
+      free(data);
  }

  __device__ inline const T get(const int i) const
@@ -717,7 +664,6 @@ ShadeTile(
                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
                
-              if (x >= tileEndX) break;
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -730,21 +676,19 @@ ShadeTile(
 ///////////////////////////////////////////////////////////////////////////
 // Static decomposition

-
 __global__ void
 RenderTile( int num_groups_x,  int num_groups_y,
-           const  InputHeader inputHeaderPtr[],
-           const  InputDataArrays inputDataPtr[],
+           const  InputHeader *inputHeaderPtr,
+           const  InputDataArrays *inputDataPtr,
            int visualizeLightCount,
           // Output
            unsigned int8 framebuffer_r[],
            unsigned int8 framebuffer_g[],
            unsigned int8 framebuffer_b[]) {
  if (taskIndex >= taskCount) return;
-  const InputHeader &inputHeader = *inputHeaderPtr;
-  const InputDataArrays &inputData = *inputDataPtr;

-#if 1
+  const  InputHeader inputHeader = *inputHeaderPtr;
+  const  InputDataArrays inputData = *inputDataPtr;
     int32 group_y = taskIndex / num_groups_x;
     int32 group_x = taskIndex % num_groups_x;

@@ -759,16 +703,9 @@ RenderTile( int num_groups_x,  int num_groups_y,
     float cameraProj_11 = inputHeader.cameraProj[1][1];
     float cameraProj_22 = inputHeader.cameraProj[2][2];
     float cameraProj_32 = inputHeader.cameraProj[3][2];
-#endif

    // Light intersection: figure out which lights illuminate this tile.
     Uniform<int,MAX_LIGHTS> tileLightIndices;  // Light list for the tile
-
-#if 0
-     tileLightIndices.set(threadIdx.x&1, threadIdx.x, framebuffer_g[blockIdx.x]);
-     framebuffer_r[threadIdx.x] = tileLightIndices.get(threadIdx.x);
-#endif
-
 #if 1
     int numTileLights = 
        IntersectLightsWithTile(tile_start_x, tile_end_x, 
@@ -795,31 +732,30 @@ RenderTile( int num_groups_x,  int num_groups_y,
 }


-  extern "C"
-__global__ void
-RenderStatic(InputHeader inputHeaderPtr[],
-             InputDataArrays inputDataPtr[],
-             int visualizeLightCount,
+extern "C" __global__ void
+RenderStatic( InputHeader inputHeaderPtr[],
+              InputDataArrays inputDataPtr[],
+              int visualizeLightCount,
             // Output
-             unsigned int8 framebuffer_r[],
-             unsigned int8 framebuffer_g[],
-             unsigned int8 framebuffer_b[]) {
+              unsigned int8 framebuffer_r[],
+              unsigned int8 framebuffer_g[],
+              unsigned int8 framebuffer_b[]) {

-  const InputHeader inputHeader = *inputHeaderPtr;
-  const InputDataArrays inputData = *inputDataPtr;
+  const  InputHeader inputHeader = *inputHeaderPtr;
+  const  InputDataArrays inputData = *inputDataPtr;

-    int num_groups_x = (inputHeader.framebufferWidth + 
+
+     int num_groups_x = (inputHeader.framebufferWidth + 
                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
-    int num_groups_y = (inputHeader.framebufferHeight + 
+     int num_groups_y = (inputHeader.framebufferHeight + 
                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
-    int num_groups = num_groups_x * num_groups_y;
+     int num_groups = num_groups_x * num_groups_y;

    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
    // by MIN_TILE_HEIGHT pixels.
-    if (programIndex == 0)
-      RenderTile<<<num_groups, 128>>>(num_groups_x, num_groups_y,
-          inputHeaderPtr, inputDataPtr, visualizeLightCount,
-          framebuffer_r, framebuffer_g, framebuffer_b);
-    cudaDeviceSynchronize();
-    cudaDeviceSynchronize();
+     if (programIndex == 0)
+       RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y,
+           inputHeaderPtr, inputDataPtr, visualizeLightCount,
+           framebuffer_r, framebuffer_g, framebuffer_b);
+     cudaDeviceSynchronize();
 }
--- a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu
+++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu
@@ -0,0 +1,104 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#define programCount 32
+#define programIndex (threadIdx.x & 31)
+#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
+#define taskCount0 (gridDim.x*4)
+#define taskIndex1 (blockIdx.y)
+#define taskCount1 (gridDim.y)
+#define warpIdx (threadIdx.x >> 5)
+
+
+__device__
+static inline int
+mandel(float c_re, float c_im, int count) {
+    float z_re = c_re, z_im = c_im;
+    int i;
+    for (i = 0; i < count; ++i) {
+        if (z_re * z_re + z_im * z_im > 4.0f)
+            break;
+
+        float new_re = z_re*z_re - z_im*z_im;
+        float new_im = 2.f * z_re * z_im;
+        z_re = c_re + new_re;
+        z_im = c_im + new_im;
+    }
+
+    return i;
+}
+
+
+/* Task to compute the Mandelbrot iterations for a single scanline.
+ */
+__global__ void
+mandelbrot_scanline( float x0,  float dx, 
+                     float y0,  float dy,
+                     int width,  int height, 
+                     int xspan,  int yspan,
+                     int maxIterations,  int output[]) {
+    const  int xstart = taskIndex0 * xspan;
+    const  int xend   = min(xstart  + xspan, width);
+
+    const  int ystart = taskIndex1 * yspan;
+    const  int yend   = min(ystart  + yspan, height);
+   
+  for ( int yi = ystart; yi < yend; yi++)
+    for ( int xi = xstart; xi < xend; xi += programCount)
+    {
+      const float x = x0 + (xi + programIndex) * dx;
+      const float y = y0 +  yi              * dy;
+
+      const int res = mandel(x,y,maxIterations);
+      const int index = yi * width + (xi + programIndex);
+      if (xi + programIndex < xend)
+        output[index] = res;
+    }
+}
+
+extern "C" __global__ void
+mandelbrot_ispc( float x0,  float y0, 
+                 float x1,  float y1,
+                 int width,  int height, 
+                 int maxIterations,  int output[]) {
+     float dx = (x1 - x0) / width;
+     float dy = (y1 - y0) / height;
+    const  int xspan = 64;  /* make sure it is big enough to avoid false-sharing */
+    const  int yspan = 8; 
+
+
+    if (programIndex == 0)
+      mandelbrot_scanline<<<dim3((width+4-1)/xspan/4,height/yspan),128>>>
+        (x0, dx, y0, dy, width, height, xspan, yspan,  maxIterations, output);
+    cudaDeviceSynchronize();
+}
--- a/examples_cuda/rt/rt.cu
+++ b/examples_cuda/rt/rt.cu
@@ -2,6 +2,7 @@
 #define programIndex (threadIdx.x & 31)
 #define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
 #define taskCount (gridDim.x*4)
+#define warpIdx (threadIdx.x >> 5)

 #define float3 Float3
 struct Float3
@@ -57,17 +58,12 @@ struct Float3
  }
 };

-#if 0
-#define DIRISNEG
-#endif
+#define int8 char
+#define int16 short

 struct Ray {
    float3 origin, dir, invDir;
-#ifdef DIRISNEG /* this fails to compile with nvvm */
-    unsigned int dirIsNeg[3];
-#else
    unsigned int dirIsNeg0, dirIsNeg1, dirIsNeg2;
-#endif
    float mint, maxt;
    int hitId;
 };
@@ -78,8 +74,6 @@ struct Triangle {
    int pad[3];
 };

-#define int8 char
-#define int16 short
 struct LinearBVHNode {
    float bounds[2][3];
    unsigned int offset;     // num primitives for leaf, second child for interior
@@ -105,7 +99,8 @@ static inline float Dot(const float3 a, const float3 b) {
 }

 __device__
-static inline void generateRay( const float raster2camera[4][4], 
+inline
+static void generateRay( const float raster2camera[4][4], 
                         const float camera2world[4][4],
                        float x, float y, Ray &ray) {
    ray.mint = 0.f;
@@ -135,7 +130,7 @@ static inline void generateRay( const float raster2camera[4][4],

    ray.invDir = 1.f / ray.dir;

-#ifdef DIRISNEG
+#if 0
    ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
    ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
    ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
@@ -146,9 +141,9 @@ static inline void generateRay( const float raster2camera[4][4],
 #endif
 }

-
 __device__
-static inline bool BBoxIntersect(const  float bounds[2][3], 
+inline
+static bool BBoxIntersect(const  float bounds[2][3], 
                          const Ray &ray) {
     float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
     float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -186,9 +181,9 @@ static inline bool BBoxIntersect(const  float bounds[2][3],
 }


-
 __device__
-static inline bool TriIntersect(const  Triangle &tri, Ray &ray) {
+inline
+static bool TriIntersect(const  Triangle &tri, Ray &ray) {
     float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
     float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
     float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
@@ -227,15 +222,15 @@ static inline bool TriIntersect(const  Triangle &tri, Ray &ray) {
    return hit;
 }

-
 __device__
-static inline bool BVHIntersect(const  LinearBVHNode nodes[], 
-                  const  Triangle tris[], Ray &r) {
+inline
+bool BVHIntersect(const  LinearBVHNode nodes[], 
+                  const  Triangle tris[], Ray &r,
+                   int todo[]) {
    Ray ray = r;
    bool hit = false;
    // Follow ray through BVH nodes to find primitive intersections
     int todoOffset = 0, nodeNum = 0;
-     int todo[64];

    while (true) {
        // Check ray against BVH node
@@ -249,20 +244,17 @@ static inline bool BVHIntersect(const  LinearBVHNode nodes[],
                    if (TriIntersect(tris[primitivesOffset+i], ray))
                        hit = true;
                }
+                if (todoOffset == 0) 
+                    break;
                nodeNum = todo[--todoOffset];
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
-#ifdef DIRISNEG
-                const int dirIsNeg = r.dirIsNeg[node.splitAxis];
-#else
                int dirIsNeg;
                if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0;
                if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1;
                if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2;
-#endif
-                if (dirIsNeg) 
-                {
+                if (dirIsNeg) {
                   todo[todoOffset++] = nodeNum + 1;
                   nodeNum = node.offset;
                }
@@ -273,10 +265,10 @@ static inline bool BVHIntersect(const  LinearBVHNode nodes[],
            }
        }
        else {
-            nodeNum = todo[--todoOffset];
-        }
            if (todoOffset == 0)
                break;
+            nodeNum = todo[--todoOffset];
+        }
    }
    r.maxt = ray.maxt;
    r.hitId = ray.hitId;
@@ -284,9 +276,9 @@ static inline bool BVHIntersect(const  LinearBVHNode nodes[],
    return hit;
 }

-
 __device__
-static inline void raytrace_tile( int x0,  int x1,
+inline
+static void raytrace_tile( int x0,  int x1,
                           int y0,  int y1, 
                           int width,  int height,
                           int baseWidth,  int baseHeight,
@@ -298,28 +290,34 @@ static inline void raytrace_tile( int x0,  int x1,
     float widthScale = (float)(baseWidth) / (float)(width);
     float heightScale = (float)(baseHeight) / (float)(height);

-//    foreach_tiled (y = y0 ... y1, x = x0 ... x1) 
-    for ( int y = y0; y < y1; y++)
-      for ( int xb = x0; xb < x1; xb += programCount)
-    {
-      const int x = xb + programIndex;
-        Ray ray;
-        generateRay(raster2camera, camera2world, x*widthScale,
-                    y*heightScale, ray);
-        BVHIntersect(nodes, triangles, ray);
+#if 0
+   int *  todo =  new  int[64];
+#define ALLOC
+#else
+   int todo[64];
+#endif

-        int offset = y * width + x;
+    for (int y = y0 ;y < y1; y++)
+      for (int x = x0 + programIndex; x < x1; x += programCount)
        if (x < x1)
        {
+          Ray ray;
+          generateRay(raster2camera, camera2world, x*widthScale,
+              y*heightScale, ray);
+          BVHIntersect(nodes, triangles, ray, todo);
+
+          int offset = y * width + x;
          image[offset] = ray.maxt;
          id[offset] = ray.hitId;
        }
-    }
+
+#ifdef ALLOC
+  delete todo;
+#endif
 }



-extern "C"
 __global__
 void raytrace_tile_task( int width,  int height,
                              int baseWidth,  int baseHeight,
@@ -328,18 +326,34 @@ void raytrace_tile_task( int width,  int height,
                              float image[],  int id[],
                             const  LinearBVHNode nodes[],
                             const  Triangle triangles[]) {
-
-  if (taskIndex >= taskCount) return;
-     int dx = 32, dy = 16; // must match dx, dy below
+     int dx = 64, dy = 8; // must match dx, dy below
     int xBuckets = (width + (dx-1)) / dx;
     int x0 = (taskIndex % xBuckets) * dx;
     int x1 = min(x0 + dx, width);
     int y0 = (taskIndex / xBuckets) * dy;
     int y1 = min(y0 + dy, height);
-          
+                             
    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
 }


+extern "C" __global__ void raytrace_ispc_tasks( int width,  int height,
+                                 int baseWidth,  int baseHeight,
+                                const  float raster2camera[4][4], 
+                                const  float camera2world[4][4],
+                                 float image[],  int id[],
+                                const  LinearBVHNode nodes[],
+                                const  Triangle triangles[]) {
+     int dx = 64, dy = 8;
+     int xBuckets = (width + (dx-1)) / dx;
+     int yBuckets = (height + (dy-1)) / dy;
+     int nTasks = xBuckets * yBuckets;
+     if (programIndex == 0)
+       raytrace_tile_task<<<(nTasks+4-1)/4,128>>>(width, height, baseWidth, baseHeight, 
+           raster2camera, camera2world, 
+           image, id, nodes, triangles);
+     cudaDeviceSynchronize();
+}
+
--- a/examples_cuda/sort/sort1.cu
+++ b/examples_cuda/sort/sort1.cu
@@ -247,7 +247,7 @@ void sort_ispc ( int n,  unsigned int code[],  int order[],  int ntasks,
 {
   int num = ntasks;
   int span = n / num;
-#if 0
+#if 1
   int hsize = 256*programCount*num;
   int *  hist =  __new< int>(hsize);
   int64 *  pair =  __new< int64>(n);
@@ -293,7 +293,7 @@ void sort_ispc ( int n,  unsigned int code[],  int order[],  int ntasks,
    unpack<<<nbx(num),128>>> (span, n, pair, code, order);
  sync;

-#if ALLOCATED
+#ifdef ALLOCATED
  __delete(g);
  __delete(hist);
  __delete(pair);
--- a/examples_cuda/stencil/.stencil.ispc.swn
+++ b/examples_cuda/stencil/.stencil.ispc.swn