diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index f3c1a314..65cf405a 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -484,25 +484,28 @@ svml_stubs(double,d,WIDTH) define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 - %v64 = zext i1 %v to i64 - ret i64 %v64 + %v64 = zext i1 %v to i64 + ret i64 %v64 } define i1 @__any(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 - %cmp = icmp ne i1 %v, 0 + %res = call i32 @__ballot(i1 %v) + %cmp = icmp ne i32 %res, 0 ret i1 %cmp } define i1 @__all(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 - %cmp = icmp eq i1 %v, 1 + %res = call i32 @__ballot(i1 %v) + %cmp = icmp eq i32 %res, -1 ret i1 %cmp } define i1 @__none(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 - %cmp = icmp eq i1 %v, 0 + %res = call i32 @__ballot(i1 %v) + %cmp = icmp eq i32 %res, 0 ret i1 %cmp } diff --git a/examples_cuda/deferred/common.cpp b/examples_cuda/deferred/common.cpp index fa4ee57b..309be180 100644 --- a/examples_cuda/deferred/common.cpp +++ b/examples_cuda/deferred/common.cpp @@ -131,6 +131,7 @@ CreateInputDataFromFile(const char *path) { fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path); return NULL; } + fprintf(stderr, " numLights= %d\n", input->header.numLights); // Load data chunk and update pointers input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize, diff --git a/examples_cuda/deferred/kernels.ispc b/examples_cuda/deferred/kernels.ispc index 82f8bdf7..2b90c014 100644 --- a/examples_cuda/deferred/kernels.ispc +++ b/examples_cuda/deferred/kernels.ispc @@ -200,36 +200,33 @@ IntersectLightsWithTileMinMax( // don't actually need to mask the rest of this function - this is // just a greedy early-out. 
Could also structure all of this as // nested if() statements, but this a bit easier to read - bool active = false; - if (any(inFrustum)) { - float light_positionView_x = light_positionView_x_array[lightIndex]; - float light_positionView_y = light_positionView_y_array[lightIndex]; + if (any(inFrustum)) { + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; - d = light_positionView_z * frustumPlanes_z[0] + - light_positionView_x * frustumPlanes_xy[0]; - inFrustum = inFrustum && (d >= light_attenuationEndNeg); + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); - d = light_positionView_z * frustumPlanes_z[1] + - light_positionView_x * frustumPlanes_xy[1]; - inFrustum = inFrustum && (d >= light_attenuationEndNeg); + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); - d = light_positionView_z * frustumPlanes_z[2] + - light_positionView_y * frustumPlanes_xy[2]; - inFrustum = inFrustum && (d >= light_attenuationEndNeg); + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); - d = light_positionView_z * frustumPlanes_z[3] + - light_positionView_y * frustumPlanes_xy[3]; - inFrustum = inFrustum && (d >= light_attenuationEndNeg); - - // Pack and store intersecting lights - if (inFrustum) - active = true; - } - if (lightIndex >= numLights) - active = false; + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); - tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex); - } + // Pack and store intersecting lights + const bool active = inFrustum && 
lightIndex < numLights; + + if (any(active)) + tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex); + } + } return tileNumLights; } diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc index 251da1f2..07afe33a 100644 --- a/examples_cuda/deferred/kernels1.ispc +++ b/examples_cuda/deferred/kernels1.ispc @@ -37,6 +37,7 @@ #define programIndex laneIndex() #define taskIndex blockIndex0() #define taskCount blockCount0() +#define cif if #else #warning "emitting HOST code" #endif @@ -212,8 +213,8 @@ IntersectLightsWithTileMinMax( // don't actually need to mask the rest of this function - this is // just a greedy early-out. Could also structure all of this as // nested if() statements, but this a bit easier to read - bool active = false; - if (any(inFrustum)) { + if (any(inFrustum)) + { float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_y = light_positionView_y_array[lightIndex]; @@ -234,13 +235,11 @@ IntersectLightsWithTileMinMax( inFrustum = inFrustum && (d >= light_attenuationEndNeg); // Pack and store intersecting lights - if (inFrustum) - active = true; - } - if (lightIndex >= numLights) - active = false; + const bool active = inFrustum && lightIndex < numLights; - tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex); + if(any(active)) + tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex); + } } return tileNumLights; @@ -402,7 +401,7 @@ ShadeTile( // Clip at end of attenuation float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; - if (distanceToLight2 < light_attenutaionEnd2) { + cif (distanceToLight2 < light_attenutaionEnd2) { float distanceToLight = sqrt(distanceToLight2); // HLSL "rcp" is allowed to be fairly inaccurate @@ -416,7 +415,7 @@ ShadeTile( surface_normal_z, L_x, L_y, L_z); // Clip back facing - if (NdotL > 0.0f) { + cif (NdotL > 0.0f) { 
uniform float light_attenuationBegin = inputData.lightAttenuationBegin[lightIndex]; diff --git a/examples_cuda/deferred/main_cu.cpp b/examples_cuda/deferred/main_cu.cpp index df96b209..e16745ec 100755 --- a/examples_cuda/deferred/main_cu.cpp +++ b/examples_cuda/deferred/main_cu.cpp @@ -186,7 +186,7 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); } #define deviceLaunch(func,nbx,nby,nbz,params) \ - checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_EQUAL)); \ + checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \ checkCudaErrors( \ cuLaunchKernel( \ (func), \ @@ -251,7 +251,7 @@ extern "C" assert(module_1 != NULL); assert(func_name != NULL); assert(func_args != NULL); -#if 1 +#if 0 const char * module = module_1; #else const std::vector module_str = readBinary("kernel.cubin"); @@ -388,7 +388,7 @@ int main(int argc, char** argv) { memcpyD2H(framebuffer.g, d_g, buffsize); memcpyD2H(framebuffer.b, d_b, buffsize); - printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " + printf("[ispc cuda]:\t\t[%.3f] million cycles to render " "%d x %d image\n", ispcCycles, input->header.framebufferWidth, input->header.framebufferHeight); WriteFrame("deferred-cuda.ppm", input, framebuffer);