added any/none/all for bool
This commit is contained in:
@@ -490,19 +490,22 @@ define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline {
|
|||||||
|
|
||||||
; @__any, shown as a side-by-side diff: in each pair the first line is the old
; version and the second line is the new one.
; Old: extracts the single i1 of the <1 x i1> argument and returns whether it is non-zero.
; New: passes that i1 to @__ballot and returns true when the i32 result is non-zero.
; NOTE(review): @__ballot is not defined in this view; presumably it is a
; warp-wide vote returning one bit per lane - confirm.
define i1 @__any(<1 x i1>) nounwind readnone alwaysinline {
|
define i1 @__any(<1 x i1>) nounwind readnone alwaysinline {
|
||||||
%v = extractelement <1 x i1> %0, i32 0
|
%v = extractelement <1 x i1> %0, i32 0
|
||||||
%cmp = icmp ne i1 %v, 0
|
%res = call i32 @__ballot(i1 %v)
|
||||||
|
%cmp = icmp ne i32 %res, 0
|
||||||
ret i1 %cmp
|
ret i1 %cmp
|
||||||
}
|
}
|
||||||
|
|
||||||
; @__all, shown as a side-by-side diff: in each pair the first line is the old
; version and the second line is the new one.
; Old: extracts the single i1 of the <1 x i1> argument and returns whether it equals 1.
; New: passes that i1 to @__ballot and returns true when the i32 result equals 31.
; NOTE(review): the constant 31 looks suspicious. If @__ballot returns a bitmask
; with one bit per lane of a 32-lane warp, "all lanes true" would be -1
; (0xFFFFFFFF), not 31 (0x1F, i.e. only lanes 0-4 set). @__ballot is not
; visible here - confirm its return convention before relying on this compare.
define i1 @__all(<1 x i1>) nounwind readnone alwaysinline {
|
define i1 @__all(<1 x i1>) nounwind readnone alwaysinline {
|
||||||
%v = extractelement <1 x i1> %0, i32 0
|
%v = extractelement <1 x i1> %0, i32 0
|
||||||
%cmp = icmp eq i1 %v, 1
|
%res = call i32 @__ballot(i1 %v)
|
||||||
|
%cmp = icmp eq i32 %res, 31
|
||||||
ret i1 %cmp
|
ret i1 %cmp
|
||||||
}
|
}
|
||||||
|
|
||||||
; @__none, shown as a side-by-side diff: in each pair the first line is the old
; version and the second line is the new one.
; Old: extracts the single i1 of the <1 x i1> argument and returns whether it equals 0.
; New: passes that i1 to @__ballot and returns true when the i32 result equals 0.
; NOTE(review): @__ballot is not defined in this view; presumably it is a
; warp-wide vote returning one bit per lane - confirm.
define i1 @__none(<1 x i1>) nounwind readnone alwaysinline {
|
define i1 @__none(<1 x i1>) nounwind readnone alwaysinline {
|
||||||
%v = extractelement <1 x i1> %0, i32 0
|
%v = extractelement <1 x i1> %0, i32 0
|
||||||
%cmp = icmp eq i1 %v, 0
|
%res = call i32 @__ballot(i1 %v)
|
||||||
|
%cmp = icmp eq i32 %res, 0
|
||||||
ret i1 %cmp
|
ret i1 %cmp
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -131,6 +131,7 @@ CreateInputDataFromFile(const char *path) {
|
|||||||
fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
|
fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
fprintf(stderr, " numLights= %d\n", input->header.numLights);
|
||||||
|
|
||||||
// Load data chunk and update pointers
|
// Load data chunk and update pointers
|
||||||
input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
|
input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
|
||||||
|
|||||||
@@ -200,7 +200,6 @@ IntersectLightsWithTileMinMax(
|
|||||||
// don't actually need to mask the rest of this function - this is
|
// don't actually need to mask the rest of this function - this is
|
||||||
// just a greedy early-out. Could also structure all of this as
|
// just a greedy early-out. Could also structure all of this as
|
||||||
// nested if() statements, but this is a bit easier to read
|
// nested if() statements, but this is a bit easier to read
|
||||||
bool active = false;
|
|
||||||
if (any(inFrustum)) {
|
if (any(inFrustum)) {
|
||||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||||
@@ -222,14 +221,12 @@ IntersectLightsWithTileMinMax(
|
|||||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
// Pack and store intersecting lights
|
// Pack and store intersecting lights
|
||||||
if (inFrustum)
|
const bool active = inFrustum && lightIndex < numLights;
|
||||||
active = true;
|
|
||||||
}
|
|
||||||
if (lightIndex >= numLights)
|
|
||||||
active = false;
|
|
||||||
|
|
||||||
|
if (any(active))
|
||||||
tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
|
tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return tileNumLights;
|
return tileNumLights;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -37,6 +37,7 @@
|
|||||||
#define programIndex laneIndex()
|
#define programIndex laneIndex()
|
||||||
#define taskIndex blockIndex0()
|
#define taskIndex blockIndex0()
|
||||||
#define taskCount blockCount0()
|
#define taskCount blockCount0()
|
||||||
|
#define cif if
|
||||||
#else
|
#else
|
||||||
#warning "emitting HOST code"
|
#warning "emitting HOST code"
|
||||||
#endif
|
#endif
|
||||||
@@ -212,8 +213,8 @@ IntersectLightsWithTileMinMax(
|
|||||||
// don't actually need to mask the rest of this function - this is
|
// don't actually need to mask the rest of this function - this is
|
||||||
// just a greedy early-out. Could also structure all of this as
|
// just a greedy early-out. Could also structure all of this as
|
||||||
// nested if() statements, but this is a bit easier to read
|
// nested if() statements, but this is a bit easier to read
|
||||||
bool active = false;
|
if (any(inFrustum))
|
||||||
if (any(inFrustum)) {
|
{
|
||||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||||
|
|
||||||
@@ -234,14 +235,12 @@ IntersectLightsWithTileMinMax(
|
|||||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
// Pack and store intersecting lights
|
// Pack and store intersecting lights
|
||||||
if (inFrustum)
|
const bool active = inFrustum && lightIndex < numLights;
|
||||||
active = true;
|
|
||||||
}
|
|
||||||
if (lightIndex >= numLights)
|
|
||||||
active = false;
|
|
||||||
|
|
||||||
|
if(any(active))
|
||||||
tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
|
tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return tileNumLights;
|
return tileNumLights;
|
||||||
}
|
}
|
||||||
@@ -402,7 +401,7 @@ ShadeTile(
|
|||||||
// Clip at end of attenuation
|
// Clip at end of attenuation
|
||||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||||
|
|
||||||
if (distanceToLight2 < light_attenutaionEnd2) {
|
cif (distanceToLight2 < light_attenutaionEnd2) {
|
||||||
float distanceToLight = sqrt(distanceToLight2);
|
float distanceToLight = sqrt(distanceToLight2);
|
||||||
|
|
||||||
// HLSL "rcp" is allowed to be fairly inaccurate
|
// HLSL "rcp" is allowed to be fairly inaccurate
|
||||||
@@ -416,7 +415,7 @@ ShadeTile(
|
|||||||
surface_normal_z, L_x, L_y, L_z);
|
surface_normal_z, L_x, L_y, L_z);
|
||||||
|
|
||||||
// Clip back facing
|
// Clip back facing
|
||||||
if (NdotL > 0.0f) {
|
cif (NdotL > 0.0f) {
|
||||||
uniform float light_attenuationBegin =
|
uniform float light_attenuationBegin =
|
||||||
inputData.lightAttenuationBegin[lightIndex];
|
inputData.lightAttenuationBegin[lightIndex];
|
||||||
|
|
||||||
|
|||||||
@@ -186,7 +186,7 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
|
|||||||
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
|
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
|
||||||
}
|
}
|
||||||
#define deviceLaunch(func,nbx,nby,nbz,params) \
|
#define deviceLaunch(func,nbx,nby,nbz,params) \
|
||||||
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_EQUAL)); \
|
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
|
||||||
checkCudaErrors( \
|
checkCudaErrors( \
|
||||||
cuLaunchKernel( \
|
cuLaunchKernel( \
|
||||||
(func), \
|
(func), \
|
||||||
@@ -251,7 +251,7 @@ extern "C"
|
|||||||
assert(module_1 != NULL);
|
assert(module_1 != NULL);
|
||||||
assert(func_name != NULL);
|
assert(func_name != NULL);
|
||||||
assert(func_args != NULL);
|
assert(func_args != NULL);
|
||||||
#if 1
|
#if 0
|
||||||
const char * module = module_1;
|
const char * module = module_1;
|
||||||
#else
|
#else
|
||||||
const std::vector<char> module_str = readBinary("kernel.cubin");
|
const std::vector<char> module_str = readBinary("kernel.cubin");
|
||||||
@@ -388,7 +388,7 @@ int main(int argc, char** argv) {
|
|||||||
memcpyD2H(framebuffer.g, d_g, buffsize);
|
memcpyD2H(framebuffer.g, d_g, buffsize);
|
||||||
memcpyD2H(framebuffer.b, d_b, buffsize);
|
memcpyD2H(framebuffer.b, d_b, buffsize);
|
||||||
|
|
||||||
printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
|
printf("[ispc cuda]:\t\t[%.3f] million cycles to render "
|
||||||
"%d x %d image\n", ispcCycles,
|
"%d x %d image\n", ispcCycles,
|
||||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||||
WriteFrame("deferred-cuda.ppm", input, framebuffer);
|
WriteFrame("deferred-cuda.ppm", input, framebuffer);
|
||||||
|
|||||||
Reference in New Issue
Block a user