diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index b0374b46..f3c1a314 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -65,6 +65,36 @@ define i32 @__shfl_i32(i32, i32) nounwind readnone alwaysinline %shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline ret i32 %shfl } +define float @__shfl_xor_float(float, i32) nounwind readnone alwaysinline +{ + %shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) nounwind readnone alwaysinline + ret float %shfl +} +define float @__fminf(float,float) nounwind readnone alwaysinline +{ + %min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline + ret float %min +} +define float @__fmaxf(float,float) nounwind readnone alwaysinline +{ + %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline + ret float %max +} +define i32 @__ballot(i1) nounwind readnone alwaysinline +{ + %conv = zext i1 %0 to i32 + %res = tail call i32 asm sideeffect + "{ .reg .pred %p1; + setp.ne.u32 %p1, $1, 0; + vote.ballot.b32 $0, %p1; + }", "=r,r"(i32 %conv) nounwind readnone alwaysinline + ret i32 %res +} +define i32 @__lanemask_lt() nounwind readnone alwaysinline +{ + %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() nounwind readnone alwaysinline + ret i32 %mask +} ;;;;;;;;;;;;;; @@ -161,10 +191,38 @@ define void ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare float @__half_to_float_uniform(i16 %v) nounwind readnone -declare @__half_to_float_varying( %v) nounwind readnone -declare i16 @__float_to_half_uniform(float %v) nounwind readnone -declare @__float_to_half_varying( %v) nounwind readnone +declare float @llvm.convert.from.fp16(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16(float) nounwind readnone +define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline +{ + ;; %res = call float @llvm.convert.from.fp16(i16 %v) + %res = tail call float asm sideeffect + "{ .reg .b16 %tmp; + mov.b16 %tmp, $1; + cvt.f32.f16 $0, %tmp; + }", "=f,h"(i16 %v) nounwind readnone alwaysinline + ret float %res +} +define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline +{ + ;; this will break the compiler, use inline asm similarly to above case + %half = call i16 @llvm.convert.to.fp16(float %v) + ret i16 %half +} +define @__half_to_float_varying( %v) nounwind readnone alwaysinline +{ + %el = extractelement <1 x i16> %v, i32 0 + %sf = call float @__half_to_float_uniform(i16 %el) + %vf = insertelement <1 x float> undef, float %sf, i32 0 + ret <1 x float> %vf; +} +define @__float_to_half_varying( %v) nounwind readnone alwaysinline +{ + %el = extractelement <1 x float> %v, i32 0 + %sh = call i16 @__float_to_half_uniform(float %el) + %vh = insertelement <1 x i16> undef, i16 %sh, i32 0 + ret <1 x i16> %vh; +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math @@ -376,8 +434,10 @@ declare @__sqrt_varying_double() nounwind readn declare i32 @llvm.ctpop.i32(i32) nounwind readnone define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { - %call = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %call +;; %call = call i32 @llvm.ctpop.i32(i32 %0) +;; ret i32 %call + %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0) nounwind readnone alwaysinline + ret i32 %res } declare i64 @llvm.ctpop.i64(i64) nounwind readnone @@ -386,6 +446,21 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i64 @__warpBinExclusiveScan(i1 %p) nounwind readonly alwaysinline +{ +entry: + %call = call i32 @__ballot(i1 zeroext %p) + %call1 = call i32 @__popcnt_int32(i32 %call) + %call2 = call i32 @__lanemask_lt() + %and = and i32 %call2, %call + %call3 = call i32 @__popcnt_int32(i32 %and) + %retval.sroa.1.4.insert.ext.i = zext i32 %call3 to i64 + %retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %call1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i + ret i64 %retval.sroa.0.0.insert.insert.i +} + ctlztz() ; FIXME: need either to wire these up to the 8-wide SVML entrypoints, @@ -440,13 +515,34 @@ define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline } define float @__reduce_min_float(<1 x float>) nounwind readnone { - %r = extractelement <1 x float> %0, i32 0 - ret float %r + %value = extractelement <1 x float> %0, i32 0 + %call = tail call float @__shfl_xor_float(float %value, i32 16) + %call1 = tail call float @__fminf(float %value, float %call) #4 + %call.1 = tail call float @__shfl_xor_float(float %call1, i32 8) + %call1.1 = tail call float @__fminf(float %call1, float %call.1) #4 + %call.2 = tail call float @__shfl_xor_float(float %call1.1, i32 4) + %call1.2 = tail call float @__fminf(float %call1.1, float %call.2) #4 + %call.3 = tail call float @__shfl_xor_float(float %call1.2, i32 2) + %call1.3 = tail call float @__fminf(float %call1.2, float %call.3) #4 + %call.4 = tail call float @__shfl_xor_float(float %call1.3, i32 1) + %call1.4 = tail call float @__fminf(float %call1.3, float %call.4) #4 + ret float %call1.4 } -define float @__reduce_max_float(<1 x float>) nounwind readnone { - %r = extractelement <1 x float> %0, i32 0 - ret float %r +define float @__reduce_max_float(<1 x float>) nounwind readnone +{ + %value = extractelement <1 x float> %0, i32 0 + %call = tail call float @__shfl_xor_float(float %value, i32 16) + %call1 = tail call float @__fmaxf(float %value, float %call) + %call.1 = tail call float @__shfl_xor_float(float %call1, i32 8) + %call1.1 = tail call float @__fmaxf(float %call1, float %call.1) + %call.2 = tail call float @__shfl_xor_float(float %call1.1, i32 4) + %call1.2 = tail call float @__fmaxf(float %call1.1, float %call.2) + %call.3 = tail call float @__shfl_xor_float(float %call1.2, i32 2) + %call1.3 = tail call float @__fmaxf(float %call1.2, float %call.3) + %call.4 = tail call float @__shfl_xor_float(float %call1.3, i32 1) + %call1.4 = tail call float @__fmaxf(float %call1.3, float %call.4) + ret float %call1.4 } define i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone { diff --git a/builtins/util_ptx.m4 b/builtins/util_ptx.m4 index c89ffb93..399d87d2 100644 --- a/builtins/util_ptx.m4 +++ b/builtins/util_ptx.m4 @@ -3768,52 +3768,23 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline +{ entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) - br i1 %mask_known, label %known_mask, label %unknown_mask + %active = extractelement <1 x i1> %full_mask, i32 0 + %call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active) + %res.sroa.0.0.extract.trunc = trunc i64 %call to i32 + br i1 %active, label %if.then, label %if.end -known_mask: - %allon = icmp eq i64 %mask, ALL_ON_MASK - br i1 %allon, label %all_on, label %unknown_mask +if.then: ; preds = %entry + %idxprom = ashr i64 %call, 32 + %arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom + %val = extractelement <1 x i32> %vals, i32 0 + store i32 %val, i32* %arrayidx, align 4 + br label %if.end -all_on: - %vecptr = bitcast i32 *%startptr to * - store %vals, * %vecptr, align 4 - ret i32 WIDTH - -unknown_mask: - br label %loop - -loop: - %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ] - %lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ] - %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ] - - ; is the current lane on? - %and = and i64 %mask, %lanemask - %do_store = icmp eq i64 %and, %lanemask - br i1 %do_store, label %store, label %loopend - -store: - %storeval = extractelement %vals, i32 %lane - %storeptr = getelementptr i32 *%startptr, i32 %offset - store i32 %storeval, i32 *%storeptr - %offset1 = add i32 %offset, 1 - br label %loopend - -loopend: - %nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ] - %nextlane = add i32 %lane, 1 - %nextlanemask = mul i64 %lanemask, 2 - - ; are we done yet? - %test = icmp ne i32 %nextlane, WIDTH - br i1 %test, label %loop, label %done - -done: - ret i32 %nextoffset +if.end: ; preds = %if.then, %entry + ret i32 %res.sroa.0.0.extract.trunc } ') diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu index fbb6eddc..2af43a28 100644 --- a/examples_cuda/deferred/kernels.cu +++ b/examples_cuda/deferred/kernels.cu @@ -209,7 +209,7 @@ static float reduce_min(float value) { #pragma unroll for (int i = 4; i >=0; i--) - value = min(value, __shfl_xor(value, 1<=0; i--) - value = max(value, __shfl_xor(value, 1<= numLights) break; float light_positionView_z = light_positionView_z_array[lightIndex]; float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; @@ -431,8 +432,6 @@ IntersectLightsWithTileMinMax( } #endif } - if (lightIndex >= numLights) - active = 0; #if 0 const int2 res = warpBinExclusiveScan(active); @@ -561,8 +560,10 @@ ShadeTile( // Reconstruct normal from G-buffer float surface_normal_x, surface_normal_y, surface_normal_z; + asm("// half2float //"); float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]); float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]); + asm("// half2float //"); float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); float m = sqrt(4.0f * f - 1.0f); diff --git a/examples_cuda/deferred/kernels.ispc b/examples_cuda/deferred/kernels.ispc index 6d2a8cc9..82f8bdf7 100644 --- a/examples_cuda/deferred/kernels.ispc +++ b/examples_cuda/deferred/kernels.ispc @@ -116,7 +116,11 @@ ComputeZBounds( float laneMinZ = cameraFar; float laneMaxZ = cameraNear; for (uniform int32 y = tileStartY; y < tileEndY; ++y) { - foreach (x = tileStartX ... tileEndX) { +// foreach (x = tileStartX ... tileEndX) + for (uniform int xb = tileStartX; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; + if (x >= tileEndX) break; // Unproject depth buffer Z value into view space float z = zBuffer[y * gBufferWidth + x]; float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); @@ -178,7 +182,10 @@ IntersectLightsWithTileMinMax( uniform int32 tileNumLights = 0; - foreach (lightIndex = 0 ... numLights) { +// foreach (lightIndex = 0 ... numLights) + for (uniform int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount) + { + const int lightIndex = lightIndexB + programIndex; float light_positionView_z = light_positionView_z_array[lightIndex]; float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; float light_attenuationEndNeg = -light_attenuationEnd; @@ -193,6 +200,7 @@ IntersectLightsWithTileMinMax( // don't actually need to mask the rest of this function - this is // just a greedy early-out. Could also structure all of this as // nested if() statements, but this a bit easier to read + bool active = false; if (any(inFrustum)) { float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_y = light_positionView_y_array[lightIndex]; @@ -214,12 +222,14 @@ IntersectLightsWithTileMinMax( inFrustum = inFrustum && (d >= light_attenuationEndNeg); // Pack and store intersecting lights - cif (inFrustum) { - tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], - lightIndex); + if (inFrustum) + active = true; } + if (lightIndex >= numLights) + active = false; + + tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex); } - } return tileNumLights; } @@ -285,7 +295,11 @@ ShadeTile( if (tileNumLights == 0 || visualizeLightCount) { uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); for (uniform int32 y = tileStartY; y < tileEndY; ++y) { - foreach (x = tileStartX ... tileEndX) { +// foreach (x = tileStartX ... tileEndX) + for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; + if (x >= tileEndX) continue; int32 framebufferIndex = (y * gBufferWidth + x); framebuffer_r[framebufferIndex] = c; framebuffer_g[framebufferIndex] = c; @@ -299,7 +313,10 @@ ShadeTile( for (uniform int32 y = tileStartY; y < tileEndY; ++y) { uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); - foreach (x = tileStartX ... tileEndX) { +// foreach (x = tileStartX ... tileEndX) { + for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; int32 gBufferOffset = y * gBufferWidth + x; // Reconstruct position and (negative) view vector from G-buffer diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc index 69986e63..251da1f2 100644 --- a/examples_cuda/deferred/kernels1.ispc +++ b/examples_cuda/deferred/kernels1.ispc @@ -131,7 +131,7 @@ ComputeZBounds( for (uniform int xb = tileStartX; xb < tileEndX; xb += programCount) { const int x = xb + programIndex; - if (x >= tileEndX) continue; + if (x >= tileEndX) break; // Unproject depth buffer Z value into view space float z = zBuffer[y * gBufferWidth + x]; float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); @@ -197,48 +197,50 @@ IntersectLightsWithTileMinMax( for (uniform int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount) { const int lightIndex = lightIndexB + programIndex; - if (lightIndex >= numLights) continue; - float light_positionView_z = light_positionView_z_array[lightIndex]; - float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; - float light_attenuationEndNeg = -light_attenuationEnd; + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; - float d = light_positionView_z - minZ; - bool inFrustum = (d >= light_attenuationEndNeg); + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); - d = maxZ - light_positionView_z; + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // This seems better than cif(!inFrustum) ccontinue; here since we + // don't actually need to mask the rest of this function - this is + // just a greedy early-out. Could also structure all of this as + // nested if() statements, but this a bit easier to read + bool active = false; + if (any(inFrustum)) { + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; inFrustum = inFrustum && (d >= light_attenuationEndNeg); - - // This seems better than cif(!inFrustum) ccontinue; here since we - // don't actually need to mask the rest of this function - this is - // just a greedy early-out. Could also structure all of this as - // nested if() statements, but this a bit easier to read - if (any(inFrustum)) { - float light_positionView_x = light_positionView_x_array[lightIndex]; - float light_positionView_y = light_positionView_y_array[lightIndex]; - d = light_positionView_z * frustumPlanes_z[0] + - light_positionView_x * frustumPlanes_xy[0]; - inFrustum = inFrustum && (d >= light_attenuationEndNeg); + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); - d = light_positionView_z * frustumPlanes_z[1] + - light_positionView_x * frustumPlanes_xy[1]; - inFrustum = inFrustum && (d >= light_attenuationEndNeg); + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); - d = light_positionView_z * frustumPlanes_z[2] + - light_positionView_y * frustumPlanes_xy[2]; - inFrustum = inFrustum && (d >= light_attenuationEndNeg); + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); - d = light_positionView_z * frustumPlanes_z[3] + - light_positionView_y * frustumPlanes_xy[3]; - inFrustum = inFrustum && (d >= light_attenuationEndNeg); - - // Pack and store intersecting lights - if (inFrustum) { - tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], - lightIndex); - } - } + // Pack and store intersecting lights + if (inFrustum) + active = true; + } + if (lightIndex >= numLights) + active = false; + + tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex); } return tileNumLights; diff --git a/examples_cuda/deferred/main_cu.cpp b/examples_cuda/deferred/main_cu.cpp index 930db971..df96b209 100755 --- a/examples_cuda/deferred/main_cu.cpp +++ b/examples_cuda/deferred/main_cu.cpp @@ -251,7 +251,7 @@ extern "C" assert(module_1 != NULL); assert(func_name != NULL); assert(func_args != NULL); -#if 0 +#if 1 const char * module = module_1; #else const std::vector module_str = readBinary("kernel.cubin");