diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index ecd536a3..297b9779 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -175,8 +175,28 @@ declare double @__round_uniform_double(double) nounwind readnone declare double @__floor_uniform_double(double) nounwind readnone declare double @__ceil_uniform_double(double) nounwind readnone -declare @__round_varying_float() nounwind readnone -declare @__floor_varying_float() nounwind readnone +define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32> + %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, + %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float> + %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, + %binop21.i = fadd <1 x float> %binop.i, + %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32> + %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float> + ret <1 x float> %int_to_float_bitcast.i.i.i +} +define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind + %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32> + %bitop.i = and <1 x i32> %val_to_boolvec32.i, + %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float> + %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <1 x float> %binop.i +} + declare @__ceil_varying_float() nounwind readnone declare @__round_varying_double() nounwind readnone @@ -246,10 +266,40 @@ define double @__min_uniform_double(double, double) nounwind readonly alwaysinl ;; min/max uniform -declare @__max_varying_float(, ) nounwind readnone -declare @__min_varying_float(, ) nounwind readnone -declare @__min_varying_int32(, ) nounwind readnone -declare @__max_varying_int32(, ) nounwind readnone +;; /* float */ +define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline { + %a = extractelement <1 x float> %0, i32 0 + %b = extractelement <1 x float> %1, i32 0 + %r = call float @__max_uniform_float(float %a, float %b) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret <1 x float> %rv +} +define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline { + %a = extractelement <1 x float> %0, i32 0 + %b = extractelement <1 x float> %1, i32 0 + %r = call float @__min_uniform_float(float %a, float %b) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret <1 x float> %rv + +} + +;; /* int32 */ +define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline { + %a = extractelement <1 x i32> %0, i32 0 + %b = extractelement <1 x i32> %1, i32 0 + %r = call i32 @__max_uniform_int32(i32 %a, i32 %b) + %rv = insertelement <1 x i32> undef, i32 %r, i32 0 + ret <1 x i32> %rv +} +define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline { + %a = extractelement <1 x i32> %0, i32 0 + %b = extractelement <1 x i32> %1, i32 0 + %r = call i32 @__min_uniform_int32(i32 %a, i32 %b) + %rv = insertelement <1 x i32> undef, i32 %r, i32 0 + ret <1 x i32> %rv +} + +;; /* uint32 */ declare @__min_varying_uint32(, ) nounwind readnone declare @__max_varying_uint32(, ) nounwind readnone ;; declare @__min_varying_int64(, ) nounwind readnone @@ -289,7 +339,13 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline declare @__rcp_varying_float() nounwind readnone declare @__rsqrt_varying_float() nounwind readnone -declare @__sqrt_varying_float() nounwind readnone +define @__sqrt_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__sqrt_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} ;; declare double @__sqrt_uniform_double(double) nounwind readnone define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline { diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.o b/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.o deleted file mode 100644 index d3eb8d1d..00000000 Binary files a/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.o and /dev/null differ diff --git a/examples_cuda/volume_rendering/Makefile b/examples_cuda/volume_rendering/Makefile index 7bb86e10..ca2d0958 100644 --- a/examples_cuda/volume_rendering/Makefile +++ b/examples_cuda/volume_rendering/Makefile @@ -2,7 +2,6 @@ EXAMPLE=volume CPP_SRC=volume.cpp volume_serial.cpp ISPC_SRC=volume.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx -ISPC_ARM_TARGETS=neon +ISPC_IA_TARGETS=avx include ../common.mk diff --git a/examples_cuda/volume_rendering/volume.ispc b/examples_cuda/volume_rendering/volume.ispc index a60c6a42..ef79ab85 100644 --- a/examples_cuda/volume_rendering/volume.ispc +++ b/examples_cuda/volume_rendering/volume.ispc @@ -31,6 +31,17 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#ifdef __NVPTX__ +#warning "emitting DEVICE code" +#define taskIndex blockIndex0() +#define taskCount blockCount0() +#define programIndex laneIndex() +#define programCount warpSize() +#else +#warning "emitting HOST code" +#endif + + typedef float<3> float3; struct Ray { @@ -236,7 +247,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) { // terminate once attenuation is high float atten = exp(-tau); if (atten < .005) - cbreak; + break; // direct lighting float Li = lightIntensity / distanceSquared(lightPos, pos) * diff --git a/examples_cuda/volume_rendering/volume1.ispc b/examples_cuda/volume_rendering/volume1.ispc new file mode 100644 index 00000000..67f658b7 --- /dev/null +++ b/examples_cuda/volume_rendering/volume1.ispc @@ -0,0 +1,410 @@ +/* + Copyright (c) 2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef __NVPTX__ +#warning "emitting DEVICE code" +#define taskIndex blockIndex0() +#define taskCount blockCount0() +#define programIndex laneIndex() +#define programCount warpSize() +#else +#warning "emitting HOST code" +#endif + + +typedef float<3> float3; + +struct Ray { + float3 origin, dir; +}; + + +static inline void +generateRay(const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + float x, float y, Ray &ray) { + // transform raster coordinate (x, y, 0) to camera space + float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; + float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; + float camz = raster2camera[2][3]; + float camw = raster2camera[3][3]; + camx /= camw; + camy /= camw; + camz /= camw; + + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz; + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz; + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz; + + ray.origin.x = camera2world[0][3] / camera2world[3][3]; + ray.origin.y = camera2world[1][3] / camera2world[3][3]; + ray.origin.z = camera2world[2][3] / camera2world[3][3]; +} + + +static inline bool +Inside(float3 p, float3 pMin, float3 pMax) { + return (p.x >= pMin.x && p.x <= pMax.x && + p.y >= pMin.y && p.y <= pMax.y && + p.z >= pMin.z && p.z <= pMax.z); +} + + +static inline bool +IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) { + float t0 = -1e30, t1 = 1e30; + + float3 tNear = (pMin - ray.origin) / ray.dir; + float3 tFar = (pMax - ray.origin) / ray.dir; + if (tNear.x > tFar.x) { + float tmp = tNear.x; + tNear.x = tFar.x; + tFar.x = tmp; + } + t0 = max(tNear.x, t0); + t1 = min(tFar.x, t1); + + if (tNear.y > tFar.y) { + float tmp = tNear.y; + tNear.y = tFar.y; + tFar.y = tmp; + } + t0 = max(tNear.y, t0); + t1 = min(tFar.y, t1); + + if (tNear.z > tFar.z) { + float tmp = tNear.z; + tNear.z = tFar.z; + tFar.z = tmp; + } + t0 = max(tNear.z, t0); + t1 = min(tFar.z, t1); + + if (t0 <= t1) { + hit0 = t0; + hit1 = t1; + return true; + } + else + return false; +} + + +static inline float Lerp(float t, float a, float b) { + return (1.f - t) * a + t * b; +} + + +static inline float D(int x, int y, int z, uniform int nVoxels[3], + uniform float density[]) { + x = clamp(x, 0, nVoxels[0]-1); + y = clamp(y, 0, nVoxels[1]-1); + z = clamp(z, 0, nVoxels[2]-1); + + return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x]; +} + + +static inline float3 Offset(float3 p, float3 pMin, float3 pMax) { + return (p - pMin) / (pMax - pMin); +} + + +static inline float Density(float3 Pobj, float3 pMin, float3 pMax, + uniform float density[], uniform int nVoxels[3]) { + if (!Inside(Pobj, pMin, pMax)) + return 0; + // Compute voxel coordinates and offsets for _Pobj_ + float3 vox = Offset(Pobj, pMin, pMax); + vox.x = vox.x * nVoxels[0] - .5f; + vox.y = vox.y * nVoxels[1] - .5f; + vox.z = vox.z * nVoxels[2] - .5f; + int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z); + float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz; + + // Trilinearly interpolate density values to compute local density + float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density), + D(vx+1, vy, vz, nVoxels, density)); + float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density), + D(vx+1, vy+1, vz, nVoxels, density)); + float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density), + D(vx+1, vy, vz+1, nVoxels, density)); + float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), + D(vx+1, vy+1, vz+1, nVoxels, density)); + float d0 = Lerp(dy, d00, d10); + float d1 = Lerp(dy, d01, d11); + return Lerp(dz, d0, d1); +} + + +/* Returns the transmittance between two points p0 and p1, in a volume + with extent (pMin,pMax) with transmittance coefficient sigma_t, + defined by nVoxels[3] voxels in each dimension in the given density + array. */ +static inline float +transmittance(uniform float3 p0, float3 p1, uniform float3 pMin, + uniform float3 pMax, uniform float sigma_t, + uniform float density[], uniform int nVoxels[3]) { + float rayT0, rayT1; + Ray ray; + ray.origin = p1; + ray.dir = p0 - p1; + + // Find the parametric t range along the ray that is inside the volume. + if (!IntersectP(ray, pMin, pMax, rayT0, rayT1)) + return 1.; + + rayT0 = max(rayT0, 0.f); + + // Accumulate beam transmittance in tau + float tau = 0; + float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y + + ray.dir.z * ray.dir.z); + uniform float stepDist = 0.2; + float stepT = stepDist / rayLength; + + float t = rayT0; + float3 pos = ray.origin + ray.dir * rayT0; + float3 dirStep = ray.dir * stepT; + while (t < rayT1) { + tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels); + pos = pos + dirStep; + t += stepT; + } + + return exp(-tau); +} + + +static inline float +distanceSquared(float3 a, float3 b) { + float3 d = a-b; + return d.x*d.x + d.y*d.y + d.z*d.z; +} + + +static inline float +raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) { + float rayT0, rayT1; + uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8}; + uniform float3 lightPos = { -1, 4, 1.5 }; + + if (!IntersectP(ray, pMin, pMax, rayT0, rayT1)) + return 0.; + + rayT0 = max(rayT0, 0.f); + + // Parameters that define the volume scattering characteristics and + // sampling rate for raymarching + uniform float Le = .25; // Emission coefficient + uniform float sigma_a = 10; // Absorption coefficient + uniform float sigma_s = 10; // Scattering coefficient + uniform float stepDist = 0.025; // Ray step amount + uniform float lightIntensity = 40; // Light source intensity + + float tau = 0.f; // accumulated beam transmittance + float L = 0; // radiance along the ray + float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y + + ray.dir.z * ray.dir.z); + float stepT = stepDist / rayLength; + + float t = rayT0; + float3 pos = ray.origin + ray.dir * rayT0; + float3 dirStep = ray.dir * stepT; + while (t < rayT1) + { + float d = Density(pos, pMin, pMax, density, nVoxels); + + // terminate once attenuation is high + float atten = exp(-tau); + if (atten < .005) + break; + + // direct lighting + float Li = lightIntensity / distanceSquared(lightPos, pos) * + transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s, + density, nVoxels); + L += stepDist * atten * d * sigma_s * (Li + Le); + + // update beam transmittance + tau += stepDist * (sigma_a + sigma_s) * d; + + pos = pos + dirStep; + t += stepT; + } + + // Gamma correction + return pow(L, 1.f / 2.2f); +} + + +/* Utility routine used by both the task-based and the single-core entrypoints. + Renders a tile of the image, covering [x0,x0) * [y0, y1), storing the + result into the image[] array. + */ +static inline void +volume_tile(uniform int x0, uniform int y0, uniform int x1, + uniform int y1, uniform float density[], uniform int nVoxels[3], + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform int width, uniform int height, uniform float image[]) { + // Work on 4x4=16 pixel big tiles of the image. This function thus + // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble + // by 4. + for (uniform int y = y0; y < y1; y += 4) { + for (uniform int x = x0; x < x1; x += 4) { +// foreach (o = 0 ... 16) { + for (int ob = 0; ob < 32; ob += programCount) + { + const int o = ob + programIndex; + if (o >= 16) continue; + + + // These two arrays encode the mapping from [0,15] to + // offsets within the 4x4 pixel block so that we render + // each pixel inside the block + const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3, + 0, 1, 0, 1, 2, 3, 2, 3 }; + const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1, + 2, 2, 3, 3, 2, 2, 3, 3 }; + + // Figure out the pixel to render for this program instance + int xo = x + xoffsets[o], yo = y + yoffsets[o]; + + // Use viewing parameters to compute the corresponding ray + // for the pixel + Ray ray; + generateRay(raster2camera, camera2world, xo, yo, ray); + + // And raymarch through the volume to compute the pixel's + // value + int offset = yo * width + xo; + image[offset] = raymarch(density, nVoxels, ray); + } + } + } +} + + +task void +volume_task(uniform float density[], uniform int _nVoxels[3], + const uniform float _raster2camera[4][4], + const uniform float _camera2world[4][4], + uniform int width, uniform int height, uniform float image[]) +{ + if (taskIndex >= taskCount) return; + +#if 1 + uniform int nVoxels[3]; + nVoxels[0] = _nVoxels[0]; + nVoxels[1] = _nVoxels[1]; + nVoxels[2] = _nVoxels[2]; + + uniform float raster2camera[4][4]; + raster2camera[0][0] = _raster2camera[0][0]; + raster2camera[0][1] = _raster2camera[0][1]; + raster2camera[0][2] = _raster2camera[0][2]; + raster2camera[0][3] = _raster2camera[0][3]; + raster2camera[1][0] = _raster2camera[1][0]; + raster2camera[1][1] = _raster2camera[1][1]; + raster2camera[1][2] = _raster2camera[1][2]; + raster2camera[1][3] = _raster2camera[1][3]; + raster2camera[2][0] = _raster2camera[2][0]; + raster2camera[2][1] = _raster2camera[2][1]; + raster2camera[2][2] = _raster2camera[2][2]; + raster2camera[2][3] = _raster2camera[2][3]; + raster2camera[3][0] = _raster2camera[3][0]; + raster2camera[3][1] = _raster2camera[3][1]; + raster2camera[3][2] = _raster2camera[3][2]; + raster2camera[3][3] = _raster2camera[3][3]; + + uniform float camera2world[4][4]; + camera2world[0][0] = _camera2world[0][0]; + camera2world[0][1] = _camera2world[0][1]; + camera2world[0][2] = _camera2world[0][2]; + camera2world[0][3] = _camera2world[0][3]; + camera2world[1][0] = _camera2world[1][0]; + camera2world[1][1] = _camera2world[1][1]; + camera2world[1][2] = _camera2world[1][2]; + camera2world[1][3] = _camera2world[1][3]; + camera2world[2][0] = _camera2world[2][0]; + camera2world[2][1] = _camera2world[2][1]; + camera2world[2][2] = _camera2world[2][2]; + camera2world[2][3] = _camera2world[2][3]; + camera2world[3][0] = _camera2world[3][0]; + camera2world[3][1] = _camera2world[3][1]; + camera2world[3][2] = _camera2world[3][2]; + camera2world[3][3] = _camera2world[3][3]; +#else +#define nVoxels _nVoxels +#define raster2camera _raster2camera +#define camera2world _camera2world +#endif + + uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks + uniform int xbuckets = (width + (dx-1)) / dx; + uniform int ybuckets = (height + (dy-1)) / dy; + + uniform int x0 = (taskIndex % xbuckets) * dx; + uniform int y0 = (taskIndex / xbuckets) * dy; + uniform int x1 = x0 + dx, y1 = y0 + dy; + x1 = min(x1, width); + y1 = min(y1, height); + + volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera, + camera2world, width, height, image); +} + + +export void +volume_ispc(uniform float density[], uniform int nVoxels[3], + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform int width, uniform int height, uniform float image[]) { + volume_tile(0, 0, width, height, density, nVoxels, raster2camera, + camera2world, width, height, image); +} + + +export void +volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform int width, uniform int height, uniform float image[]) { + // Launch tasks to work on (dx,dy)-sized tiles of the image + uniform int dx = 8, dy = 8; + uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy); + print("nTasks= %\n", nTasks); + launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world, + width, height, image); +} diff --git a/examples_cuda/volume_rendering/volume_cu.cpp b/examples_cuda/volume_rendering/volume_cu.cpp new file mode 100644 index 00000000..b020d9a7 --- /dev/null +++ b/examples_cuda/volume_rendering/volume_cu.cpp @@ -0,0 +1,478 @@ +/* + Copyright (c) 2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include "../timing.h" +#include "volume_ispc.h" +using namespace ispc; + +#include +#include +#include +#include "drvapi_error_string.h" + +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) +// These are the inline versions for all of the SDK helper functions +void __checkCudaErrors(CUresult err, const char *file, const int line) { + if(CUDA_SUCCESS != err) { + std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" + << getCudaDrvErrorString(err) << "\" from file <" << file + << ", line " << line << "\n"; + exit(-1); + } +} + +/**********************/ +/* Basic CUDriver API */ +CUcontext context; + +void createContext(const int deviceId = 0) +{ + CUdevice device; + int devCount; + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&devCount)); + assert(devCount > 0); + checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + std::cout << "Using CUDA Device [0]: " << name << "\n"; + + int devMajor, devMinor; + checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); + std::cout << "Device Compute Capability: " + << devMajor << "." << devMinor << "\n"; + if (devMajor < 2) { + std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; + exit(1); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); +} +void destroyContext() +{ + checkCudaErrors(cuCtxDestroy(context)); +} + +CUmodule loadModule(const char * module) +{ + CUmodule cudaModule; + // in this branch we use compilation with parameters + const unsigned int jitNumOptions = 1; + CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + void **jitOptVals = new void*[jitNumOptions]; + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[0] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 64; + jitOptVals[0] = (void *)(size_t)jitRegCount; + +#if 0 + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + int jitLogBufferSize = 1024; + jitOptVals[0] = (void *)(size_t)jitLogBufferSize; + + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char *jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; + + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[2] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 32; + jitOptVals[2] = (void *)(size_t)jitRegCount; +#endif + + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals)); + return cudaModule; +} +void unloadModule(CUmodule &cudaModule) +{ + checkCudaErrors(cuModuleUnload(cudaModule)); +} + +CUfunction getFunction(CUmodule &cudaModule, const char * function) +{ + CUfunction cudaFunction; + checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function)); + return cudaFunction; +} + +CUdeviceptr deviceMalloc(const size_t size) +{ + CUdeviceptr d_buf; + checkCudaErrors(cuMemAlloc(&d_buf, size)); + return d_buf; +} +void deviceFree(CUdeviceptr d_buf) +{ + checkCudaErrors(cuMemFree(d_buf)); +} +void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size)); +} +void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); +} +#define deviceLaunch(func,nbx,nby,nbz,params) \ + checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \ + checkCudaErrors( \ + cuLaunchKernel( \ + (func), \ + ((nbx-1)/(128/32)+1), (nby), (nbz), \ + 128, 1, 1, \ + 0, NULL, (params), NULL \ + )); + +typedef CUdeviceptr devicePtr; + + +/**************/ +#include +std::vector readBinary(const char * filename) +{ + std::vector buffer; + FILE *fp = fopen(filename, "rb"); + if (!fp ) + { + fprintf(stderr, "file %s not found\n", filename); + assert(0); + } +#if 0 + char c; + while ((c = fgetc(fp)) != EOF) + buffer.push_back(c); +#else + fseek(fp, 0, SEEK_END); + const unsigned long long size = ftell(fp); /*calc the size needed*/ + fseek(fp, 0, SEEK_SET); + buffer.resize(size); + + if (fp == NULL){ /*ERROR detection if file == empty*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); + exit(1); + } + else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); + exit(1); + } +#endif + fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); + return buffer; +} + +extern "C" +{ + + void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment) + { + return NULL; + } + void CUDALaunch( + void **handlePtr, + const char * module_name, + const char * module_1, + const char * func_name, + void **func_args, + int countx, int county, int countz) + { + assert(module_name != NULL); + assert(module_1 != NULL); + assert(func_name != NULL); + assert(func_args != NULL); +#if 1 + const char * module = module_1; +#else + const std::vector module_str = readBinary("kernel.cubin"); + const char * module = &module_str[0]; +#endif + CUmodule cudaModule = loadModule(module); + CUfunction cudaFunction = getFunction(cudaModule, func_name); + deviceLaunch(cudaFunction, countx, county, countz, func_args); + unloadModule(cudaModule); + } + void CUDASync(void *handle) + { + checkCudaErrors(cuStreamSynchronize(0)); + } + void ISPCSync(void *handle) + { + checkCudaErrors(cuStreamSynchronize(0)); + } + void CUDAFree(void *handle) + { + } +} + +extern void volume_serial(float density[], int nVoxels[3], + const float raster2camera[4][4], + const float camera2world[4][4], + int width, int height, float image[]); + +/* Write a PPM image file with the image */ +static void +writePPM(float *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + float v = buf[i] * 255.f; + if (v < 0.f) v = 0.f; + else if (v > 255.f) v = 255.f; + unsigned char c = (unsigned char)v; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); + printf("Wrote image file %s\n", fn); +} + + +/* Load image and viewing parameters from a camera data file. + FIXME: we should add support to be able to specify viewing parameters + in the program here directly. */ +static void +loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4], + float camera2world[4][4]) { + FILE *f = fopen(fn, "r"); + if (!f) { + perror(fn); + exit(1); + } + if (fscanf(f, "%d %d", width, height) != 2) { + fprintf(stderr, "Unexpected end of file in camera file\n"); + exit(1); + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + if (fscanf(f, "%f", &raster2camera[i][j]) != 1) { + fprintf(stderr, "Unexpected end of file in camera file\n"); + exit(1); + } + } + } + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + if (fscanf(f, "%f", &camera2world[i][j]) != 1) { + fprintf(stderr, "Unexpected end of file in camera file\n"); + exit(1); + } + } + } + fclose(f); +} + + +/* Load a volume density file. Expects the number of x, y, and z samples + as the first three values (as integer strings), then x*y*z + floating-point values (also as strings) to give the densities. */ +static float * +loadVolume(const char *fn, int n[3]) { + FILE *f = fopen(fn, "r"); + if (!f) { + perror(fn); + exit(1); + } + + if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) { + fprintf(stderr, "Couldn't find resolution at start of density file\n"); + exit(1); + } + + int count = n[0] * n[1] * n[2]; + float *v = new float[count]; + for (int i = 0; i < count; ++i) { + if (fscanf(f, "%f", &v[i]) != 1) { + fprintf(stderr, "Unexpected end of file at %d'th density value\n", i); + exit(1); + } + } + + return v; +} + + +int main(int argc, char *argv[]) { + if (argc != 3) { + fprintf(stderr, "usage: volume \n"); + return 1; + } + + // + // Load viewing data and the volume density data + // + int width, height; + float raster2camera[4][4], camera2world[4][4]; + loadCamera(argv[1], &width, &height, raster2camera, camera2world); + float *image = new float[width*height]; + + int n[3]; + float *density = loadVolume(argv[2], n); + + /*******************/ + createContext(); + /*******************/ + + devicePtr d_raster2camera = deviceMalloc(4*4*sizeof(float)); + devicePtr d_camera2world = deviceMalloc(4*4*sizeof(float)); + devicePtr d_n = deviceMalloc(3*sizeof(int)); + devicePtr d_density = deviceMalloc(n[0]*n[1]*n[2]*sizeof(float)); + devicePtr d_image = deviceMalloc(width*height*sizeof(float)); + + + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; +#if 0 + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + volume_ispc(density, n, raster2camera, camera2world, + width, height, image); + double dt = get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + for (int i = 0; i < width*height; i += 4) + { + if (image[i] != 0.0f) + { + fprintf(stderr, " i= %d image= %g %g %g %g \n", i, + image[i+0], + image[i+1], + image[i+2], + image[i+3]); + break; + } + } + + printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC); + writePPM(image, width, height, "volume-ispc-1core.ppm"); +#endif + + + // Clear out the buffer + for (int i = 0; i < width * height; ++i) + image[i] = 0.; + + memcpyH2D(d_raster2camera, raster2camera, 4*4*sizeof(float)); + memcpyH2D(d_camera2world, camera2world, 4*4*sizeof(float)); + memcpyH2D(d_n, n, 3*sizeof(int)); + memcpyH2D(d_density, density, n[0]*n[1]*n[2]*sizeof(float)); + memcpyH2D(d_image, image, width*height*sizeof(float)); + + // + // Compute the image using the ispc implementation that also uses + // tasks; report the minimum time of three runs. + // + double minISPCtasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + volume_ispc_tasks( + (float*)d_density, + (int*)d_n, + (float(*)[4])d_raster2camera, + (float(*)[4])d_camera2world, + width, height, + (float*)d_image); + double dt = get_elapsed_mcycles(); + minISPCtasks = std::min(minISPCtasks, dt); + } + + memcpyD2H(image, d_image, width*height*sizeof(float)); + for (int i = 0; i < width*height; i += 4) + { + if (image[i] != 0.0f) + { + fprintf(stderr, " i= %d image= %g %g %g %g \n", i, + image[i+0], + image[i+1], + image[i+2], + image[i+3]); + break; + } + } + + printf("[volume ispc + tasks]:\t\t[%.3f] million cycles\n", minISPCtasks); + writePPM(image, width, height, "volume-cuda.ppm"); + +#if 0 + // Clear out the buffer + for (int i = 0; i < width * height; ++i) + image[i] = 0.; + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + volume_serial(density, n, raster2camera, camera2world, + width, height, image); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } + + printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial); + writePPM(image, width, height, "volume-serial.ppm"); + + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", + minSerial/minISPC, minSerial / minISPCtasks); +#else + printf("\t\t\t\t %.2fx speedup from ISPC + tasks)\n", + minISPC / minISPCtasks); +#endif + + /*******************/ + destroyContext(); + /*******************/ + + return 0; +} diff --git a/examples_cuda/volume_rendering/volume_ispc.bc b/examples_cuda/volume_rendering/volume_ispc.bc deleted file mode 100644 index 4a7c47b9..00000000 Binary files a/examples_cuda/volume_rendering/volume_ispc.bc and /dev/null differ diff --git a/examples_cuda/volume_rendering/volume_ispc_avx.bc b/examples_cuda/volume_rendering/volume_ispc_avx.bc deleted file mode 100644 index 9b0e4e5f..00000000 Binary files a/examples_cuda/volume_rendering/volume_ispc_avx.bc and /dev/null differ diff --git a/examples_cuda/volume_rendering/volume_ispc_nvptx64.bc b/examples_cuda/volume_rendering/volume_ispc_nvptx64.bc deleted file mode 100644 index 62a9265e..00000000 Binary files a/examples_cuda/volume_rendering/volume_ispc_nvptx64.bc and /dev/null differ diff --git a/ispc.cpp b/ispc.cpp index b83f33a0..9f12a62d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -665,7 +665,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo this->m_hasHalf = false; this->m_maskingIsFree = true; this->m_maskBitCount = 1; - this->m_hasTranscendentals = true; + this->m_hasTranscendentals = false; //true; this->m_hasGather = this->m_hasScatter = false; } else {