added volume rendering to run on GPU

This commit is contained in:
Evghenii
2013-11-08 13:57:16 +01:00
parent 348100ba42
commit b3c68af40a
10 changed files with 965 additions and 11 deletions

View File

@@ -175,8 +175,28 @@ declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone
declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
%bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
%bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
%binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
%binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
%bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
ret <1 x float> %int_to_float_bitcast.i.i.i
}
define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
%bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <1 x float> %binop.i
}
declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
@@ -246,10 +266,40 @@ define double @__min_uniform_double(double, double) nounwind readonly alwaysinl
;; min/max uniform
declare <WIDTH x float> @__max_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__min_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
;; /* float */
define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%r = call float @__max_uniform_float(float %a, float %b)
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%r = call float @__min_uniform_float(float %a, float %b)
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
;; /* int32 */
define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%a = extractelement <1 x i32> %0, i32 0
%b = extractelement <1 x i32> %1, i32 0
%r = call i32 @__max_uniform_int32(i32 %a, i32 %b)
%rv = insertelement <1 x i32> undef, i32 %r, i32 0
ret <1 x i32> %rv
}
define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%a = extractelement <1 x i32> %0, i32 0
%b = extractelement <1 x i32> %1, i32 0
%r = call i32 @__min_uniform_int32(i32 %a, i32 %b)
%rv = insertelement <1 x i32> undef, i32 %r, i32 0
ret <1 x i32> %rv
}
;; /* uint32 */
declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
;; declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
@@ -289,7 +339,13 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline
{
%v = extractelement <1 x float> %0, i32 0
%r = call float @__sqrt_uniform_float(float %v)
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <WIDTH x float> %rv
}
;; declare double @__sqrt_uniform_double(double) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {

View File

@@ -2,7 +2,6 @@
EXAMPLE=volume
CPP_SRC=volume.cpp volume_serial.cpp
ISPC_SRC=volume.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx
ISPC_ARM_TARGETS=neon
ISPC_IA_TARGETS=avx
include ../common.mk

View File

@@ -31,6 +31,17 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __NVPTX__
#warning "emitting DEVICE code"
#define taskIndex blockIndex0()
#define taskCount blockCount0()
#define programIndex laneIndex()
#define programCount warpSize()
#else
#warning "emitting HOST code"
#endif
typedef float<3> float3;
struct Ray {
@@ -236,7 +247,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
// terminate once attenuation is high
float atten = exp(-tau);
if (atten < .005)
cbreak;
break;
// direct lighting
float Li = lightIntensity / distanceSquared(lightPos, pos) *

View File

@@ -0,0 +1,410 @@
/*
Copyright (c) 2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __NVPTX__
#warning "emitting DEVICE code"
#define taskIndex blockIndex0()
#define taskCount blockCount0()
#define programIndex laneIndex()
#define programCount warpSize()
#else
#warning "emitting HOST code"
#endif
typedef float<3> float3;
struct Ray {
float3 origin, dir;
};
static inline void
generateRay(const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
float x, float y, Ray &ray) {
// transform raster coordinate (x, y, 0) to camera space
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
float camz = raster2camera[2][3];
float camw = raster2camera[3][3];
camx /= camw;
camy /= camw;
camz /= camw;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
ray.origin.x = camera2world[0][3] / camera2world[3][3];
ray.origin.y = camera2world[1][3] / camera2world[3][3];
ray.origin.z = camera2world[2][3] / camera2world[3][3];
}
static inline bool
Inside(float3 p, float3 pMin, float3 pMax) {
return (p.x >= pMin.x && p.x <= pMax.x &&
p.y >= pMin.y && p.y <= pMax.y &&
p.z >= pMin.z && p.z <= pMax.z);
}
static inline bool
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
float t0 = -1e30, t1 = 1e30;
float3 tNear = (pMin - ray.origin) / ray.dir;
float3 tFar = (pMax - ray.origin) / ray.dir;
if (tNear.x > tFar.x) {
float tmp = tNear.x;
tNear.x = tFar.x;
tFar.x = tmp;
}
t0 = max(tNear.x, t0);
t1 = min(tFar.x, t1);
if (tNear.y > tFar.y) {
float tmp = tNear.y;
tNear.y = tFar.y;
tFar.y = tmp;
}
t0 = max(tNear.y, t0);
t1 = min(tFar.y, t1);
if (tNear.z > tFar.z) {
float tmp = tNear.z;
tNear.z = tFar.z;
tFar.z = tmp;
}
t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1);
if (t0 <= t1) {
hit0 = t0;
hit1 = t1;
return true;
}
else
return false;
}
static inline float Lerp(float t, float a, float b) {
return (1.f - t) * a + t * b;
}
static inline float D(int x, int y, int z, uniform int nVoxels[3],
uniform float density[]) {
x = clamp(x, 0, nVoxels[0]-1);
y = clamp(y, 0, nVoxels[1]-1);
z = clamp(z, 0, nVoxels[2]-1);
return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
}
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
return (p - pMin) / (pMax - pMin);
}
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax))
return 0;
// Compute voxel coordinates and offsets for _Pobj_
float3 vox = Offset(Pobj, pMin, pMax);
vox.x = vox.x * nVoxels[0] - .5f;
vox.y = vox.y * nVoxels[1] - .5f;
vox.z = vox.z * nVoxels[2] - .5f;
int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density));
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density));
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density));
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density));
float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11);
return Lerp(dz, d0, d1);
}
/* Returns the transmittance between two points p0 and p1, in a volume
with extent (pMin,pMax) with transmittance coefficient sigma_t,
defined by nVoxels[3] voxels in each dimension in the given density
array. */
static inline float
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
uniform float3 pMax, uniform float sigma_t,
uniform float density[], uniform int nVoxels[3]) {
float rayT0, rayT1;
Ray ray;
ray.origin = p1;
ray.dir = p0 - p1;
// Find the parametric t range along the ray that is inside the volume.
if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
return 1.;
rayT0 = max(rayT0, 0.f);
// Accumulate beam transmittance in tau
float tau = 0;
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
ray.dir.z * ray.dir.z);
uniform float stepDist = 0.2;
float stepT = stepDist / rayLength;
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
while (t < rayT1) {
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
pos = pos + dirStep;
t += stepT;
}
return exp(-tau);
}
static inline float
distanceSquared(float3 a, float3 b) {
float3 d = a-b;
return d.x*d.x + d.y*d.y + d.z*d.z;
}
static inline float
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float rayT0, rayT1;
uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
uniform float3 lightPos = { -1, 4, 1.5 };
if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
return 0.;
rayT0 = max(rayT0, 0.f);
// Parameters that define the volume scattering characteristics and
// sampling rate for raymarching
uniform float Le = .25; // Emission coefficient
uniform float sigma_a = 10; // Absorption coefficient
uniform float sigma_s = 10; // Scattering coefficient
uniform float stepDist = 0.025; // Ray step amount
uniform float lightIntensity = 40; // Light source intensity
float tau = 0.f; // accumulated beam transmittance
float L = 0; // radiance along the ray
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
ray.dir.z * ray.dir.z);
float stepT = stepDist / rayLength;
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
while (t < rayT1)
{
float d = Density(pos, pMin, pMax, density, nVoxels);
// terminate once attenuation is high
float atten = exp(-tau);
if (atten < .005)
break;
// direct lighting
float Li = lightIntensity / distanceSquared(lightPos, pos) *
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
density, nVoxels);
L += stepDist * atten * d * sigma_s * (Li + Le);
// update beam transmittance
tau += stepDist * (sigma_a + sigma_s) * d;
pos = pos + dirStep;
t += stepT;
}
// Gamma correction
return pow(L, 1.f / 2.2f);
}
/* Utility routine used by both the task-based and the single-core entrypoints.
Renders a tile of the image, covering [x0,x0) * [y0, y1), storing the
result into the image[] array.
*/
static inline void
volume_tile(uniform int x0, uniform int y0, uniform int x1,
uniform int y1, uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
// Work on 4x4=16 pixel big tiles of the image. This function thus
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
// by 4.
for (uniform int y = y0; y < y1; y += 4) {
for (uniform int x = x0; x < x1; x += 4) {
// foreach (o = 0 ... 16) {
for (int ob = 0; ob < 32; ob += programCount)
{
const int o = ob + programIndex;
if (o >= 16) continue;
// These two arrays encode the mapping from [0,15] to
// offsets within the 4x4 pixel block so that we render
// each pixel inside the block
const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
0, 1, 0, 1, 2, 3, 2, 3 };
const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
2, 2, 3, 3, 2, 2, 3, 3 };
// Figure out the pixel to render for this program instance
int xo = x + xoffsets[o], yo = y + yoffsets[o];
// Use viewing parameters to compute the corresponding ray
// for the pixel
Ray ray;
generateRay(raster2camera, camera2world, xo, yo, ray);
// And raymarch through the volume to compute the pixel's
// value
int offset = yo * width + xo;
image[offset] = raymarch(density, nVoxels, ray);
}
}
}
}
task void
volume_task(uniform float density[], uniform int _nVoxels[3],
const uniform float _raster2camera[4][4],
const uniform float _camera2world[4][4],
uniform int width, uniform int height, uniform float image[])
{
if (taskIndex >= taskCount) return;
#if 1
uniform int nVoxels[3];
nVoxels[0] = _nVoxels[0];
nVoxels[1] = _nVoxels[1];
nVoxels[2] = _nVoxels[2];
uniform float raster2camera[4][4];
raster2camera[0][0] = _raster2camera[0][0];
raster2camera[0][1] = _raster2camera[0][1];
raster2camera[0][2] = _raster2camera[0][2];
raster2camera[0][3] = _raster2camera[0][3];
raster2camera[1][0] = _raster2camera[1][0];
raster2camera[1][1] = _raster2camera[1][1];
raster2camera[1][2] = _raster2camera[1][2];
raster2camera[1][3] = _raster2camera[1][3];
raster2camera[2][0] = _raster2camera[2][0];
raster2camera[2][1] = _raster2camera[2][1];
raster2camera[2][2] = _raster2camera[2][2];
raster2camera[2][3] = _raster2camera[2][3];
raster2camera[3][0] = _raster2camera[3][0];
raster2camera[3][1] = _raster2camera[3][1];
raster2camera[3][2] = _raster2camera[3][2];
raster2camera[3][3] = _raster2camera[3][3];
uniform float camera2world[4][4];
camera2world[0][0] = _camera2world[0][0];
camera2world[0][1] = _camera2world[0][1];
camera2world[0][2] = _camera2world[0][2];
camera2world[0][3] = _camera2world[0][3];
camera2world[1][0] = _camera2world[1][0];
camera2world[1][1] = _camera2world[1][1];
camera2world[1][2] = _camera2world[1][2];
camera2world[1][3] = _camera2world[1][3];
camera2world[2][0] = _camera2world[2][0];
camera2world[2][1] = _camera2world[2][1];
camera2world[2][2] = _camera2world[2][2];
camera2world[2][3] = _camera2world[2][3];
camera2world[3][0] = _camera2world[3][0];
camera2world[3][1] = _camera2world[3][1];
camera2world[3][2] = _camera2world[3][2];
camera2world[3][3] = _camera2world[3][3];
#else
#define nVoxels _nVoxels
#define raster2camera _raster2camera
#define camera2world _camera2world
#endif
uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks
uniform int xbuckets = (width + (dx-1)) / dx;
uniform int ybuckets = (height + (dy-1)) / dy;
uniform int x0 = (taskIndex % xbuckets) * dx;
uniform int y0 = (taskIndex / xbuckets) * dy;
uniform int x1 = x0 + dx, y1 = y0 + dy;
x1 = min(x1, width);
y1 = min(y1, height);
volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
camera2world, width, height, image);
}
export void
volume_ispc(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
camera2world, width, height, image);
}
export void
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
// Launch tasks to work on (dx,dy)-sized tiles of the image
uniform int dx = 8, dy = 8;
uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
print("nTasks= %\n", nTasks);
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
width, height, image);
}

View File

@@ -0,0 +1,478 @@
/*
Copyright (c) 2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <stdio.h>
#include <algorithm>
#include "../timing.h"
#include "volume_ispc.h"
using namespace ispc;
#include <cassert>
#include <iostream>
#include <cuda.h>
#include "drvapi_error_string.h"
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
void __checkCudaErrors(CUresult err, const char *file, const int line) {
if(CUDA_SUCCESS != err) {
std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
<< getCudaDrvErrorString(err) << "\" from file <" << file
<< ", line " << line << "\n";
exit(-1);
}
}
/**********************/
/* Basic CUDriver API */
CUcontext context;
void createContext(const int deviceId = 0)
{
CUdevice device;
int devCount;
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGetCount(&devCount));
assert(devCount > 0);
checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
char name[128];
checkCudaErrors(cuDeviceGetName(name, 128, device));
std::cout << "Using CUDA Device [0]: " << name << "\n";
int devMajor, devMinor;
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
std::cout << "Device Compute Capability: "
<< devMajor << "." << devMinor << "\n";
if (devMajor < 2) {
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
exit(1);
}
// Create driver context
checkCudaErrors(cuCtxCreate(&context, 0, device));
}
void destroyContext()
{
checkCudaErrors(cuCtxDestroy(context));
}
CUmodule loadModule(const char * module)
{
CUmodule cudaModule;
// in this branch we use compilation with parameters
const unsigned int jitNumOptions = 1;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void*[jitNumOptions];
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[0] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 64;
jitOptVals[0] = (void *)(size_t)jitRegCount;
#if 0
// set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount;
#endif
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals));
return cudaModule;
}
void unloadModule(CUmodule &cudaModule)
{
checkCudaErrors(cuModuleUnload(cudaModule));
}
CUfunction getFunction(CUmodule &cudaModule, const char * function)
{
CUfunction cudaFunction;
checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
return cudaFunction;
}
CUdeviceptr deviceMalloc(const size_t size)
{
CUdeviceptr d_buf;
checkCudaErrors(cuMemAlloc(&d_buf, size));
return d_buf;
}
void deviceFree(CUdeviceptr d_buf)
{
checkCudaErrors(cuMemFree(d_buf));
}
void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
{
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
}
void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
{
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
}
#define deviceLaunch(func,nbx,nby,nbz,params) \
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
checkCudaErrors( \
cuLaunchKernel( \
(func), \
((nbx-1)/(128/32)+1), (nby), (nbz), \
128, 1, 1, \
0, NULL, (params), NULL \
));
typedef CUdeviceptr devicePtr;
/**************/
#include <vector>
std::vector<char> readBinary(const char * filename)
{
std::vector<char> buffer;
FILE *fp = fopen(filename, "rb");
if (!fp )
{
fprintf(stderr, "file %s not found\n", filename);
assert(0);
}
#if 0
char c;
while ((c = fgetc(fp)) != EOF)
buffer.push_back(c);
#else
fseek(fp, 0, SEEK_END);
const unsigned long long size = ftell(fp); /*calc the size needed*/
fseek(fp, 0, SEEK_SET);
buffer.resize(size);
if (fp == NULL){ /*ERROR detection if file == empty*/
fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
exit(1);
}
else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
exit(1);
}
#endif
fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
return buffer;
}
extern "C"
{
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
{
return NULL;
}
void CUDALaunch(
void **handlePtr,
const char * module_name,
const char * module_1,
const char * func_name,
void **func_args,
int countx, int county, int countz)
{
assert(module_name != NULL);
assert(module_1 != NULL);
assert(func_name != NULL);
assert(func_args != NULL);
#if 1
const char * module = module_1;
#else
const std::vector<char> module_str = readBinary("kernel.cubin");
const char * module = &module_str[0];
#endif
CUmodule cudaModule = loadModule(module);
CUfunction cudaFunction = getFunction(cudaModule, func_name);
deviceLaunch(cudaFunction, countx, county, countz, func_args);
unloadModule(cudaModule);
}
void CUDASync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void ISPCSync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void CUDAFree(void *handle)
{
}
}
extern void volume_serial(float density[], int nVoxels[3],
const float raster2camera[4][4],
const float camera2world[4][4],
int width, int height, float image[]);
/* Write a PPM image file with the image */
static void
writePPM(float *buf, int width, int height, const char *fn) {
FILE *fp = fopen(fn, "wb");
fprintf(fp, "P6\n");
fprintf(fp, "%d %d\n", width, height);
fprintf(fp, "255\n");
for (int i = 0; i < width*height; ++i) {
float v = buf[i] * 255.f;
if (v < 0.f) v = 0.f;
else if (v > 255.f) v = 255.f;
unsigned char c = (unsigned char)v;
for (int j = 0; j < 3; ++j)
fputc(c, fp);
}
fclose(fp);
printf("Wrote image file %s\n", fn);
}
/* Load image and viewing parameters from a camera data file.
FIXME: we should add support to be able to specify viewing parameters
in the program here directly. */
static void
loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
float camera2world[4][4]) {
FILE *f = fopen(fn, "r");
if (!f) {
perror(fn);
exit(1);
}
if (fscanf(f, "%d %d", width, height) != 2) {
fprintf(stderr, "Unexpected end of file in camera file\n");
exit(1);
}
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
if (fscanf(f, "%f", &raster2camera[i][j]) != 1) {
fprintf(stderr, "Unexpected end of file in camera file\n");
exit(1);
}
}
}
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
if (fscanf(f, "%f", &camera2world[i][j]) != 1) {
fprintf(stderr, "Unexpected end of file in camera file\n");
exit(1);
}
}
}
fclose(f);
}
/* Load a volume density file. Expects the number of x, y, and z samples
as the first three values (as integer strings), then x*y*z
floating-point values (also as strings) to give the densities. */
static float *
loadVolume(const char *fn, int n[3]) {
FILE *f = fopen(fn, "r");
if (!f) {
perror(fn);
exit(1);
}
if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
fprintf(stderr, "Couldn't find resolution at start of density file\n");
exit(1);
}
int count = n[0] * n[1] * n[2];
float *v = new float[count];
for (int i = 0; i < count; ++i) {
if (fscanf(f, "%f", &v[i]) != 1) {
fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
exit(1);
}
}
return v;
}
int main(int argc, char *argv[]) {
if (argc != 3) {
fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol>\n");
return 1;
}
//
// Load viewing data and the volume density data
//
int width, height;
float raster2camera[4][4], camera2world[4][4];
loadCamera(argv[1], &width, &height, raster2camera, camera2world);
float *image = new float[width*height];
int n[3];
float *density = loadVolume(argv[2], n);
/*******************/
createContext();
/*******************/
devicePtr d_raster2camera = deviceMalloc(4*4*sizeof(float));
devicePtr d_camera2world = deviceMalloc(4*4*sizeof(float));
devicePtr d_n = deviceMalloc(3*sizeof(int));
devicePtr d_density = deviceMalloc(n[0]*n[1]*n[2]*sizeof(float));
devicePtr d_image = deviceMalloc(width*height*sizeof(float));
//
// Compute the image using the ispc implementation; report the minimum
// time of three runs.
//
double minISPC = 1e30;
#if 0
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
volume_ispc(density, n, raster2camera, camera2world,
width, height, image);
double dt = get_elapsed_mcycles();
minISPC = std::min(minISPC, dt);
}
for (int i = 0; i < width*height; i += 4)
{
if (image[i] != 0.0f)
{
fprintf(stderr, " i= %d image= %g %g %g %g \n", i,
image[i+0],
image[i+1],
image[i+2],
image[i+3]);
break;
}
}
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
writePPM(image, width, height, "volume-ispc-1core.ppm");
#endif
// Clear out the buffer
for (int i = 0; i < width * height; ++i)
image[i] = 0.;
memcpyH2D(d_raster2camera, raster2camera, 4*4*sizeof(float));
memcpyH2D(d_camera2world, camera2world, 4*4*sizeof(float));
memcpyH2D(d_n, n, 3*sizeof(int));
memcpyH2D(d_density, density, n[0]*n[1]*n[2]*sizeof(float));
memcpyH2D(d_image, image, width*height*sizeof(float));
//
// Compute the image using the ispc implementation that also uses
// tasks; report the minimum time of three runs.
//
double minISPCtasks = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
volume_ispc_tasks(
(float*)d_density,
(int*)d_n,
(float(*)[4])d_raster2camera,
(float(*)[4])d_camera2world,
width, height,
(float*)d_image);
double dt = get_elapsed_mcycles();
minISPCtasks = std::min(minISPCtasks, dt);
}
memcpyD2H(image, d_image, width*height*sizeof(float));
for (int i = 0; i < width*height; i += 4)
{
if (image[i] != 0.0f)
{
fprintf(stderr, " i= %d image= %g %g %g %g \n", i,
image[i+0],
image[i+1],
image[i+2],
image[i+3]);
break;
}
}
printf("[volume ispc + tasks]:\t\t[%.3f] million cycles\n", minISPCtasks);
writePPM(image, width, height, "volume-cuda.ppm");
#if 0
// Clear out the buffer
for (int i = 0; i < width * height; ++i)
image[i] = 0.;
//
// And run the serial implementation 3 times, again reporting the
// minimum time.
//
double minSerial = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
volume_serial(density, n, raster2camera, camera2world,
width, height, image);
double dt = get_elapsed_mcycles();
minSerial = std::min(minSerial, dt);
}
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(image, width, height, "volume-serial.ppm");
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minSerial/minISPC, minSerial / minISPCtasks);
#else
printf("\t\t\t\t %.2fx speedup from ISPC + tasks)\n",
minISPC / minISPCtasks);
#endif
/*******************/
destroyContext();
/*******************/
return 0;
}

View File

@@ -665,7 +665,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
this->m_hasHalf = false;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
this->m_hasTranscendentals = true;
this->m_hasTranscendentals = false; //true;
this->m_hasGather = this->m_hasScatter = false;
}
else {