added volume rendering to run on GPU
This commit is contained in:
@@ -175,8 +175,28 @@ declare double @__round_uniform_double(double) nounwind readnone
|
||||
declare double @__floor_uniform_double(double) nounwind readnone
|
||||
declare double @__ceil_uniform_double(double) nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
|
||||
define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
|
||||
%bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
|
||||
%bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
|
||||
%binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
|
||||
%binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
|
||||
%float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
|
||||
%bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
|
||||
ret <1 x float> %int_to_float_bitcast.i.i.i
|
||||
}
|
||||
define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
|
||||
%bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
|
||||
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
|
||||
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <1 x float> %binop.i
|
||||
}
|
||||
|
||||
declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
|
||||
|
||||
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
|
||||
@@ -246,10 +266,40 @@ define double @__min_uniform_double(double, double) nounwind readonly alwaysinl
|
||||
|
||||
;; min/max uniform
|
||||
|
||||
declare <WIDTH x float> @__max_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__min_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
;; /* float */
|
||||
define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
|
||||
%a = extractelement <1 x float> %0, i32 0
|
||||
%b = extractelement <1 x float> %1, i32 0
|
||||
%r = call float @__max_uniform_float(float %a, float %b)
|
||||
%rv = insertelement <1 x float> undef, float %r, i32 0
|
||||
ret <1 x float> %rv
|
||||
}
|
||||
define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
|
||||
%a = extractelement <1 x float> %0, i32 0
|
||||
%b = extractelement <1 x float> %1, i32 0
|
||||
%r = call float @__min_uniform_float(float %a, float %b)
|
||||
%rv = insertelement <1 x float> undef, float %r, i32 0
|
||||
ret <1 x float> %rv
|
||||
|
||||
}
|
||||
|
||||
;; /* int32 */
|
||||
define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
|
||||
%a = extractelement <1 x i32> %0, i32 0
|
||||
%b = extractelement <1 x i32> %1, i32 0
|
||||
%r = call i32 @__max_uniform_int32(i32 %a, i32 %b)
|
||||
%rv = insertelement <1 x i32> undef, i32 %r, i32 0
|
||||
ret <1 x i32> %rv
|
||||
}
|
||||
define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
|
||||
%a = extractelement <1 x i32> %0, i32 0
|
||||
%b = extractelement <1 x i32> %1, i32 0
|
||||
%r = call i32 @__min_uniform_int32(i32 %a, i32 %b)
|
||||
%rv = insertelement <1 x i32> undef, i32 %r, i32 0
|
||||
ret <1 x i32> %rv
|
||||
}
|
||||
|
||||
;; /* uint32 */
|
||||
declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
;; declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
|
||||
@@ -289,7 +339,13 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline
|
||||
|
||||
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
|
||||
define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline
|
||||
{
|
||||
%v = extractelement <1 x float> %0, i32 0
|
||||
%r = call float @__sqrt_uniform_float(float %v)
|
||||
%rv = insertelement <1 x float> undef, float %r, i32 0
|
||||
ret <WIDTH x float> %rv
|
||||
}
|
||||
|
||||
;; declare double @__sqrt_uniform_double(double) nounwind readnone
|
||||
define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
|
||||
|
||||
Binary file not shown.
@@ -2,7 +2,6 @@
|
||||
EXAMPLE=volume
|
||||
CPP_SRC=volume.cpp volume_serial.cpp
|
||||
ISPC_SRC=volume.ispc
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx
|
||||
ISPC_ARM_TARGETS=neon
|
||||
ISPC_IA_TARGETS=avx
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -31,6 +31,17 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __NVPTX__
|
||||
#warning "emitting DEVICE code"
|
||||
#define taskIndex blockIndex0()
|
||||
#define taskCount blockCount0()
|
||||
#define programIndex laneIndex()
|
||||
#define programCount warpSize()
|
||||
#else
|
||||
#warning "emitting HOST code"
|
||||
#endif
|
||||
|
||||
|
||||
typedef float<3> float3;
|
||||
|
||||
struct Ray {
|
||||
@@ -236,7 +247,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
|
||||
// terminate once attenuation is high
|
||||
float atten = exp(-tau);
|
||||
if (atten < .005)
|
||||
cbreak;
|
||||
break;
|
||||
|
||||
// direct lighting
|
||||
float Li = lightIntensity / distanceSquared(lightPos, pos) *
|
||||
|
||||
410
examples_cuda/volume_rendering/volume1.ispc
Normal file
410
examples_cuda/volume_rendering/volume1.ispc
Normal file
@@ -0,0 +1,410 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __NVPTX__
|
||||
#warning "emitting DEVICE code"
|
||||
#define taskIndex blockIndex0()
|
||||
#define taskCount blockCount0()
|
||||
#define programIndex laneIndex()
|
||||
#define programCount warpSize()
|
||||
#else
|
||||
#warning "emitting HOST code"
|
||||
#endif
|
||||
|
||||
|
||||
typedef float<3> float3;
|
||||
|
||||
struct Ray {
|
||||
float3 origin, dir;
|
||||
};
|
||||
|
||||
|
||||
static inline void
|
||||
generateRay(const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
float x, float y, Ray &ray) {
|
||||
// transform raster coordinate (x, y, 0) to camera space
|
||||
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
|
||||
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
|
||||
float camz = raster2camera[2][3];
|
||||
float camw = raster2camera[3][3];
|
||||
camx /= camw;
|
||||
camy /= camw;
|
||||
camz /= camw;
|
||||
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
|
||||
|
||||
ray.origin.x = camera2world[0][3] / camera2world[3][3];
|
||||
ray.origin.y = camera2world[1][3] / camera2world[3][3];
|
||||
ray.origin.z = camera2world[2][3] / camera2world[3][3];
|
||||
}
|
||||
|
||||
|
||||
static inline bool
|
||||
Inside(float3 p, float3 pMin, float3 pMax) {
|
||||
return (p.x >= pMin.x && p.x <= pMax.x &&
|
||||
p.y >= pMin.y && p.y <= pMax.y &&
|
||||
p.z >= pMin.z && p.z <= pMax.z);
|
||||
}
|
||||
|
||||
|
||||
static inline bool
|
||||
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
|
||||
float t0 = -1e30, t1 = 1e30;
|
||||
|
||||
float3 tNear = (pMin - ray.origin) / ray.dir;
|
||||
float3 tFar = (pMax - ray.origin) / ray.dir;
|
||||
if (tNear.x > tFar.x) {
|
||||
float tmp = tNear.x;
|
||||
tNear.x = tFar.x;
|
||||
tFar.x = tmp;
|
||||
}
|
||||
t0 = max(tNear.x, t0);
|
||||
t1 = min(tFar.x, t1);
|
||||
|
||||
if (tNear.y > tFar.y) {
|
||||
float tmp = tNear.y;
|
||||
tNear.y = tFar.y;
|
||||
tFar.y = tmp;
|
||||
}
|
||||
t0 = max(tNear.y, t0);
|
||||
t1 = min(tFar.y, t1);
|
||||
|
||||
if (tNear.z > tFar.z) {
|
||||
float tmp = tNear.z;
|
||||
tNear.z = tFar.z;
|
||||
tFar.z = tmp;
|
||||
}
|
||||
t0 = max(tNear.z, t0);
|
||||
t1 = min(tFar.z, t1);
|
||||
|
||||
if (t0 <= t1) {
|
||||
hit0 = t0;
|
||||
hit1 = t1;
|
||||
return true;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
static inline float Lerp(float t, float a, float b) {
|
||||
return (1.f - t) * a + t * b;
|
||||
}
|
||||
|
||||
|
||||
static inline float D(int x, int y, int z, uniform int nVoxels[3],
|
||||
uniform float density[]) {
|
||||
x = clamp(x, 0, nVoxels[0]-1);
|
||||
y = clamp(y, 0, nVoxels[1]-1);
|
||||
z = clamp(z, 0, nVoxels[2]-1);
|
||||
|
||||
return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
|
||||
}
|
||||
|
||||
|
||||
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
|
||||
return (p - pMin) / (pMax - pMin);
|
||||
}
|
||||
|
||||
|
||||
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
uniform float density[], uniform int nVoxels[3]) {
|
||||
if (!Inside(Pobj, pMin, pMax))
|
||||
return 0;
|
||||
// Compute voxel coordinates and offsets for _Pobj_
|
||||
float3 vox = Offset(Pobj, pMin, pMax);
|
||||
vox.x = vox.x * nVoxels[0] - .5f;
|
||||
vox.y = vox.y * nVoxels[1] - .5f;
|
||||
vox.z = vox.z * nVoxels[2] - .5f;
|
||||
int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
|
||||
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
|
||||
|
||||
// Trilinearly interpolate density values to compute local density
|
||||
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
|
||||
D(vx+1, vy, vz, nVoxels, density));
|
||||
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
|
||||
D(vx+1, vy+1, vz, nVoxels, density));
|
||||
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
|
||||
D(vx+1, vy, vz+1, nVoxels, density));
|
||||
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
|
||||
D(vx+1, vy+1, vz+1, nVoxels, density));
|
||||
float d0 = Lerp(dy, d00, d10);
|
||||
float d1 = Lerp(dy, d01, d11);
|
||||
return Lerp(dz, d0, d1);
|
||||
}
|
||||
|
||||
|
||||
/* Returns the transmittance between two points p0 and p1, in a volume
|
||||
with extent (pMin,pMax) with transmittance coefficient sigma_t,
|
||||
defined by nVoxels[3] voxels in each dimension in the given density
|
||||
array. */
|
||||
static inline float
|
||||
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
|
||||
uniform float3 pMax, uniform float sigma_t,
|
||||
uniform float density[], uniform int nVoxels[3]) {
|
||||
float rayT0, rayT1;
|
||||
Ray ray;
|
||||
ray.origin = p1;
|
||||
ray.dir = p0 - p1;
|
||||
|
||||
// Find the parametric t range along the ray that is inside the volume.
|
||||
if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
|
||||
return 1.;
|
||||
|
||||
rayT0 = max(rayT0, 0.f);
|
||||
|
||||
// Accumulate beam transmittance in tau
|
||||
float tau = 0;
|
||||
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
|
||||
ray.dir.z * ray.dir.z);
|
||||
uniform float stepDist = 0.2;
|
||||
float stepT = stepDist / rayLength;
|
||||
|
||||
float t = rayT0;
|
||||
float3 pos = ray.origin + ray.dir * rayT0;
|
||||
float3 dirStep = ray.dir * stepT;
|
||||
while (t < rayT1) {
|
||||
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
|
||||
pos = pos + dirStep;
|
||||
t += stepT;
|
||||
}
|
||||
|
||||
return exp(-tau);
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
distanceSquared(float3 a, float3 b) {
|
||||
float3 d = a-b;
|
||||
return d.x*d.x + d.y*d.y + d.z*d.z;
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
|
||||
float rayT0, rayT1;
|
||||
uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
|
||||
uniform float3 lightPos = { -1, 4, 1.5 };
|
||||
|
||||
if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
|
||||
return 0.;
|
||||
|
||||
rayT0 = max(rayT0, 0.f);
|
||||
|
||||
// Parameters that define the volume scattering characteristics and
|
||||
// sampling rate for raymarching
|
||||
uniform float Le = .25; // Emission coefficient
|
||||
uniform float sigma_a = 10; // Absorption coefficient
|
||||
uniform float sigma_s = 10; // Scattering coefficient
|
||||
uniform float stepDist = 0.025; // Ray step amount
|
||||
uniform float lightIntensity = 40; // Light source intensity
|
||||
|
||||
float tau = 0.f; // accumulated beam transmittance
|
||||
float L = 0; // radiance along the ray
|
||||
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
|
||||
ray.dir.z * ray.dir.z);
|
||||
float stepT = stepDist / rayLength;
|
||||
|
||||
float t = rayT0;
|
||||
float3 pos = ray.origin + ray.dir * rayT0;
|
||||
float3 dirStep = ray.dir * stepT;
|
||||
while (t < rayT1)
|
||||
{
|
||||
float d = Density(pos, pMin, pMax, density, nVoxels);
|
||||
|
||||
// terminate once attenuation is high
|
||||
float atten = exp(-tau);
|
||||
if (atten < .005)
|
||||
break;
|
||||
|
||||
// direct lighting
|
||||
float Li = lightIntensity / distanceSquared(lightPos, pos) *
|
||||
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
|
||||
density, nVoxels);
|
||||
L += stepDist * atten * d * sigma_s * (Li + Le);
|
||||
|
||||
// update beam transmittance
|
||||
tau += stepDist * (sigma_a + sigma_s) * d;
|
||||
|
||||
pos = pos + dirStep;
|
||||
t += stepT;
|
||||
}
|
||||
|
||||
// Gamma correction
|
||||
return pow(L, 1.f / 2.2f);
|
||||
}
|
||||
|
||||
|
||||
/* Utility routine used by both the task-based and the single-core entrypoints.
|
||||
Renders a tile of the image, covering [x0,x0) * [y0, y1), storing the
|
||||
result into the image[] array.
|
||||
*/
|
||||
static inline void
|
||||
volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
uniform int y1, uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
// Work on 4x4=16 pixel big tiles of the image. This function thus
|
||||
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
|
||||
// by 4.
|
||||
for (uniform int y = y0; y < y1; y += 4) {
|
||||
for (uniform int x = x0; x < x1; x += 4) {
|
||||
// foreach (o = 0 ... 16) {
|
||||
for (int ob = 0; ob < 32; ob += programCount)
|
||||
{
|
||||
const int o = ob + programIndex;
|
||||
if (o >= 16) continue;
|
||||
|
||||
|
||||
// These two arrays encode the mapping from [0,15] to
|
||||
// offsets within the 4x4 pixel block so that we render
|
||||
// each pixel inside the block
|
||||
const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
||||
0, 1, 0, 1, 2, 3, 2, 3 };
|
||||
const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||
|
||||
// Figure out the pixel to render for this program instance
|
||||
int xo = x + xoffsets[o], yo = y + yoffsets[o];
|
||||
|
||||
// Use viewing parameters to compute the corresponding ray
|
||||
// for the pixel
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, xo, yo, ray);
|
||||
|
||||
// And raymarch through the volume to compute the pixel's
|
||||
// value
|
||||
int offset = yo * width + xo;
|
||||
image[offset] = raymarch(density, nVoxels, ray);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
task void
|
||||
volume_task(uniform float density[], uniform int _nVoxels[3],
|
||||
const uniform float _raster2camera[4][4],
|
||||
const uniform float _camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[])
|
||||
{
|
||||
if (taskIndex >= taskCount) return;
|
||||
|
||||
#if 1
|
||||
uniform int nVoxels[3];
|
||||
nVoxels[0] = _nVoxels[0];
|
||||
nVoxels[1] = _nVoxels[1];
|
||||
nVoxels[2] = _nVoxels[2];
|
||||
|
||||
uniform float raster2camera[4][4];
|
||||
raster2camera[0][0] = _raster2camera[0][0];
|
||||
raster2camera[0][1] = _raster2camera[0][1];
|
||||
raster2camera[0][2] = _raster2camera[0][2];
|
||||
raster2camera[0][3] = _raster2camera[0][3];
|
||||
raster2camera[1][0] = _raster2camera[1][0];
|
||||
raster2camera[1][1] = _raster2camera[1][1];
|
||||
raster2camera[1][2] = _raster2camera[1][2];
|
||||
raster2camera[1][3] = _raster2camera[1][3];
|
||||
raster2camera[2][0] = _raster2camera[2][0];
|
||||
raster2camera[2][1] = _raster2camera[2][1];
|
||||
raster2camera[2][2] = _raster2camera[2][2];
|
||||
raster2camera[2][3] = _raster2camera[2][3];
|
||||
raster2camera[3][0] = _raster2camera[3][0];
|
||||
raster2camera[3][1] = _raster2camera[3][1];
|
||||
raster2camera[3][2] = _raster2camera[3][2];
|
||||
raster2camera[3][3] = _raster2camera[3][3];
|
||||
|
||||
uniform float camera2world[4][4];
|
||||
camera2world[0][0] = _camera2world[0][0];
|
||||
camera2world[0][1] = _camera2world[0][1];
|
||||
camera2world[0][2] = _camera2world[0][2];
|
||||
camera2world[0][3] = _camera2world[0][3];
|
||||
camera2world[1][0] = _camera2world[1][0];
|
||||
camera2world[1][1] = _camera2world[1][1];
|
||||
camera2world[1][2] = _camera2world[1][2];
|
||||
camera2world[1][3] = _camera2world[1][3];
|
||||
camera2world[2][0] = _camera2world[2][0];
|
||||
camera2world[2][1] = _camera2world[2][1];
|
||||
camera2world[2][2] = _camera2world[2][2];
|
||||
camera2world[2][3] = _camera2world[2][3];
|
||||
camera2world[3][0] = _camera2world[3][0];
|
||||
camera2world[3][1] = _camera2world[3][1];
|
||||
camera2world[3][2] = _camera2world[3][2];
|
||||
camera2world[3][3] = _camera2world[3][3];
|
||||
#else
|
||||
#define nVoxels _nVoxels
|
||||
#define raster2camera _raster2camera
|
||||
#define camera2world _camera2world
|
||||
#endif
|
||||
|
||||
uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks
|
||||
uniform int xbuckets = (width + (dx-1)) / dx;
|
||||
uniform int ybuckets = (height + (dy-1)) / dy;
|
||||
|
||||
uniform int x0 = (taskIndex % xbuckets) * dx;
|
||||
uniform int y0 = (taskIndex / xbuckets) * dy;
|
||||
uniform int x1 = x0 + dx, y1 = y0 + dy;
|
||||
x1 = min(x1, width);
|
||||
y1 = min(y1, height);
|
||||
|
||||
volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
|
||||
camera2world, width, height, image);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
volume_ispc(uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
|
||||
camera2world, width, height, image);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
// Launch tasks to work on (dx,dy)-sized tiles of the image
|
||||
uniform int dx = 8, dy = 8;
|
||||
uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
|
||||
print("nTasks= %\n", nTasks);
|
||||
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
}
|
||||
478
examples_cuda/volume_rendering/volume_cu.cpp
Normal file
478
examples_cuda/volume_rendering/volume_cu.cpp
Normal file
@@ -0,0 +1,478 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "volume_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <cuda.h>
|
||||
#include "drvapi_error_string.h"
|
||||
|
||||
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
|
||||
// These are the inline versions for all of the SDK helper functions
|
||||
void __checkCudaErrors(CUresult err, const char *file, const int line) {
|
||||
if(CUDA_SUCCESS != err) {
|
||||
std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
|
||||
<< getCudaDrvErrorString(err) << "\" from file <" << file
|
||||
<< ", line " << line << "\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
/**********************/
|
||||
/* Basic CUDriver API */
|
||||
CUcontext context;
|
||||
|
||||
void createContext(const int deviceId = 0)
|
||||
{
|
||||
CUdevice device;
|
||||
int devCount;
|
||||
checkCudaErrors(cuInit(0));
|
||||
checkCudaErrors(cuDeviceGetCount(&devCount));
|
||||
assert(devCount > 0);
|
||||
checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
|
||||
|
||||
char name[128];
|
||||
checkCudaErrors(cuDeviceGetName(name, 128, device));
|
||||
std::cout << "Using CUDA Device [0]: " << name << "\n";
|
||||
|
||||
int devMajor, devMinor;
|
||||
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
|
||||
std::cout << "Device Compute Capability: "
|
||||
<< devMajor << "." << devMinor << "\n";
|
||||
if (devMajor < 2) {
|
||||
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Create driver context
|
||||
checkCudaErrors(cuCtxCreate(&context, 0, device));
|
||||
}
|
||||
void destroyContext()
|
||||
{
|
||||
checkCudaErrors(cuCtxDestroy(context));
|
||||
}
|
||||
|
||||
CUmodule loadModule(const char * module)
|
||||
{
|
||||
CUmodule cudaModule;
|
||||
// in this branch we use compilation with parameters
|
||||
const unsigned int jitNumOptions = 1;
|
||||
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
|
||||
void **jitOptVals = new void*[jitNumOptions];
|
||||
// set up pointer to set the Maximum # of registers for a particular kernel
|
||||
jitOptions[0] = CU_JIT_MAX_REGISTERS;
|
||||
int jitRegCount = 64;
|
||||
jitOptVals[0] = (void *)(size_t)jitRegCount;
|
||||
|
||||
#if 0
|
||||
// set up size of compilation log buffer
|
||||
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
||||
int jitLogBufferSize = 1024;
|
||||
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
|
||||
|
||||
// set up pointer to the compilation log buffer
|
||||
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
|
||||
char *jitLogBuffer = new char[jitLogBufferSize];
|
||||
jitOptVals[1] = jitLogBuffer;
|
||||
|
||||
// set up pointer to set the Maximum # of registers for a particular kernel
|
||||
jitOptions[2] = CU_JIT_MAX_REGISTERS;
|
||||
int jitRegCount = 32;
|
||||
jitOptVals[2] = (void *)(size_t)jitRegCount;
|
||||
#endif
|
||||
|
||||
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals));
|
||||
return cudaModule;
|
||||
}
|
||||
void unloadModule(CUmodule &cudaModule)
|
||||
{
|
||||
checkCudaErrors(cuModuleUnload(cudaModule));
|
||||
}
|
||||
|
||||
CUfunction getFunction(CUmodule &cudaModule, const char * function)
|
||||
{
|
||||
CUfunction cudaFunction;
|
||||
checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
|
||||
return cudaFunction;
|
||||
}
|
||||
|
||||
CUdeviceptr deviceMalloc(const size_t size)
|
||||
{
|
||||
CUdeviceptr d_buf;
|
||||
checkCudaErrors(cuMemAlloc(&d_buf, size));
|
||||
return d_buf;
|
||||
}
|
||||
void deviceFree(CUdeviceptr d_buf)
|
||||
{
|
||||
checkCudaErrors(cuMemFree(d_buf));
|
||||
}
|
||||
void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
|
||||
{
|
||||
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
|
||||
}
|
||||
void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
|
||||
{
|
||||
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
|
||||
}
|
||||
#define deviceLaunch(func,nbx,nby,nbz,params) \
|
||||
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
|
||||
checkCudaErrors( \
|
||||
cuLaunchKernel( \
|
||||
(func), \
|
||||
((nbx-1)/(128/32)+1), (nby), (nbz), \
|
||||
128, 1, 1, \
|
||||
0, NULL, (params), NULL \
|
||||
));
|
||||
|
||||
typedef CUdeviceptr devicePtr;
|
||||
|
||||
|
||||
/**************/
|
||||
#include <vector>
|
||||
std::vector<char> readBinary(const char * filename)
|
||||
{
|
||||
std::vector<char> buffer;
|
||||
FILE *fp = fopen(filename, "rb");
|
||||
if (!fp )
|
||||
{
|
||||
fprintf(stderr, "file %s not found\n", filename);
|
||||
assert(0);
|
||||
}
|
||||
#if 0
|
||||
char c;
|
||||
while ((c = fgetc(fp)) != EOF)
|
||||
buffer.push_back(c);
|
||||
#else
|
||||
fseek(fp, 0, SEEK_END);
|
||||
const unsigned long long size = ftell(fp); /*calc the size needed*/
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
buffer.resize(size);
|
||||
|
||||
if (fp == NULL){ /*ERROR detection if file == empty*/
|
||||
fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
|
||||
exit(1);
|
||||
}
|
||||
else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
|
||||
fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
|
||||
return buffer;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
{
|
||||
|
||||
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
void CUDALaunch(
|
||||
void **handlePtr,
|
||||
const char * module_name,
|
||||
const char * module_1,
|
||||
const char * func_name,
|
||||
void **func_args,
|
||||
int countx, int county, int countz)
|
||||
{
|
||||
assert(module_name != NULL);
|
||||
assert(module_1 != NULL);
|
||||
assert(func_name != NULL);
|
||||
assert(func_args != NULL);
|
||||
#if 1
|
||||
const char * module = module_1;
|
||||
#else
|
||||
const std::vector<char> module_str = readBinary("kernel.cubin");
|
||||
const char * module = &module_str[0];
|
||||
#endif
|
||||
CUmodule cudaModule = loadModule(module);
|
||||
CUfunction cudaFunction = getFunction(cudaModule, func_name);
|
||||
deviceLaunch(cudaFunction, countx, county, countz, func_args);
|
||||
unloadModule(cudaModule);
|
||||
}
|
||||
void CUDASync(void *handle)
|
||||
{
|
||||
checkCudaErrors(cuStreamSynchronize(0));
|
||||
}
|
||||
void ISPCSync(void *handle)
|
||||
{
|
||||
checkCudaErrors(cuStreamSynchronize(0));
|
||||
}
|
||||
void CUDAFree(void *handle)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
extern void volume_serial(float density[], int nVoxels[3],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
int width, int height, float image[]);
|
||||
|
||||
/* Write a PPM image file with the image */
|
||||
static void
|
||||
writePPM(float *buf, int width, int height, const char *fn) {
|
||||
FILE *fp = fopen(fn, "wb");
|
||||
fprintf(fp, "P6\n");
|
||||
fprintf(fp, "%d %d\n", width, height);
|
||||
fprintf(fp, "255\n");
|
||||
for (int i = 0; i < width*height; ++i) {
|
||||
float v = buf[i] * 255.f;
|
||||
if (v < 0.f) v = 0.f;
|
||||
else if (v > 255.f) v = 255.f;
|
||||
unsigned char c = (unsigned char)v;
|
||||
for (int j = 0; j < 3; ++j)
|
||||
fputc(c, fp);
|
||||
}
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fn);
|
||||
}
|
||||
|
||||
|
||||
/* Load image and viewing parameters from a camera data file.
|
||||
FIXME: we should add support to be able to specify viewing parameters
|
||||
in the program here directly. */
|
||||
static void
|
||||
loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
|
||||
float camera2world[4][4]) {
|
||||
FILE *f = fopen(fn, "r");
|
||||
if (!f) {
|
||||
perror(fn);
|
||||
exit(1);
|
||||
}
|
||||
if (fscanf(f, "%d %d", width, height) != 2) {
|
||||
fprintf(stderr, "Unexpected end of file in camera file\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
if (fscanf(f, "%f", &raster2camera[i][j]) != 1) {
|
||||
fprintf(stderr, "Unexpected end of file in camera file\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
if (fscanf(f, "%f", &camera2world[i][j]) != 1) {
|
||||
fprintf(stderr, "Unexpected end of file in camera file\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
|
||||
/* Load a volume density file. Expects the number of x, y, and z samples
|
||||
as the first three values (as integer strings), then x*y*z
|
||||
floating-point values (also as strings) to give the densities. */
|
||||
static float *
|
||||
loadVolume(const char *fn, int n[3]) {
|
||||
FILE *f = fopen(fn, "r");
|
||||
if (!f) {
|
||||
perror(fn);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
|
||||
fprintf(stderr, "Couldn't find resolution at start of density file\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int count = n[0] * n[1] * n[2];
|
||||
float *v = new float[count];
|
||||
for (int i = 0; i < count; ++i) {
|
||||
if (fscanf(f, "%f", &v[i]) != 1) {
|
||||
fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 3) {
|
||||
fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
//
|
||||
// Load viewing data and the volume density data
|
||||
//
|
||||
int width, height;
|
||||
float raster2camera[4][4], camera2world[4][4];
|
||||
loadCamera(argv[1], &width, &height, raster2camera, camera2world);
|
||||
float *image = new float[width*height];
|
||||
|
||||
int n[3];
|
||||
float *density = loadVolume(argv[2], n);
|
||||
|
||||
/*******************/
|
||||
createContext();
|
||||
/*******************/
|
||||
|
||||
devicePtr d_raster2camera = deviceMalloc(4*4*sizeof(float));
|
||||
devicePtr d_camera2world = deviceMalloc(4*4*sizeof(float));
|
||||
devicePtr d_n = deviceMalloc(3*sizeof(int));
|
||||
devicePtr d_density = deviceMalloc(n[0]*n[1]*n[2]*sizeof(float));
|
||||
devicePtr d_image = deviceMalloc(width*height*sizeof(float));
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
#if 0
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_ispc(density, n, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minISPC = std::min(minISPC, dt);
|
||||
}
|
||||
|
||||
for (int i = 0; i < width*height; i += 4)
|
||||
{
|
||||
if (image[i] != 0.0f)
|
||||
{
|
||||
fprintf(stderr, " i= %d image= %g %g %g %g \n", i,
|
||||
image[i+0],
|
||||
image[i+1],
|
||||
image[i+2],
|
||||
image[i+3]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(image, width, height, "volume-ispc-1core.ppm");
|
||||
#endif
|
||||
|
||||
|
||||
// Clear out the buffer
|
||||
for (int i = 0; i < width * height; ++i)
|
||||
image[i] = 0.;
|
||||
|
||||
memcpyH2D(d_raster2camera, raster2camera, 4*4*sizeof(float));
|
||||
memcpyH2D(d_camera2world, camera2world, 4*4*sizeof(float));
|
||||
memcpyH2D(d_n, n, 3*sizeof(int));
|
||||
memcpyH2D(d_density, density, n[0]*n[1]*n[2]*sizeof(float));
|
||||
memcpyH2D(d_image, image, width*height*sizeof(float));
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation that also uses
|
||||
// tasks; report the minimum time of three runs.
|
||||
//
|
||||
double minISPCtasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_ispc_tasks(
|
||||
(float*)d_density,
|
||||
(int*)d_n,
|
||||
(float(*)[4])d_raster2camera,
|
||||
(float(*)[4])d_camera2world,
|
||||
width, height,
|
||||
(float*)d_image);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minISPCtasks = std::min(minISPCtasks, dt);
|
||||
}
|
||||
|
||||
memcpyD2H(image, d_image, width*height*sizeof(float));
|
||||
for (int i = 0; i < width*height; i += 4)
|
||||
{
|
||||
if (image[i] != 0.0f)
|
||||
{
|
||||
fprintf(stderr, " i= %d image= %g %g %g %g \n", i,
|
||||
image[i+0],
|
||||
image[i+1],
|
||||
image[i+2],
|
||||
image[i+3]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
printf("[volume ispc + tasks]:\t\t[%.3f] million cycles\n", minISPCtasks);
|
||||
writePPM(image, width, height, "volume-cuda.ppm");
|
||||
|
||||
#if 0
|
||||
// Clear out the buffer
|
||||
for (int i = 0; i < width * height; ++i)
|
||||
image[i] = 0.;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_serial(density, n, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minSerial = std::min(minSerial, dt);
|
||||
}
|
||||
|
||||
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
|
||||
writePPM(image, width, height, "volume-serial.ppm");
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
minSerial/minISPC, minSerial / minISPCtasks);
|
||||
#else
|
||||
printf("\t\t\t\t %.2fx speedup from ISPC + tasks)\n",
|
||||
minISPC / minISPCtasks);
|
||||
#endif
|
||||
|
||||
/*******************/
|
||||
destroyContext();
|
||||
/*******************/
|
||||
|
||||
return 0;
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
2
ispc.cpp
2
ispc.cpp
@@ -665,7 +665,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
|
||||
this->m_hasHalf = false;
|
||||
this->m_maskingIsFree = true;
|
||||
this->m_maskBitCount = 1;
|
||||
this->m_hasTranscendentals = true;
|
||||
this->m_hasTranscendentals = false; //true;
|
||||
this->m_hasGather = this->m_hasScatter = false;
|
||||
}
|
||||
else {
|
||||
|
||||
Reference in New Issue
Block a user