foreach[_tiled] seems to work now
This commit is contained in:
@@ -228,57 +228,54 @@ static inline void ao_tile(
|
|||||||
|
|
||||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 31)));
|
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 31)));
|
||||||
float invSamples = 1.f / nsubsamples;
|
float invSamples = 1.f / nsubsamples;
|
||||||
for (uniform int y = y0; y < y1; y++)
|
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
|
||||||
for (uniform int xb = x0; xb < x1; xb += programCount)
|
{
|
||||||
{
|
const int offset = 3 * (y * w + x);
|
||||||
const int x = xb + programIndex;
|
float res = 0.0f;
|
||||||
const int offset = 3 * (y * w + x);
|
|
||||||
float res = 0.0f;
|
|
||||||
|
|
||||||
for (uniform int u = 0; u < nsubsamples; u++)
|
for (uniform int u = 0; u < nsubsamples; u++)
|
||||||
for (uniform int v = 0; v < nsubsamples; v++)
|
for (uniform int v = 0; v < nsubsamples; v++)
|
||||||
{
|
|
||||||
float du = (float)u * invSamples, dv = (float)v * invSamples;
|
|
||||||
|
|
||||||
// Figure out x,y pixel in NDC
|
|
||||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
|
||||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
|
||||||
float ret = 0.f;
|
|
||||||
Ray ray;
|
|
||||||
Isect isect;
|
|
||||||
|
|
||||||
ray.org = 0.f;
|
|
||||||
|
|
||||||
// Poor man's perspective projection
|
|
||||||
ray.dir.x = px;
|
|
||||||
ray.dir.y = py;
|
|
||||||
ray.dir.z = -1.0;
|
|
||||||
vnormalize(ray.dir);
|
|
||||||
|
|
||||||
isect.t = 1.0e+17;
|
|
||||||
isect.hit = 0;
|
|
||||||
|
|
||||||
for (uniform int snum = 0; snum < 3; ++snum)
|
|
||||||
ray_sphere_intersect(isect, ray, spheres[snum]);
|
|
||||||
ray_plane_intersect(isect, ray, plane);
|
|
||||||
|
|
||||||
// Note use of 'coherent' if statement; the set of rays we
|
|
||||||
// trace will often all hit or all miss the scene
|
|
||||||
if (isect.hit) {
|
|
||||||
ret = ambient_occlusion(isect, plane, spheres, rngstate);
|
|
||||||
ret *= invSamples * invSamples;
|
|
||||||
res += ret;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (xb < x1)
|
|
||||||
{
|
{
|
||||||
image[offset ] = res;
|
float du = (float)u * invSamples, dv = (float)v * invSamples;
|
||||||
image[offset+1] = res;
|
|
||||||
image[offset+2] = res;
|
// Figure out x,y pixel in NDC
|
||||||
|
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||||
|
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||||
|
float ret = 0.f;
|
||||||
|
Ray ray;
|
||||||
|
Isect isect;
|
||||||
|
|
||||||
|
ray.org = 0.f;
|
||||||
|
|
||||||
|
// Poor man's perspective projection
|
||||||
|
ray.dir.x = px;
|
||||||
|
ray.dir.y = py;
|
||||||
|
ray.dir.z = -1.0;
|
||||||
|
vnormalize(ray.dir);
|
||||||
|
|
||||||
|
isect.t = 1.0e+17;
|
||||||
|
isect.hit = 0;
|
||||||
|
|
||||||
|
for (uniform int snum = 0; snum < 3; ++snum)
|
||||||
|
ray_sphere_intersect(isect, ray, spheres[snum]);
|
||||||
|
ray_plane_intersect(isect, ray, plane);
|
||||||
|
|
||||||
|
// Note use of 'coherent' if statement; the set of rays we
|
||||||
|
// trace will often all hit or all miss the scene
|
||||||
|
if (isect.hit) {
|
||||||
|
ret = ambient_occlusion(isect, plane, spheres, rngstate);
|
||||||
|
ret *= invSamples * invSamples;
|
||||||
|
res += ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//if (x < x1)
|
||||||
|
{
|
||||||
|
image[offset ] = res;
|
||||||
|
image[offset+1] = res;
|
||||||
|
image[offset+2] = res;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -142,7 +142,7 @@ int main(int argc, char *argv[]) {
|
|||||||
deviceFree(d_buf);
|
deviceFree(d_buf);
|
||||||
|
|
||||||
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
||||||
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
writePPM(buf, width, height, "mandelbrot-cuda.ppm");
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -73,7 +73,8 @@ mandelbrot_scanline(uniform float x0, uniform float dx,
|
|||||||
|
|
||||||
const uniform int ystart = taskIndex1 * yspan;
|
const uniform int ystart = taskIndex1 * yspan;
|
||||||
const uniform int yend = min(ystart + yspan, height);
|
const uniform int yend = min(ystart + yspan, height);
|
||||||
|
|
||||||
|
#if 0
|
||||||
for (uniform int yi = ystart; yi < yend; yi++)
|
for (uniform int yi = ystart; yi < yend; yi++)
|
||||||
for (uniform int xi = xstart; xi < xend; xi += programCount)
|
for (uniform int xi = xstart; xi < xend; xi += programCount)
|
||||||
{
|
{
|
||||||
@@ -85,6 +86,17 @@ mandelbrot_scanline(uniform float x0, uniform float dx,
|
|||||||
if (xi + programIndex < xend)
|
if (xi + programIndex < xend)
|
||||||
output[index] = res;
|
output[index] = res;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
foreach (yi = ystart ... yend, xi = xstart ... xend)
|
||||||
|
{
|
||||||
|
const float x = x0 + xi * dx;
|
||||||
|
const float y = y0 + yi * dy;
|
||||||
|
|
||||||
|
const int res = mandel(x,y,maxIterations);
|
||||||
|
const int index = yi * width + xi;
|
||||||
|
output[index] = res;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -278,44 +278,54 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
|||||||
const uniform float raster2camera[4][4],
|
const uniform float raster2camera[4][4],
|
||||||
const uniform float camera2world[4][4],
|
const uniform float camera2world[4][4],
|
||||||
uniform int width, uniform int height, uniform float image[]) {
|
uniform int width, uniform int height, uniform float image[]) {
|
||||||
// Work on 4x4=16 pixel big tiles of the image. This function thus
|
// Work on 4x4=16 pixel big tiles of the image. This function thus
|
||||||
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisible
|
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisible
|
||||||
// by 4.
|
// by 4.
|
||||||
for (uniform int y = y0; y < y1; y += 8) {
|
#if 0
|
||||||
for (uniform int x = x0; x < x1; x += 8) {
|
for (uniform int y = y0; y < y1; y += 8)
|
||||||
// foreach (o = 0 ... 16) {
|
for (uniform int x = x0; x < x1; x += 8)
|
||||||
for (uniform int ob = 0; ob < 64; ob += programCount)
|
foreach (o = 0 ... 64)
|
||||||
{
|
{
|
||||||
const int o = ob + programIndex;
|
// These two arrays encode the mapping from [0,15] to
|
||||||
|
// offsets within the 4x4 pixel block so that we render
|
||||||
// These two arrays encode the mapping from [0,15] to
|
// each pixel inside the block
|
||||||
// offsets within the 4x4 pixel block so that we render
|
const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
||||||
// each pixel inside the block
|
0, 1, 0, 1, 2, 3, 2, 3 };
|
||||||
const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
||||||
0, 1, 0, 1, 2, 3, 2, 3 };
|
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||||
const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
|
||||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
|
||||||
|
|
||||||
const uniform int xblock[4] = {0, 4, 0, 4};
|
const uniform int xblock[4] = {0, 4, 0, 4};
|
||||||
const uniform int yblock[4] = {0, 0, 4, 4};
|
const uniform int yblock[4] = {0, 0, 4, 4};
|
||||||
|
|
||||||
// Figure out the pixel to render for this program instance
|
// Figure out the pixel to render for this program instance
|
||||||
const int xo = x + xblock[o/16] + xoffsets[o&15];
|
const int xo = x + xblock[o/16] + xoffsets[o&15];
|
||||||
const int yo = y + yblock[o/16] + yoffsets[o&15];
|
const int yo = y + yblock[o/16] + yoffsets[o&15];
|
||||||
|
|
||||||
// Use viewing parameters to compute the corresponding ray
|
// Use viewing parameters to compute the corresponding ray
|
||||||
// for the pixel
|
// for the pixel
|
||||||
Ray ray;
|
Ray ray;
|
||||||
generateRay(raster2camera, camera2world, xo, yo, ray);
|
generateRay(raster2camera, camera2world, xo, yo, ray);
|
||||||
|
|
||||||
// And raymarch through the volume to compute the pixel's
|
// And raymarch through the volume to compute the pixel's
|
||||||
// value
|
// value
|
||||||
int offset = yo * width + xo;
|
int offset = yo * width + xo;
|
||||||
if (xo < x1 && yo < y1)
|
if (xo < x1 && yo < y1)
|
||||||
image[offset] = raymarch(density, nVoxels, ray);
|
image[offset] = raymarch(density, nVoxels, ray);
|
||||||
}
|
}
|
||||||
}
|
#else
|
||||||
}
|
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
|
||||||
|
{
|
||||||
|
// Use viewing parameters to compute the corresponding ray
|
||||||
|
// for the pixel
|
||||||
|
Ray ray;
|
||||||
|
generateRay(raster2camera, camera2world, x, y, ray);
|
||||||
|
|
||||||
|
// And raymarch through the volume to compute the pixel's
|
||||||
|
// value
|
||||||
|
int offset = y * width + x;
|
||||||
|
image[offset] = raymarch(density, nVoxels, ray);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
76
stmt.cpp
76
stmt.cpp
@@ -1325,28 +1325,76 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
|
|||||||
// (0,0,0,0,1,1,1,1).
|
// (0,0,0,0,1,1,1,1).
|
||||||
int32_t delta[ISPC_MAX_NVEC];
|
int32_t delta[ISPC_MAX_NVEC];
|
||||||
const int vecWidth = 32;
|
const int vecWidth = 32;
|
||||||
for (int i = 0; i < vecWidth; ++i) {
|
std::vector<llvm::Constant*> constDeltaList;
|
||||||
int d = i;
|
for (int i = 0; i < vecWidth; ++i)
|
||||||
// First, account for the effect of any dimensions at deeper
|
{
|
||||||
// nesting levels than the current one.
|
int d = i;
|
||||||
int prevDimSpanCount = 1;
|
// First, account for the effect of any dimensions at deeper
|
||||||
for (int j = dim; j < nDims-1; ++j)
|
// nesting levels than the current one.
|
||||||
prevDimSpanCount *= spans[j+1];
|
int prevDimSpanCount = 1;
|
||||||
d /= prevDimSpanCount;
|
for (int j = dim; j < nDims-1; ++j)
|
||||||
|
prevDimSpanCount *= spans[j+1];
|
||||||
|
d /= prevDimSpanCount;
|
||||||
|
|
||||||
// And now with what's left, figure out our own offset
|
// And now with what's left, figure out our own offset
|
||||||
delta[i] = d % spans[dim];
|
delta[i] = d % spans[dim];
|
||||||
|
constDeltaList.push_back(LLVMInt8(delta[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
llvm::VectorType *LLVMTypes::Int32VectorSIMT = llvm::VectorType::get(LLVMTypes::Int32Type, 32);
|
llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32);
|
||||||
llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int32Type, 32);
|
// llvm::PointerType::get(ArrayDelta, 4); /* constant memory */
|
||||||
|
|
||||||
|
|
||||||
|
llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable(
|
||||||
|
/*Module=*/*m->module,
|
||||||
|
/*Type=*/ArrayDelta,
|
||||||
|
/*isConstant=*/true,
|
||||||
|
/*Linkage=*/llvm::GlobalValue::PrivateLinkage,
|
||||||
|
/*Initializer=*/0, // has initializer, specified below
|
||||||
|
/*Name=*/"constDeltaForeach");
|
||||||
|
#if 0
|
||||||
|
/*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal,
|
||||||
|
/*unsigned AddressSpace=*/4 /*constant*/);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList);
|
||||||
|
|
||||||
|
globalDelta->setInitializer(constDelta);
|
||||||
|
llvm::Function *func_tid_x = m->module->getFunction("__tid_x");
|
||||||
|
std::vector<llvm::Value *> allocArgs;
|
||||||
|
llvm::Value *__tid_x = ctx->CallInst(func_tid_x, NULL, allocArgs, "laneIdxForEach");
|
||||||
|
llvm::Value *laneIdx = ctx->BinaryOperator(llvm::Instruction::And, __tid_x, LLVMInt32(31), "__laneidx");
|
||||||
|
|
||||||
|
std::vector<llvm::Value*> ptr_arrayidx_indices;
|
||||||
|
ptr_arrayidx_indices.push_back(LLVMInt32(0));
|
||||||
|
ptr_arrayidx_indices.push_back(laneIdx);
|
||||||
|
#if 1
|
||||||
|
llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock());
|
||||||
|
llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock());
|
||||||
|
llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type);
|
||||||
|
|
||||||
|
llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1);
|
||||||
|
llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2);
|
||||||
|
|
||||||
|
llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create(
|
||||||
|
// llvm::UndefValue(LLVMInt32Vector),
|
||||||
|
const_packed_41,
|
||||||
|
int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Add the deltas to compute the varying counter values; store the
|
// Add the deltas to compute the varying counter values; store the
|
||||||
// result to memory and then return it directly as well.
|
// result to memory and then return it directly as well.
|
||||||
|
#if 0
|
||||||
llvm::Value *varyingCounter =
|
llvm::Value *varyingCounter =
|
||||||
ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
|
ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
|
||||||
LLVMInt32Vector(delta), "iter_val");
|
LLVMInt32Vector(delta), "iter_val");
|
||||||
|
#else
|
||||||
|
llvm::Value *varyingCounter =
|
||||||
|
ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
|
||||||
|
packed_43, "iter_val");
|
||||||
|
#endif
|
||||||
ctx->StoreInst(varyingCounter, varyingCounterPtr);
|
ctx->StoreInst(varyingCounter, varyingCounterPtr);
|
||||||
return varyingCounter;
|
return varyingCounter;
|
||||||
}
|
}
|
||||||
@@ -1895,12 +1943,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
std::vector<int> span(nDims, 0);
|
std::vector<int> span(nDims, 0);
|
||||||
const int vectorWidth = 32;
|
const int vectorWidth = 32;
|
||||||
lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]);
|
lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]);
|
||||||
|
#if 0
|
||||||
for (int i = 0; i < nDims; i++)
|
for (int i = 0; i < nDims; i++)
|
||||||
{
|
{
|
||||||
fprintf(stderr, " i= %d [ %d ] : %d \n",
|
fprintf(stderr, " i= %d [ %d ] : %d \n",
|
||||||
i, nDims, span[i]);
|
i, nDims, span[i]);
|
||||||
}
|
}
|
||||||
fprintf(stderr, " --- \n");
|
fprintf(stderr, " --- \n");
|
||||||
|
#endif
|
||||||
|
|
||||||
for (int i = 0; i < nDims; ++i) {
|
for (int i = 0; i < nDims; ++i) {
|
||||||
// Basic blocks that we'll fill in later with the looping logic for
|
// Basic blocks that we'll fill in later with the looping logic for
|
||||||
|
|||||||
Reference in New Issue
Block a user