From af75afeb7a8bcfb5745e862495a793580367267d Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 14 Nov 2013 16:29:40 +0100 Subject: [PATCH] foreach[_tiled] seems to work now --- examples_cuda/aobench/ao1.ispc | 89 +++++++++---------- .../mandelbrot_tasks3d/mandel_cu.cpp | 2 +- .../mandelbrot_tasks3d.ispc | 14 ++- examples_cuda/volume_rendering/volume1.ispc | 78 +++++++++------- stmt.cpp | 76 +++++++++++++--- 5 files changed, 164 insertions(+), 95 deletions(-) diff --git a/examples_cuda/aobench/ao1.ispc b/examples_cuda/aobench/ao1.ispc index cccb3d7b..2dc612bc 100644 --- a/examples_cuda/aobench/ao1.ispc +++ b/examples_cuda/aobench/ao1.ispc @@ -228,57 +228,54 @@ static inline void ao_tile( seed_rng(&rngstate, programIndex + (y0 << (programIndex & 31))); float invSamples = 1.f / nsubsamples; - for (uniform int y = y0; y < y1; y++) - for (uniform int xb = x0; xb < x1; xb += programCount) - { - const int x = xb + programIndex; - const int offset = 3 * (y * w + x); - float res = 0.0f; + foreach_tiled (y = y0 ... y1, x = x0 ... x1) + { + const int offset = 3 * (y * w + x); + float res = 0.0f; - for (uniform int u = 0; u < nsubsamples; u++) - for (uniform int v = 0; v < nsubsamples; v++) - { - float du = (float)u * invSamples, dv = (float)v * invSamples; - - // Figure out x,y pixel in NDC - float px = (x + du - (w / 2.0f)) / (w / 2.0f); - float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); - float ret = 0.f; - Ray ray; - Isect isect; - - ray.org = 0.f; - - // Poor man's perspective projection - ray.dir.x = px; - ray.dir.y = py; - ray.dir.z = -1.0; - vnormalize(ray.dir); - - isect.t = 1.0e+17; - isect.hit = 0; - - for (uniform int snum = 0; snum < 3; ++snum) - ray_sphere_intersect(isect, ray, spheres[snum]); - ray_plane_intersect(isect, ray, plane); - - // Note use of 'coherent' if statement; the set of rays we - // trace will often all hit or all miss the scene - if (isect.hit) { - ret = ambient_occlusion(isect, plane, spheres, rngstate); - ret *= invSamples * invSamples; - res += ret; - } - } - - if (xb < x1) + for (uniform int u = 0; u < nsubsamples; u++) + for (uniform int v = 0; v < nsubsamples; v++) { - image[offset ] = res; - image[offset+1] = res; - image[offset+2] = res; + float du = (float)u * invSamples, dv = (float)v * invSamples; + + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = 0.f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + if (isect.hit) { + ret = ambient_occlusion(isect, plane, spheres, rngstate); + ret *= invSamples * invSamples; + res += ret; + } } + //if (x < x1) + { + image[offset ] = res; + image[offset+1] = res; + image[offset+2] = res; } + } } diff --git a/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp index d16c5b3c..0963984c 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp +++ b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp @@ -142,7 +142,7 @@ int main(int argc, char *argv[]) { deviceFree(d_buf); printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); - writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + writePPM(buf, width, height, "mandelbrot-cuda.ppm"); // diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc index 83b2f27f..cd084aec 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc +++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc @@ -73,7 +73,8 @@ mandelbrot_scanline(uniform float x0, uniform float dx, const uniform int ystart = taskIndex1 * yspan; const uniform int yend = min(ystart + yspan, height); - + +#if 0 for (uniform int yi = ystart; yi < yend; yi++) for (uniform int xi = xstart; xi < xend; xi += programCount) { @@ -85,6 +86,17 @@ mandelbrot_scanline(uniform float x0, uniform float dx, if (xi + programIndex < xend) output[index] = res; } +#else + foreach (yi = ystart ... yend, xi = xstart ... xend) + { + const float x = x0 + xi * dx; + const float y = y0 + yi * dy; + + const int res = mandel(x,y,maxIterations); + const int index = yi * width + xi; + output[index] = res; + } +#endif } diff --git a/examples_cuda/volume_rendering/volume1.ispc b/examples_cuda/volume_rendering/volume1.ispc index 4dd7029f..d680bf00 100644 --- a/examples_cuda/volume_rendering/volume1.ispc +++ b/examples_cuda/volume_rendering/volume1.ispc @@ -278,44 +278,54 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1, const uniform float raster2camera[4][4], const uniform float camera2world[4][4], uniform int width, uniform int height, uniform float image[]) { - // Work on 4x4=16 pixel big tiles of the image. This function thus - // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble - // by 4. - for (uniform int y = y0; y < y1; y += 8) { - for (uniform int x = x0; x < x1; x += 8) { -// foreach (o = 0 ... 16) { - for (uniform int ob = 0; ob < 64; ob += programCount) - { - const int o = ob + programIndex; - - // These two arrays encode the mapping from [0,15] to - // offsets within the 4x4 pixel block so that we render - // each pixel inside the block - const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3, - 0, 1, 0, 1, 2, 3, 2, 3 }; - const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1, - 2, 2, 3, 3, 2, 2, 3, 3 }; + // Work on 4x4=16 pixel big tiles of the image. This function thus + // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble + // by 4. +#if 0 + for (uniform int y = y0; y < y1; y += 8) + for (uniform int x = x0; x < x1; x += 8) + foreach (o = 0 ... 64) + { + // These two arrays encode the mapping from [0,15] to + // offsets within the 4x4 pixel block so that we render + // each pixel inside the block + const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3, + 0, 1, 0, 1, 2, 3, 2, 3 }; + const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1, + 2, 2, 3, 3, 2, 2, 3, 3 }; - const uniform int xblock[4] = {0, 4, 0, 4}; - const uniform int yblock[4] = {0, 0, 4, 4}; + const uniform int xblock[4] = {0, 4, 0, 4}; + const uniform int yblock[4] = {0, 0, 4, 4}; - // Figure out the pixel to render for this program instance - const int xo = x + xblock[o/16] + xoffsets[o&15]; - const int yo = y + yblock[o/16] + yoffsets[o&15]; + // Figure out the pixel to render for this program instance + const int xo = x + xblock[o/16] + xoffsets[o&15]; + const int yo = y + yblock[o/16] + yoffsets[o&15]; - // Use viewing parameters to compute the corresponding ray - // for the pixel - Ray ray; - generateRay(raster2camera, camera2world, xo, yo, ray); + // Use viewing parameters to compute the corresponding ray + // for the pixel + Ray ray; + generateRay(raster2camera, camera2world, xo, yo, ray); - // And raymarch through the volume to compute the pixel's - // value - int offset = yo * width + xo; - if (xo < x1 && yo < y1) - image[offset] = raymarch(density, nVoxels, ray); - } - } - } + // And raymarch through the volume to compute the pixel's + // value + int offset = yo * width + xo; + if (xo < x1 && yo < y1) + image[offset] = raymarch(density, nVoxels, ray); + } +#else + foreach_tiled (y = y0 ... y1, x = x0 ... x1) + { + // Use viewing parameters to compute the corresponding ray + // for the pixel + Ray ray; + generateRay(raster2camera, camera2world, x, y, ray); + + // And raymarch through the volume to compute the pixel's + // value + int offset = y * width + x; + image[offset] = raymarch(density, nVoxels, ray); + } +#endif } diff --git a/stmt.cpp b/stmt.cpp index 05209e14..ee990c51 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -1325,28 +1325,76 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, // (0,0,0,0,1,1,1,1). int32_t delta[ISPC_MAX_NVEC]; const int vecWidth = 32; - for (int i = 0; i < vecWidth; ++i) { - int d = i; - // First, account for the effect of any dimensions at deeper - // nesting levels than the current one. - int prevDimSpanCount = 1; - for (int j = dim; j < nDims-1; ++j) - prevDimSpanCount *= spans[j+1]; - d /= prevDimSpanCount; + std::vector constDeltaList; + for (int i = 0; i < vecWidth; ++i) + { + int d = i; + // First, account for the effect of any dimensions at deeper + // nesting levels than the current one. + int prevDimSpanCount = 1; + for (int j = dim; j < nDims-1; ++j) + prevDimSpanCount *= spans[j+1]; + d /= prevDimSpanCount; - // And now with what's left, figure out our own offset - delta[i] = d % spans[dim]; + // And now with what's left, figure out our own offset + delta[i] = d % spans[dim]; + constDeltaList.push_back(LLVMInt8(delta[i])); } - llvm::VectorType *LLVMTypes::Int32VectorSIMT = llvm::VectorType::get(LLVMTypes::Int32Type, 32); - llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int32Type, 32); + llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32); +// llvm::PointerType::get(ArrayDelta, 4); /* constant memory */ + + + llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable( + /*Module=*/*m->module, + /*Type=*/ArrayDelta, + /*isConstant=*/true, + /*Linkage=*/llvm::GlobalValue::PrivateLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/"constDeltaForeach"); +#if 0 + /*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal, + /*unsigned AddressSpace=*/4 /*constant*/); +#endif + + + llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList); + + globalDelta->setInitializer(constDelta); + llvm::Function *func_tid_x = m->module->getFunction("__tid_x"); + std::vector allocArgs; + llvm::Value *__tid_x = ctx->CallInst(func_tid_x, NULL, allocArgs, "laneIdxForEach"); + llvm::Value *laneIdx = ctx->BinaryOperator(llvm::Instruction::And, __tid_x, LLVMInt32(31), "__laneidx"); + + std::vector ptr_arrayidx_indices; + ptr_arrayidx_indices.push_back(LLVMInt32(0)); + ptr_arrayidx_indices.push_back(laneIdx); +#if 1 + llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock()); + llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock()); + llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type); + + llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1); + llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2); + + llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create( +// llvm::UndefValue(LLVMInt32Vector), + const_packed_41, + int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock()); +#endif + - // Add the deltas to compute the varying counter values; store the // result to memory and then return it directly as well. +#if 0 llvm::Value *varyingCounter = ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, LLVMInt32Vector(delta), "iter_val"); +#else + llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + packed_43, "iter_val"); +#endif ctx->StoreInst(varyingCounter, varyingCounterPtr); return varyingCounter; } @@ -1895,12 +1943,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { std::vector span(nDims, 0); const int vectorWidth = 32; lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); +#if 0 for (int i = 0; i < nDims; i++) { fprintf(stderr, " i= %d [ %d ] : %d \n", i, nDims, span[i]); } fprintf(stderr, " --- \n"); +#endif for (int i = 0; i < nDims; ++i) { // Basic blocks that we'll fill in later with the looping logic for