diff --git a/examples_cuda/cuda_ispc.h b/examples_cuda/cuda_ispc.h index 14a50bbb..7b034411 100644 --- a/examples_cuda/cuda_ispc.h +++ b/examples_cuda/cuda_ispc.h @@ -68,6 +68,7 @@ static void createContext( // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); +#if 0 size_t limit; checkCudaErrors(cuCtxGetLimit(&limit, CU_LIMIT_STACK_SIZE)); fprintf(stderr, " stack_limit= %llu KB\n", limit/1024); @@ -75,6 +76,7 @@ static void createContext( fprintf(stderr, " heap_limit= %llu KB\n", limit/1024); checkCudaErrors(cuCtxSetLimit(CU_LIMIT_STACK_SIZE,stackLimit)); checkCudaErrors(cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE,heapLimit)); +#endif } static void destroyContext() { diff --git a/examples_cuda/deferred/Makefile_gpu b/examples_cuda/deferred/Makefile_gpu index 2d06e3e7..83b45531 100644 --- a/examples_cuda/deferred/Makefile_gpu +++ b/examples_cuda/deferred/Makefile_gpu @@ -13,6 +13,8 @@ ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 LLVM = $(HOME)/usr/local/llvm/bin-3.3 PTXGEN = $(HOME)/ptxgen +PTXGEN += -opt=3 +PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1 LLVM32DIS=$(LLVM32)/bin/llvm-dis diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu index 85b646c2..2388ea22 100644 --- a/examples_cuda/deferred/kernels.cu +++ b/examples_cuda/deferred/kernels.cu @@ -171,7 +171,7 @@ struct Uniform shptr[chunk][elem] = value; } }; -#elif 1 +#elif 0 template struct Uniform { @@ -184,7 +184,7 @@ struct Uniform __device__ inline Uniform() { -#if 0 +#if 1 if (programIndex == 0) data = new T[N]; ptr[0] = __shfl(ptr[0], 0); @@ -200,7 +200,7 @@ struct Uniform } __device__ inline ~Uniform() { -#if 0 +#if 1 if (programIndex == 0) delete data; #else @@ -821,4 +821,5 @@ RenderStatic(InputHeader inputHeaderPtr[], inputHeaderPtr, inputDataPtr, visualizeLightCount, framebuffer_r, framebuffer_g, framebuffer_b); cudaDeviceSynchronize(); + cudaDeviceSynchronize(); } diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc index 740d86a3..1c0962cc 100644 --- a/examples_cuda/deferred/kernels1.ispc +++ b/examples_cuda/deferred/kernels1.ispc @@ -549,6 +549,7 @@ RenderStatic(uniform InputHeader inputHeaderPtr[], launch[num_groups] RenderTile(num_groups_x, num_groups_y, inputHeaderPtr, inputDataPtr, visualizeLightCount, framebuffer_r, framebuffer_g, framebuffer_b); + sync; } diff --git a/examples_cuda/mandelbrot_tasks3d/Makefile_gpu b/examples_cuda/mandelbrot_tasks3d/Makefile_gpu index e5ba555c..e5f8e001 100644 --- a/examples_cuda/mandelbrot_tasks3d/Makefile_gpu +++ b/examples_cuda/mandelbrot_tasks3d/Makefile_gpu @@ -13,6 +13,8 @@ ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 LLVM = $(HOME)/usr/local/llvm/bin-3.3 PTXGEN = $(HOME)/ptxgen +PTXGEN += -opt=3 +PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1 LLVM32DIS=$(LLVM32)/bin/llvm-dis diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc index cd084aec..1f038674 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc +++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc @@ -119,5 +119,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0, #endif mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan, maxIterations, output); + sync; } #endif diff --git a/examples_cuda/rt/Makefile_gpu b/examples_cuda/rt/Makefile_gpu index 82d1d5d2..2eba1da2 100644 --- a/examples_cuda/rt/Makefile_gpu +++ b/examples_cuda/rt/Makefile_gpu @@ -13,6 +13,8 @@ ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 LLVM = $(HOME)/usr/local/llvm/bin-3.3 PTXGEN = $(HOME)/ptxgen +PTXGEN += -opt=3 +PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1 LLVM32DIS=$(LLVM32)/bin/llvm-dis diff --git a/examples_cuda/volume_rendering/Makefile_gpu b/examples_cuda/volume_rendering/Makefile_gpu index 1a314788..028224ef 100644 --- a/examples_cuda/volume_rendering/Makefile_gpu +++ b/examples_cuda/volume_rendering/Makefile_gpu @@ -13,6 +13,8 @@ ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 LLVM = $(HOME)/usr/local/llvm/bin-3.3 PTXGEN = $(HOME)/ptxgen +PTXGEN += -opt=3 +PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1 LLVM32DIS=$(LLVM32)/bin/llvm-dis diff --git a/examples_cuda/volume_rendering/volume1.ispc b/examples_cuda/volume_rendering/volume1.ispc index d680bf00..5437b56f 100644 --- a/examples_cuda/volume_rendering/volume1.ispc +++ b/examples_cuda/volume_rendering/volume1.ispc @@ -419,4 +419,5 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy); launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world, width, height, image); + sync; }