diff --git a/examples_cuda/common.mk b/examples_cuda/common.mk index 24581dd8..518eec69 100644 --- a/examples_cuda/common.mk +++ b/examples_cuda/common.mk @@ -10,7 +10,7 @@ CCFLAGS+=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ ISPC=ispc -ISPC_FLAGS+=-O2 +ISPC_FLAGS+=-O2 --opt=fast-math --math-lib=fast ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) diff --git a/examples_cuda/deferred/Makefile_gpu b/examples_cuda/deferred/Makefile_gpu index ddc74f56..2d06e3e7 100644 --- a/examples_cuda/deferred/Makefile_gpu +++ b/examples_cuda/deferred/Makefile_gpu @@ -8,7 +8,7 @@ LD=g++ LDFLAGS=-lcuda ISPC=ispc -ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 +ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 LLVM = $(HOME)/usr/local/llvm/bin-3.3 diff --git a/examples_cuda/mandelbrot_tasks3d/Makefile_gpu b/examples_cuda/mandelbrot_tasks3d/Makefile_gpu index 1bb6d8d1..e5ba555c 100644 --- a/examples_cuda/mandelbrot_tasks3d/Makefile_gpu +++ b/examples_cuda/mandelbrot_tasks3d/Makefile_gpu @@ -8,7 +8,7 @@ LD=g++ LDFLAGS=-lcuda ISPC=ispc -ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 +ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 LLVM = $(HOME)/usr/local/llvm/bin-3.3 diff --git a/examples_cuda/volume_rendering/Makefile_gpu b/examples_cuda/volume_rendering/Makefile_gpu index 80cbcd1b..1a314788 100644 --- a/examples_cuda/volume_rendering/Makefile_gpu +++ b/examples_cuda/volume_rendering/Makefile_gpu @@ -8,7 +8,7 @@ LD=g++ LDFLAGS=-lcuda ISPC=ispc -ISPCFLAGS=-O3 --math-lib=default --target=nvptx64,avx +ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 LLVM = $(HOME)/usr/local/llvm/bin-3.3 @@ -37,7 +37,7 @@ $(PROG): $(CXX_OBJ) kernel.ptx %_ispc_nvptx64.bc: %.ispc - $(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc.bc -h `basename $< .ispc`_ispc.h $< --emit-llvm + $(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc_nvptx64.bc -h `basename $< .ispc`_ispc.h $< --emit-llvm %.ptx: %.bc $(LLVM32DIS) $< diff --git a/examples_cuda/volume_rendering/volume.cu b/examples_cuda/volume_rendering/volume.cu index 909f1f45..547bd5a0 100644 --- a/examples_cuda/volume_rendering/volume.cu +++ b/examples_cuda/volume_rendering/volume.cu @@ -362,7 +362,6 @@ volume_tile(int x0, int y0, int x1, } -extern "C" __global__ void volume_task(float density[], int _nVoxels[3], const float _raster2camera[4][4], @@ -432,3 +431,16 @@ volume_task(float density[], int _nVoxels[3], } +extern "C" +__global__ void +volume_ispc_tasks( float density[], int nVoxels[3], + const float raster2camera[4][4], + const float camera2world[4][4], + int width, int height, float image[]) { + // Launch tasks to work on (dx,dy)-sized tiles of the image + int dx = 8, dy = 8; + int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy); + if (programIndex == 0) + volume_task<<<(nTasks-1)/4+1, 128>>>(density, nVoxels, raster2camera, camera2world, + width, height, image); +}