diff --git a/examples/mandelbrot_tasks3d/Makefile b/examples/mandelbrot_tasks3d/Makefile index 3dd44d65..ad1a9b3a 100644 --- a/examples/mandelbrot_tasks3d/Makefile +++ b/examples/mandelbrot_tasks3d/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks3d CPP_SRC=mandelbrot_tasks3d.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks3d.ispc -ISPC_IA_TARGETS=avx,sse2,sse4 +ISPC_IA_TARGETS=avx ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp new file mode 100644 index 00000000..0f7f0884 --- /dev/null +++ b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp @@ -0,0 +1,402 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include "../timing.h" + +#include +#include +#include +#include +#include "drvapi_error_string.h" + +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) +// These are the inline versions for all of the SDK helper functions +void __checkCudaErrors(CUresult err, const char *file, const int line) { + if(CUDA_SUCCESS != err) { + std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" + << getCudaDrvErrorString(err) << "\" from file <" << file + << ", line " << line << "\n"; + exit(-1); + } +} +extern "C" +void mandelbrot_ispc( + float x0, float y0, + float x1, float y1, + int width, int height, + int maxIterations, int output[]) ; + + +/**********************/ +/* Basic CUDriver API */ +CUcontext context; + +void createContext(const int deviceId = 0) +{ + CUdevice device; + int devCount; + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&devCount)); + assert(devCount > 0); + checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + std::cout << "Using CUDA Device [0]: " << name << "\n"; + + int devMajor, devMinor; + checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); + std::cout << "Device Compute Capability: " + << devMajor << "." << devMinor << "\n"; + if (devMajor < 2) { + std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; + exit(1); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); +} +void destroyContext() +{ + checkCudaErrors(cuCtxDestroy(context)); +} + +CUmodule loadModule(const char * module) +{ + CUmodule cudaModule; + checkCudaErrors(cuModuleLoadData(&cudaModule, module)); + return cudaModule; +} +void unloadModule(CUmodule &cudaModule) +{ + checkCudaErrors(cuModuleUnload(cudaModule)); +} + +CUfunction getFunction(CUmodule &cudaModule, const char * function) +{ + CUfunction cudaFunction; + checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function)); + return cudaFunction; +} + +CUdeviceptr deviceMalloc(const size_t size) +{ + CUdeviceptr d_buf; + checkCudaErrors(cuMemAlloc(&d_buf, size)); + return d_buf; +} +void deviceFree(CUdeviceptr d_buf) +{ + checkCudaErrors(cuMemFree(d_buf)); +} +void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size)); +} +void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); +} +#define deviceLaunch(func,nbx,nby,nbz,params) \ + checkCudaErrors( \ + cuLaunchKernel( \ + (func), \ + ((nbx-1)/(128/32)+1), (nby), (nbz), \ + 128, 1, 1, \ + 0, NULL, (params), NULL \ + )); + +typedef CUdeviceptr devicePtr; + + +/**************/ +#include +std::vector readBinary(const char * filename) +{ + std::vector buffer; + FILE *fp = fopen(filename, "rb"); + if (!fp ) + { + fprintf(stderr, "file %s not found\n", filename); + assert(0); + } +#if 0 + char c; + while ((c = fgetc(fp)) != EOF) + buffer.push_back(c); +#else + fseek(fp, 0, SEEK_END); + const unsigned long long size = ftell(fp); /*calc the size needed*/ + fseek(fp, 0, SEEK_SET); + buffer.resize(size); + + if (fp == NULL){ /*ERROR detection if file == empty*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); + exit(1); + } + else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); + exit(1); + } +#endif + fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); + return buffer; +} + +extern "C" +{ +#if 0 + struct ModuleManager + { + private: + typedef std::pair ModulePair; + typedef std::map ModuleMap; + ModuleMap module_list; + + ModuleMap::iterator findModule(const char * module_name) + { + return module_list.find(std::string(module_name)); + } + + public: + + CUmodule loadModule(const char * module_name, const char * module_data) + { + const ModuleMap::iterator it = findModule(module_name) + if (it != ModuleMap::end) + { + CUmodule cudaModule = loadModule(module); + module_list.insert(std::make_pair(std::string(module_name), cudaModule)); + return cudaModule + } + return it->second; + } + void unloadModule(const char * module_name) + { + ModuleMap::iterator it = findModule(module_name) + if (it != ModuleMap::end) + module_list.erase(it); + } + }; +#endif + + void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment) + { +#if 0 + fprintf(stderr, " ptr= %p\n", *handlePtr); + fprintf(stderr, " size= %d\n", (int)size); + fprintf(stderr, " alignment= %d\n", (int)alignment); + fprintf(stderr, " ------- \n\n"); +#endif + return NULL; + } + void CUDALaunch( + void **handlePtr, + const char * module_name, + const char * module_1, + const char * func_name, + void **func_args, + int countx, int county, int countz) + { + assert(module_name != NULL); + assert(module_1 != NULL); + assert(func_name != NULL); + assert(func_args != NULL); +#if 1 + const char * module = module_1; +#else + const std::vector module_str = readBinary("kernel.cubin"); + const char * module = &module_str[0]; +#endif +#if 1 + CUmodule cudaModule = loadModule(module); + CUfunction cudaFunction = getFunction(cudaModule, func_name); + deviceLaunch(cudaFunction, countx, county, countz, func_args); + unloadModule(cudaModule); +#else + fprintf(stderr, " handle= %p\n", *handlePtr); + fprintf(stderr, " count= %d %d %d\n", countx, county, countz); + + fprintf(stderr, " module_name= %s \n", module_name); + fprintf(stderr, " func_name= %s \n", func_name); +// fprintf(stderr, " ptx= %s \n", module); + fprintf(stderr, " x0= %g \n", *((float*)(func_args[0]))); + fprintf(stderr, " dx= %g \n", *((float*)(func_args[1]))); + fprintf(stderr, " y0= %g \n", *((float*)(func_args[2]))); + fprintf(stderr, " dy= %g \n", *((float*)(func_args[3]))); + fprintf(stderr, " w= %d \n", *((int*)(func_args[4]))); + fprintf(stderr, " h= %d \n", *((int*)(func_args[5]))); + fprintf(stderr, " xs= %d \n", *((int*)(func_args[6]))); + fprintf(stderr, " ys= %d \n", *((int*)(func_args[7]))); + fprintf(stderr, " maxit= %d \n", *((int*)(func_args[8]))); + fprintf(stderr, " ptr= %p \n", *((int**)(func_args[9]))); + fprintf(stderr, " ------- \n\n"); +#endif + } + void CUDASync(void *handle) + { + checkCudaErrors(cuStreamSynchronize(0)); + } + void ISPCSync(void *handle) + { + } + void CUDAFree(void *handle) + { + } +} + +/********************/ + + +extern void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]); + +/* Write a PPM image file with the image of the Mandelbrot set */ +static void +writePPM(int *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + // Map the iteration count to colors by just alternating between + // two greys. + char c = (buf[i] & 0x1) ? 240 : 20; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); + printf("Wrote image file %s\n", fn); +} + + +static void usage() { + fprintf(stderr, "usage: mandelbrot [--scale=]\n"); + exit(1); +} + +int main(int argc, char *argv[]) { + unsigned int width = 1536; + unsigned int height = 1024; + float x0 = -2; + float x1 = 1; + float y0 = -1; + float y1 = 1; + + if (argc == 1) + ; + else if (argc == 2) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + if (scale == 0.f) + usage(); + width *= scale; + height *= scale; + // round up to multiples of 16 + width = (width + 0xf) & ~0xf; + height = (height + 0xf) & ~0xf; + } + else + usage(); + } + else + usage(); + + /*******************/ + createContext(); + /*******************/ + + int maxIterations = 512; + int *buf = new int[width*height]; + + for (unsigned int i = 0; i < width*height; i++) + buf[i] = 0; + const size_t bufsize = sizeof(int)*width*height; + devicePtr d_buf = deviceMalloc(bufsize); + memcpyH2D(d_buf, buf, bufsize); + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, (int*)d_buf); + double dt = get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + memcpyD2H(buf, d_buf, bufsize); + deviceFree(d_buf); + + printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); + writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minSerial = 1e30; + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } + + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); + writePPM(buf, width, height, "mandelbrot-serial.ppm"); + + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); + + return 0; +} diff --git a/examples_cuda/mandelbrot_tasks3d/mandel_task_cu.cu b/examples_cuda/mandelbrot_tasks3d/mandel_task_cu.cu index 33003897..3252fad1 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandel_task_cu.cu +++ b/examples_cuda/mandelbrot_tasks3d/mandel_task_cu.cu @@ -1,8 +1,8 @@ #include -#define blockIndex0 (blockIdx.x) +#define blockIndex0 (blockIdx.x*4 + (threadIdx.x >> 5)) #define blockIndex1 (blockIdx.y) #define vectorWidth (32) -#define vectorIndex (threadIdx.x & (vectorWidth-1)) +#define vectorIndex (threadIdx.x & 31) int __device__ __forceinline__ mandel(float c_re, float c_im, int count) diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.ispc b/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.ispc index 7ba09a2e..1b6e1040 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.ispc +++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.ispc @@ -1,13 +1,10 @@ #ifdef __NVPTX__ -#define blockIndex0 blockIndex0() -#define blockIndex1 blockIndex1() -#define vectorWidth warpSize() -#define vectorIndex laneIndex() -#else -#define blockIndex0 taskIndex0 -#define blockIndex1 taskIndex1 -#define vectorWidth programCount -#define vectorIndex programIndex +#define taskIndex0 blockIndex0() +#define taskIndex1 blockIndex1() +#define taskCount0 blockCount0() +#define taskCount1 blockCount1() +#define programCount warpSize() +#define programIndex laneIndex() #endif #if 0 @@ -46,23 +43,25 @@ mandelbrot_scanline( uniform int xspan, uniform int yspan, uniform int maxIterations, uniform int output[]) { - const uniform int xstart = blockIndex0 * xspan; + if (taskIndex0 >= taskCount0) return; + if (taskIndex1 >= taskCount1) return; + + const uniform int xstart = taskIndex0 * xspan; const uniform int xend = min(xstart + xspan, width); - const uniform int ystart = blockIndex1 * yspan; + const uniform int ystart = taskIndex1 * yspan; const uniform int yend = min(ystart + yspan, height); -// assert(xspan >= vectorWidth); for (uniform int yi = ystart; yi < yend; yi++) - for (uniform int xi = xstart; xi < xend; xi += vectorWidth) + for (uniform int xi = xstart; xi < xend; xi += programCount) { - const float x = x0 + (xi + vectorIndex) * dx; + const float x = x0 + (xi + programIndex) * dx; const float y = y0 + yi * dy; const int res = mandel(x,y,maxIterations); - const int index = yi * width + (xi + vectorIndex); - if (xi + vectorIndex < xend) + const int index = yi * width + (xi + programIndex); + if (xi + programIndex < xend) output[index] = res; } } diff --git a/examples_cuda/stencil/a.out b/examples_cuda/stencil/a.out deleted file mode 100755 index db6400a6..00000000 Binary files a/examples_cuda/stencil/a.out and /dev/null differ diff --git a/examples_cuda/stencil/stencil.cu b/examples_cuda/stencil/stencil.cu index 63b051e8..7895589f 100644 --- a/examples_cuda/stencil/stencil.cu +++ b/examples_cuda/stencil/stencil.cu @@ -1,6 +1,6 @@ #define programCount 32 -#define programIndex threadIdx.x -#define taskIndex blockIdx.x +#define programIndex (threadIdx.x & 31) +#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) __device__ static void stencil_step( int x0, int x1, diff --git a/examples_cuda/stencil/stencil.cubin b/examples_cuda/stencil/stencil.cubin index db1b1bca..5e8d9431 100644 Binary files a/examples_cuda/stencil/stencil.cubin and b/examples_cuda/stencil/stencil.cubin differ diff --git a/examples_cuda/stencil/stencil.ispc b/examples_cuda/stencil/stencil.ispc index f96ea8fa..d2e095b3 100644 --- a/examples_cuda/stencil/stencil.ispc +++ b/examples_cuda/stencil/stencil.ispc @@ -34,13 +34,14 @@ #ifdef __NVPTX__ #warning "emitting DEVICE code" #define taskIndex blockIndex0() +#define taskCount blockCount0() #define programIndex laneIndex() #define programCount warpSize() #else #warning "emitting HOST code" #endif -static void +static inline void stencil_step(uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int z1, @@ -50,29 +51,62 @@ stencil_step(uniform int x0, uniform int x1, const uniform int Nxy = Nx * Ny; // foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) +#if 0 +#define VER1 +#endif + +#ifdef VER1 + const uniform long x1o = 1; + const uniform long x2o = 2; + const uniform long x3o = 3; + const uniform long y1o = Nx; + const uniform long y2o = Nx*2; + const uniform long y3o = Nx*3; + const uniform long z1o = Nxy; + const uniform long z2o = Nxy*2; + const uniform long z3o = Nxy*3; +#endif for (uniform int z = z0; z < z1; z++) for (uniform int y = y0; y < y1; y++) - for (uniform int xb = x0; xb < x1; xb += programCount) { - const int x = xb + programIndex; - int index = (z * Nxy) + (y * Nx) + x; + const int index_base = (z * Nxy) + (y * Nx); + for (uniform int xb = x0; xb < x1; xb += programCount) + { + const int x = xb + programIndex; + int index = index_base + x; +#ifndef VER1 #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = coef[0] * A_cur(0, 0, 0) + + double div = coef[0] * A_cur(0, 0, 0) + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); +#else +#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] +#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + + A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + + A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + + coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + + A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + + A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + + coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + + A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + + A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); +#endif - if (x < x1) - A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - } + if (x < x1) + A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + } + } } @@ -83,6 +117,8 @@ stencil_step_task(uniform int x0, uniform int x1, uniform int Nx, uniform int Ny, uniform int Nz, uniform const double coef[4], uniform const double vsq[], uniform const double Ain[], uniform double Aout[]) { + if(taskIndex >= taskCount) return; + stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, Nx, Ny, Nz, coef, vsq, Ain, Aout); } diff --git a/examples_cuda/stencil/stencil_cu.cpp b/examples_cuda/stencil/stencil_cu.cpp index 3f06e841..9cdd1050 100644 --- a/examples_cuda/stencil/stencil_cu.cpp +++ b/examples_cuda/stencil/stencil_cu.cpp @@ -132,11 +132,12 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); } #define deviceLaunch(func,nbx,nby,nbz,params) \ + checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \ checkCudaErrors( \ cuLaunchKernel( \ (func), \ - (nbx), (nby), (nbz), \ - 32, 1, 1, \ + ((nbx-1)/(128/32)+1), (nby), (nbz), \ + 128, 1, 1, \ 0, NULL, (params), NULL \ )); @@ -144,6 +145,38 @@ typedef CUdeviceptr devicePtr; /**************/ +#include +std::vector readBinary(const char * filename) +{ + std::vector buffer; + FILE *fp = fopen(filename, "rb"); + if (!fp ) + { + fprintf(stderr, "file %s not found\n", filename); + assert(0); + } +#if 0 + char c; + while ((c = fgetc(fp)) != EOF) + buffer.push_back(c); +#else + fseek(fp, 0, SEEK_END); + const unsigned long long size = ftell(fp); /*calc the size needed*/ + fseek(fp, 0, SEEK_SET); + buffer.resize(size); + + if (fp == NULL){ /*ERROR detection if file == empty*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); + exit(1); + } + else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); + exit(1); + } +#endif + fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); + return buffer; +} extern "C" { @@ -155,15 +188,21 @@ extern "C" void CUDALaunch( void **handlePtr, const char * module_name, - const char * module, + const char * module_1, const char * func_name, void **func_args, int countx, int county, int countz) { assert(module_name != NULL); - assert(module != NULL); + assert(module_1 != NULL); assert(func_name != NULL); assert(func_args != NULL); +#if 1 + const char * module = module_1; +#else + const std::vector module_str = readBinary("kernel.cubin"); + const char * module = &module_str[0]; +#endif CUmodule cudaModule = loadModule(module); CUfunction cudaFunction = getFunction(cudaModule, func_name); deviceLaunch(cudaFunction, countx, county, countz, func_args); @@ -184,134 +223,134 @@ extern "C" extern void loop_stencil_serial(int t0, int t1, int x0, int x1, - int y0, int y1, int z0, int z1, - int Nx, int Ny, int Nz, - const double coef[5], - const double vsq[], - double Aeven[], double Aodd[]); + int y0, int y1, int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[5], + const double vsq[], + double Aeven[], double Aodd[]); void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { - int offset = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); - A[1][offset] = 0; - vsq[offset] = x*y*z / double(Nx * Ny * Nz); - } + int offset = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); + A[1][offset] = 0; + vsq[offset] = x*y*z / double(Nx * Ny * Nz); + } } int main() { - int Nx = 256, Ny = 256, Nz = 256; - int width = 4; - double *Aserial[2], *Aispc[2]; - Aserial[0] = new double [Nx * Ny * Nz]; - Aserial[1] = new double [Nx * Ny * Nz]; - Aispc[0] = new double [Nx * Ny * Nz]; - Aispc[1] = new double [Nx * Ny * Nz]; - double *vsq = new double [Nx * Ny * Nz]; + int Nx = 256, Ny = 256, Nz = 256; + int width = 4; + double *Aserial[2], *Aispc[2]; + Aserial[0] = new double [Nx * Ny * Nz]; + Aserial[1] = new double [Nx * Ny * Nz]; + Aispc[0] = new double [Nx * Ny * Nz]; + Aispc[1] = new double [Nx * Ny * Nz]; + double *vsq = new double [Nx * Ny * Nz]; - double coeff[4] = { 0.5, -.25, .125, -.0625 }; - - /*******************/ - createContext(); - /*******************/ + double coeff[4] = { 0.5, -.25, .125, -.0625 }; - const size_t bufsize = sizeof(double)*Nx*Ny*Nz; - devicePtr d_Aispc0 = deviceMalloc(bufsize); - devicePtr d_Aispc1 = deviceMalloc(bufsize); - devicePtr d_vsq = deviceMalloc(bufsize); - devicePtr d_coeff = deviceMalloc(4*sizeof(double)); + /*******************/ + createContext(); + /*******************/ + + const size_t bufsize = sizeof(double)*Nx*Ny*Nz; + devicePtr d_Aispc0 = deviceMalloc(bufsize); + devicePtr d_Aispc1 = deviceMalloc(bufsize); + devicePtr d_vsq = deviceMalloc(bufsize); + devicePtr d_coeff = deviceMalloc(4*sizeof(double)); - InitData(Nx, Ny, Nz, Aispc, vsq); + InitData(Nx, Ny, Nz, Aispc, vsq); - // - // Compute the image using the ispc implementation on one core; report - // the minimum time of three runs. - // - double minTimeISPC = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aispc[0], Aispc[1]); - double dt = get_elapsed_mcycles(); - minTimeISPC = std::min(minTimeISPC, dt); - } + // + // Compute the image using the ispc implementation on one core; report + // the minimum time of three runs. + // + double minTimeISPC = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = get_elapsed_mcycles(); + minTimeISPC = std::min(minTimeISPC, dt); + } - printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); - - InitData(Nx, Ny, Nz, Aispc, vsq); + printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); - memcpyH2D(d_Aispc0, Aispc[0], bufsize); - memcpyH2D(d_Aispc1, Aispc[1], bufsize); - memcpyH2D(d_vsq, vsq, bufsize); - memcpyH2D(d_coeff, coeff, 4*sizeof(double)); - // - // Compute the image using the ispc implementation with tasks; report - // the minimum time of three runs. - // - double minTimeISPCTasks = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq, - (double*)d_Aispc0, (double*)d_Aispc1); - double dt = get_elapsed_mcycles(); - minTimeISPCTasks = std::min(minTimeISPCTasks, dt); - } - memcpyD2H(Aispc[1], d_Aispc1, bufsize); - //memcpyD2H(Aispc[1], d_vsq, bufsize); + InitData(Nx, Ny, Nz, Aispc, vsq); - printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); + memcpyH2D(d_Aispc0, Aispc[0], bufsize); + memcpyH2D(d_Aispc1, Aispc[1], bufsize); + memcpyH2D(d_vsq, vsq, bufsize); + memcpyH2D(d_coeff, coeff, 4*sizeof(double)); + // + // Compute the image using the ispc implementation with tasks; report + // the minimum time of three runs. + // + double minTimeISPCTasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq, + (double*)d_Aispc0, (double*)d_Aispc1); + double dt = get_elapsed_mcycles(); + minTimeISPCTasks = std::min(minTimeISPCTasks, dt); + } + memcpyD2H(Aispc[1], d_Aispc1, bufsize); + //memcpyD2H(Aispc[1], d_vsq, bufsize); - InitData(Nx, Ny, Nz, Aserial, vsq); + printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); - // - // And run the serial implementation 3 times, again reporting the - // minimum time. - // - double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aserial[0], Aserial[1]); - double dt = get_elapsed_mcycles(); - minTimeSerial = std::min(minTimeSerial, dt); - } + InitData(Nx, Ny, Nz, Aserial, vsq); - printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minTimeSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aserial[0], Aserial[1]); + double dt = get_elapsed_mcycles(); + minTimeSerial = std::min(minTimeSerial, dt); + } - printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", - minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); - // Check for agreement - int offset = 0; - int nerr = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", + minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); - double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / - Aserial[1][offset]); - if (error > 1e-3) - { - if (nerr < 100) - printf("Error @ (%d,%d,%d): ispc = %g, serial = %g error= %g\n", - x, y, z, Aispc[1][offset], Aserial[1][offset], error); - nerr++; - } - } + // Check for agreement + int offset = 0; + int nerr = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { - fprintf(stderr, " nerr= %d frac= %g \n", nerr, 1.0*nerr/(1.0*Nx*Ny*Nz)); - - /*******************/ - destroyContext(); - /*******************/ + double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / + Aserial[1][offset]); + if (error > 1e-3) + { + if (nerr < 100) + printf("Error @ (%d,%d,%d): ispc = %g, serial = %g error= %g\n", + x, y, z, Aispc[1][offset], Aserial[1][offset], error); + nerr++; + } + } - return 0; + fprintf(stderr, " nerr= %d frac= %g \n", nerr, 1.0*nerr/(1.0*Nx*Ny*Nz)); + + /*******************/ + destroyContext(); + /*******************/ + + return 0; } diff --git a/stdlib.ispc b/stdlib.ispc index 2c6c3de0..b6be0cdb 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -63,47 +63,56 @@ // CUDA Specific primitives // #define CUDABLOCKSIZE 128 +#define WARPSIZE2 5 +#define WARPSIZE (1<> WARPSIZE2)) + (__tid_x() >> WARPSIZE2); } +/***************/ __declspec(safe,cost0) static inline uniform int blockIndex1() { return __ctaid_y(); } +/***************/ __declspec(safe,cost0) static inline uniform int blockIndex2() { return __ctaid_y(); } - +/***************/ __declspec(safe,cost0) static inline uniform int blockCount0() { - return __nctaid_x(); + return __nctaid_x() * (CUDABLOCKSIZE >> WARPSIZE2); } +/***************/ __declspec(safe,cost0) static inline uniform int blockCount1() { return __nctaid_y(); } +/***************/ __declspec(safe,cost0) static inline uniform int blockCount2() { return __nctaid_z(); } -__declspec(safe,cost0) - static inline uniform int warpSize() -{ - return __warpsize(); -} -__declspec(safe,cost0) - static inline uniform int laneIndex() -{ - return __tid_x() & (warpSize()-1); -} /////////////////////////////////////////////////////////////////////////// // Low level primitives