From 426afc7377198bc12a3b39cf82ec3d4ac80aaf16 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 8 Nov 2013 10:00:49 +0100 Subject: [PATCH] added workable .cu files for stencil & mandelbrot --- examples/mandelbrot_tasks3d/Makefile | 2 +- .../mandelbrot_tasks3d/mandel_cu.cpp | 402 ++++++++++++++++++ .../mandelbrot_tasks3d/mandel_task_cu.cu | 4 +- .../mandelbrot_tasks3d/mandelbrot_task.ispc | 31 +- examples_cuda/stencil/a.out | Bin 27015 -> 0 bytes examples_cuda/stencil/stencil.cu | 4 +- examples_cuda/stencil/stencil.cubin | Bin 3156 -> 3220 bytes examples_cuda/stencil/stencil.ispc | 66 ++- examples_cuda/stencil/stencil_cu.cpp | 261 +++++++----- stdlib.ispc | 35 +- 10 files changed, 645 insertions(+), 160 deletions(-) create mode 100644 examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp delete mode 100755 examples_cuda/stencil/a.out diff --git a/examples/mandelbrot_tasks3d/Makefile b/examples/mandelbrot_tasks3d/Makefile index 3dd44d65..ad1a9b3a 100644 --- a/examples/mandelbrot_tasks3d/Makefile +++ b/examples/mandelbrot_tasks3d/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks3d CPP_SRC=mandelbrot_tasks3d.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks3d.ispc -ISPC_IA_TARGETS=avx,sse2,sse4 +ISPC_IA_TARGETS=avx ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp new file mode 100644 index 00000000..0f7f0884 --- /dev/null +++ b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp @@ -0,0 +1,402 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include "../timing.h" + +#include +#include +#include +#include +#include "drvapi_error_string.h" + +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) +// These are the inline versions for all of the SDK helper functions +void __checkCudaErrors(CUresult err, const char *file, const int line) { + if(CUDA_SUCCESS != err) { + std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" + << getCudaDrvErrorString(err) << "\" from file <" << file + << ", line " << line << "\n"; + exit(-1); + } +} +extern "C" +void mandelbrot_ispc( + float x0, float y0, + float x1, float y1, + int width, int height, + int maxIterations, int output[]) ; + + +/**********************/ +/* Basic CUDriver API */ +CUcontext context; + +void createContext(const int deviceId = 0) +{ + CUdevice device; + int devCount; + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&devCount)); + assert(devCount > 0); + checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + std::cout << "Using CUDA Device [0]: " << name << "\n"; + + int devMajor, devMinor; + checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); + std::cout << "Device Compute Capability: " + << devMajor << "." << devMinor << "\n"; + if (devMajor < 2) { + std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; + exit(1); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); +} +void destroyContext() +{ + checkCudaErrors(cuCtxDestroy(context)); +} + +CUmodule loadModule(const char * module) +{ + CUmodule cudaModule; + checkCudaErrors(cuModuleLoadData(&cudaModule, module)); + return cudaModule; +} +void unloadModule(CUmodule &cudaModule) +{ + checkCudaErrors(cuModuleUnload(cudaModule)); +} + +CUfunction getFunction(CUmodule &cudaModule, const char * function) +{ + CUfunction cudaFunction; + checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function)); + return cudaFunction; +} + +CUdeviceptr deviceMalloc(const size_t size) +{ + CUdeviceptr d_buf; + checkCudaErrors(cuMemAlloc(&d_buf, size)); + return d_buf; +} +void deviceFree(CUdeviceptr d_buf) +{ + checkCudaErrors(cuMemFree(d_buf)); +} +void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size)); +} +void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); +} +#define deviceLaunch(func,nbx,nby,nbz,params) \ + checkCudaErrors( \ + cuLaunchKernel( \ + (func), \ + ((nbx-1)/(128/32)+1), (nby), (nbz), \ + 128, 1, 1, \ + 0, NULL, (params), NULL \ + )); + +typedef CUdeviceptr devicePtr; + + +/**************/ +#include +std::vector readBinary(const char * filename) +{ + std::vector buffer; + FILE *fp = fopen(filename, "rb"); + if (!fp ) + { + fprintf(stderr, "file %s not found\n", filename); + assert(0); + } +#if 0 + char c; + while ((c = fgetc(fp)) != EOF) + buffer.push_back(c); +#else + fseek(fp, 0, SEEK_END); + const unsigned long long size = ftell(fp); /*calc the size needed*/ + fseek(fp, 0, SEEK_SET); + buffer.resize(size); + + if (fp == NULL){ /*ERROR detection if file == empty*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); + exit(1); + } + else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); + exit(1); + } +#endif + fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); + return buffer; +} + +extern "C" +{ +#if 0 + struct ModuleManager + { + private: + typedef std::pair ModulePair; + typedef std::map ModuleMap; + ModuleMap module_list; + + ModuleMap::iterator findModule(const char * module_name) + { + return module_list.find(std::string(module_name)); + } + + public: + + CUmodule loadModule(const char * module_name, const char * module_data) + { + const ModuleMap::iterator it = findModule(module_name) + if (it != ModuleMap::end) + { + CUmodule cudaModule = loadModule(module); + module_list.insert(std::make_pair(std::string(module_name), cudaModule)); + return cudaModule + } + return it->second; + } + void unloadModule(const char * module_name) + { + ModuleMap::iterator it = findModule(module_name) + if (it != ModuleMap::end) + module_list.erase(it); + } + }; +#endif + + void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment) + { +#if 0 + fprintf(stderr, " ptr= %p\n", *handlePtr); + fprintf(stderr, " size= %d\n", (int)size); + fprintf(stderr, " alignment= %d\n", (int)alignment); + fprintf(stderr, " ------- \n\n"); +#endif + return NULL; + } + void CUDALaunch( + void **handlePtr, + const char * module_name, + const char * module_1, + const char * func_name, + void **func_args, + int countx, int county, int countz) + { + assert(module_name != NULL); + assert(module_1 != NULL); + assert(func_name != NULL); + assert(func_args != NULL); +#if 1 + const char * module = module_1; +#else + const std::vector module_str = readBinary("kernel.cubin"); + const char * module = &module_str[0]; +#endif +#if 1 + CUmodule cudaModule = loadModule(module); + CUfunction cudaFunction = getFunction(cudaModule, func_name); + deviceLaunch(cudaFunction, countx, county, countz, func_args); + unloadModule(cudaModule); +#else + fprintf(stderr, " handle= %p\n", *handlePtr); + fprintf(stderr, " count= %d %d %d\n", countx, county, countz); + + fprintf(stderr, " module_name= %s \n", module_name); + fprintf(stderr, " func_name= %s \n", func_name); +// fprintf(stderr, " ptx= %s \n", module); + fprintf(stderr, " x0= %g \n", *((float*)(func_args[0]))); + fprintf(stderr, " dx= %g \n", *((float*)(func_args[1]))); + fprintf(stderr, " y0= %g \n", *((float*)(func_args[2]))); + fprintf(stderr, " dy= %g \n", *((float*)(func_args[3]))); + fprintf(stderr, " w= %d \n", *((int*)(func_args[4]))); + fprintf(stderr, " h= %d \n", *((int*)(func_args[5]))); + fprintf(stderr, " xs= %d \n", *((int*)(func_args[6]))); + fprintf(stderr, " ys= %d \n", *((int*)(func_args[7]))); + fprintf(stderr, " maxit= %d \n", *((int*)(func_args[8]))); + fprintf(stderr, " ptr= %p \n", *((int**)(func_args[9]))); + fprintf(stderr, " ------- \n\n"); +#endif + } + void CUDASync(void *handle) + { + checkCudaErrors(cuStreamSynchronize(0)); + } + void ISPCSync(void *handle) + { + } + void CUDAFree(void *handle) + { + } +} + +/********************/ + + +extern void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]); + +/* Write a PPM image file with the image of the Mandelbrot set */ +static void +writePPM(int *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + // Map the iteration count to colors by just alternating between + // two greys. + char c = (buf[i] & 0x1) ? 240 : 20; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); + printf("Wrote image file %s\n", fn); +} + + +static void usage() { + fprintf(stderr, "usage: mandelbrot [--scale=]\n"); + exit(1); +} + +int main(int argc, char *argv[]) { + unsigned int width = 1536; + unsigned int height = 1024; + float x0 = -2; + float x1 = 1; + float y0 = -1; + float y1 = 1; + + if (argc == 1) + ; + else if (argc == 2) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + if (scale == 0.f) + usage(); + width *= scale; + height *= scale; + // round up to multiples of 16 + width = (width + 0xf) & ~0xf; + height = (height + 0xf) & ~0xf; + } + else + usage(); + } + else + usage(); + + /*******************/ + createContext(); + /*******************/ + + int maxIterations = 512; + int *buf = new int[width*height]; + + for (unsigned int i = 0; i < width*height; i++) + buf[i] = 0; + const size_t bufsize = sizeof(int)*width*height; + devicePtr d_buf = deviceMalloc(bufsize); + memcpyH2D(d_buf, buf, bufsize); + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, (int*)d_buf); + double dt = get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + memcpyD2H(buf, d_buf, bufsize); + deviceFree(d_buf); + + printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); + writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minSerial = 1e30; + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } + + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); + writePPM(buf, width, height, "mandelbrot-serial.ppm"); + + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); + + return 0; +} diff --git a/examples_cuda/mandelbrot_tasks3d/mandel_task_cu.cu b/examples_cuda/mandelbrot_tasks3d/mandel_task_cu.cu index 33003897..3252fad1 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandel_task_cu.cu +++ b/examples_cuda/mandelbrot_tasks3d/mandel_task_cu.cu @@ -1,8 +1,8 @@ #include -#define blockIndex0 (blockIdx.x) +#define blockIndex0 (blockIdx.x*4 + (threadIdx.x >> 5)) #define blockIndex1 (blockIdx.y) #define vectorWidth (32) -#define vectorIndex (threadIdx.x & (vectorWidth-1)) +#define vectorIndex (threadIdx.x & 31) int __device__ __forceinline__ mandel(float c_re, float c_im, int count) diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.ispc b/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.ispc index 7ba09a2e..1b6e1040 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.ispc +++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_task.ispc @@ -1,13 +1,10 @@ #ifdef __NVPTX__ -#define blockIndex0 blockIndex0() -#define blockIndex1 blockIndex1() -#define vectorWidth warpSize() -#define vectorIndex laneIndex() -#else -#define blockIndex0 taskIndex0 -#define blockIndex1 taskIndex1 -#define vectorWidth programCount -#define vectorIndex programIndex +#define taskIndex0 blockIndex0() +#define taskIndex1 blockIndex1() +#define taskCount0 blockCount0() +#define taskCount1 blockCount1() +#define programCount warpSize() +#define programIndex laneIndex() #endif #if 0 @@ -46,23 +43,25 @@ mandelbrot_scanline( uniform int xspan, uniform int yspan, uniform int maxIterations, uniform int output[]) { - const uniform int xstart = blockIndex0 * xspan; + if (taskIndex0 >= taskCount0) return; + if (taskIndex1 >= taskCount1) return; + + const uniform int xstart = taskIndex0 * xspan; const uniform int xend = min(xstart + xspan, width); - const uniform int ystart = blockIndex1 * yspan; + const uniform int ystart = taskIndex1 * yspan; const uniform int yend = min(ystart + yspan, height); -// assert(xspan >= vectorWidth); for (uniform int yi = ystart; yi < yend; yi++) - for (uniform int xi = xstart; xi < xend; xi += vectorWidth) + for (uniform int xi = xstart; xi < xend; xi += programCount) { - const float x = x0 + (xi + vectorIndex) * dx; + const float x = x0 + (xi + programIndex) * dx; const float y = y0 + yi * dy; const int res = mandel(x,y,maxIterations); - const int index = yi * width + (xi + vectorIndex); - if (xi + vectorIndex < xend) + const int index = yi * width + (xi + programIndex); + if (xi + programIndex < xend) output[index] = res; } } diff --git a/examples_cuda/stencil/a.out b/examples_cuda/stencil/a.out deleted file mode 100755 index db6400a6db1157dff6d30f8fa7c3d928b6ea0040..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 27015 zcmchA4SZBZmhbI+V569B+@KMIUY%x4l!OEVCJ?kc>Bw#AXvPE$0(L^D1DVO!CLM$s z&^S#T?li6JEIZ50GW#;F@AAXGFs|o1n1V2$kBZ|{8q8U&KNa*)J^^tTp zX=mpBe)|gUJ#|iYTcDZ&w~Fao(O_Feox4D7Pwv4P0(zBz{+vouv#Fw0y;= zSe0v)Ym_OVB;&HO6oNb^VV5XcLZjeI1WclMf*>SVc|^c;jwupK2?w=&mEj~FvuOHZ%@ zBiu?rNHqajUw~eA+*?EOKiG+jpfw8aV3?vvDD!;}`AE*cHb^<=*4M3Bm^Zh+Cab=# zv28=vhWv$D3-hvDo3iII`OPShnz;1N<*aDgA6JNqrUTv};zYX!*LApN;hKs|#dR63 zt8h_B<3O1Kcr~u;aZ%UHaYmE{sxvl-u7vLKF^No<{a_NrO93y(Me@^-UWx1Pa9x6n zWKP8;!!$%@;!44Ftt7^Q^`OhPN3#4Y1fGokB$(7lMiD0&CZoRy`efxi8;8CrPQA5p z=Y>mVJ-*M>PIP?Q?=&m^Q*>U9Q0bRuv8$JL*jU@uFz%^OF-5aOgpU0v9HV*ya zIP~Adq36eu=NED4m&YmRG0>-Drc^nrkW*p0Ptb;Ivi6P&db_aS#7p5AOmZw#T=&&f zm#qhAe4s&bxz;u`HM&~8)h%9^i_+?OS~b!f6{W5kPm7SbqO`casnJtWy{6v7(Z4Dk`b91)V=9aofZ>_S{<87|1f&A2wkiE9Lt{#|sL$wQCvjKh4 z;AwC-Z&VssO$vIz)6$}}wbpw)%}T8(1w0K+>(P_m7Nj<{paiN)^hZN=9r|`-tGlVO zR#{qH?3#~ZEUv7y6**kF+4Gg9PDc??kn?!lH7|Q!_5vsboH%!%N$?qo_(!<>$E|iW zh4UrAGAAO(6!07HF9$kD_FBbN_T7ze8&eeNR}J9#Dd?1*gX@i_e*m+dtQ;13yqQ0P zTS!tq22O#>D~%|xZDt+ZQKIr?L^?&OJPJHLVLZKTSrwy`PZdi8hRb?Q=;XI$2u08%efUTOoz@mI zj7HFD4KN;rJ*B>sYZFQ{M$qM2h3Mu8I;~q|NQq>na(n5p-1&0ksJF^^tVn=f==w(7@28Q0Gub9$0;S#@_IP`)*T}@a$E%o367W zM)*3)IW!nXn0+_lq|!qJ48MbLQstp8h8Gb|Q)Fll!xs@w%dw$OhR-9MRD5VV!*3#- zRC}nM;WrRY!!Xp$@R@`g39n-K6@-%t50x?eQo>1fhpY@wAe>ZoD2L(aZUs)NI;1lE zB;ll@LuQ75MmVYFkizg|gqsK-xd6iK_XsDmFf_>U*9oWP+Ry;QUnZPXb*PKsJ%p2r z4((z1^MsRX4s|m8cZ8El4sB=n|0bMNaj2c)j}uNRIMmGWUlLBLH&n&&pAk+fH&n*( zpAt^0He_Y^j|nGJG?W88_k=?~bibw_(|n(dlvOzTy9RHwDq4T{2m}2&W~&nVXE1)X zeY*yo(EQ1cC90xzba@lP1572hM~1y?p-{`UEV6r=Z!|^Ik3W#Gx_2+?W0}5My^oT@ zgQn}+iME$8$=^Mi2!2iP(YimoO*0H=`hM-ug*N1Wo_Mv6*S#|TF_FK0$svfNw7Hfe zXZZ`w4pjT_@Hzrq+80RL?r#z`->^X|INWNfHC=~iCC%4m(DdYAAvK&DWc>c*$B4J! zaLcjK0*vhbBqwZtA}nz&Y|z(#(Twf zZ^!Gldu+>X70b)@3d?z?Znun>9(@DN(ZAL&1eS(v`j_2bT<6eVwVR#}8!nXOo<4-z zC*T_eI>irhN&+=W24AnC#4u7YZ0h)XG}7kVZ?G@+TF#p~%7Lz2ZChnq zZM)y*>J>_1yLTVzGRiK{9{&6Q@X+yZ!eN%6>02!AM0q2UvXv;^k(BL3c{-A^lPJFe zMSBi0gs$5(J*Y8F!G6R(P2XnO1G4rKOI7yl(*oNpRF1}mPEiJtVTWZmQS@glM0Dt( z=gA$vM7Mo(Bl$X!5}J=WAPcydEtE=gsmt`ZDOc-0rs`eJz!NN=Hnm6dZ?TZDnqF_o z(exR)i&e#`*H|)a&z%Rk-)@1HF_=cswwH{dhVi#jP7+x1nFXJb@;y&%{Ye%AFOhxB z4O6E4&{P|R=at|GXSQo{{j1YQiB4VG_6P;noG)Hp7AbVttsQcNmI|| ze?Z(&YpUq3H7$R+)^rbu)18(LQN^<6Of-)*2^HOq8f2TsTgX}lDr>>Fw2-w-G>SDa zs%3Lw2BSn^EgLFeEqf2Zv^6oZWhU0L)ppAUh>v?z-ZqVKm1FrYsvu7_wTGy-*7b(L_T+|1`?Au^kDMXyYT87Ev`tG?wO;9IGeu|Hj5r zeNr30eXQQNX|l#HL50x5Yyi{NFR?11#=ZD;ZA}r)V3TRr9M+8PzorQp|3gi%lZ<2u?#hhm%c+o2G~vG% zvL-wMVA}c+6ilcc?7+=XyEL0nyOP{6hGziUcL>G-`KMP{szNhPhe4*I_G!e4E+E^{ z4S#`oMph(r^6PN;l*v5%mnaUbvwjL#s$VG4$Kc%@-Jkv)^TSU4RPKr4G&togC_+(u zK{(uAbx7t0+Pa_fN5#ckfQiQAS9)oG_8VQ_d~GM*K1Apy>6;_6EQ?;P1$d( ze@6ept`FJuWBayB5m7oCu*y@f%tRcSKb}#qM<>Z<%jJngZ8{V*wqMIyF zQJ;y z^iw6Kr@yh8o_!dhbYZ?n*2IV%6NzrylVApI@0KtZKLAe}{J?NlsK)UW|jr2N^g z{~6#T{|+Vz5)YwR5(?CATxY32NE~9ub;$n|6M%~GzsLlj!UGkjM$xiDKqFv2;0RzI zpcya+Fa$UUFbyyxka`Nh0x@?M1@b;d1Jyv@D*)-Jj${kuodT8yjD+S`Y9Q|$5Y2(S zVSs5xfoX4`XN`cvfJuO-^l27=Z-AZ$>;)V}uTXme(_R7g6};Xy(JSxTu&%U&_=K(d zV?m zYP&$Wyn!eeKYahuO|^rX$r~Z%MNUHZ4#P$$=iSh-wcwS)|Y&s6b?ng*c!@RaDR> z%V?nR7GNr2^_FQOQQ_#EKs~Z-vYh8kojl7|92*2?_t#h~B>_vG!#%2rX%HMUC*?cb z=Ln}k4~|t5ewJ`j34vcAoGJd8d0M{a{)TXx6Ud(mewYOaDfxpZDi3Pz*J%x0Nwlvk zr-%_hTG&wP{zStZ%fOuOd%Psea%1p#szz=>bs4>&C_?V zG}78(1FCvC_}dkszV6b>0V*I#?XzvpIMmm@c|{M^pQi@LR`o%e)U>{1o3FkhI5r*Y zc`Ezpicf=YRel0lhDo3!5NLVYuz{*sV-Egm#X)KY<+(2nnRc;W2_9eZI#ua}$Y~_g zk4v&rZw!9D;^n>{FVCvD5tTta>WSdy%Aw$Svpr44B-_IK7ib1YRn{My#*-C^n|b;m zrf=V`k)FfxF5uJ;Yz$G1VC$4@tXQlT`mWz>UZto6yhgR}@#R@|Ces#{li@1KxW&>g zvjQh8fJc-aOdHVWo6IUjhQPrCUkQZ$Ao4=EXr5K^Eo-jw0bj%Olh5z zw%IJw+Bs*9S?#-_WbT|B`?^cvkk)ZGZ4Ll@*pMjL+d< zu#!L0fnkfK3k`y_+qmLgwCQO$ zOt9{>2W(>;u1Mf;BUQFDP?rd=^#%MHxoCnP;Mz~lUs z2S#G?w6vl?;W6-}0=^DtMk+Oir@~Jceg-Q1G%FW=8f6q(IQ4vB+RKn_6urKqD3IEN zF8_)KU;1gZ;B@z=v;u;c{uEx?3om^>V-ORw28w&O`AsrV-Q={d{>XsnyM%vo4!47_ zTAs9nhQ5y*(%!^_Eps z)0Wv(sS{%}XF349a3R68z_jU@6lx&-N&tA`X#|r3c~?TqXvP6Ea%_-e>2{!?W?+Fp za1Q)E!8{^khysOILDgv10yA?=;n*~e(H!>|TI>PKRd#fP#TF>M0$Hj!6MQ%o(8V$M zcfvY3h9v=E?Z7ndS4l_ujx}VB`*BiNBBbZTrgRDCO}Zk8Fl(CoAnj_A`V!G%bh!UW z1kygn1+V!U5vl}XjuF1<2y9rN!~H5Hs7P?QM`*uGk)`BN);TDPWvvwCJe0+)BIO%| z;KWvFL}C39@B?y!4)<4b6+jCB>K>>231$KN9!HrfIe3+_0?W&Rb^a7tkFvB3Ml-WQ zbEr^q_*KN)y|H=p9IN7J<#ED7Wlo4feaG?```pXZ-Z7APokA?M^JwLXzGGQgg&E`P z6#uUA1;9#L0PJEz)92oNb)SJ1|C<#fecdH1y2&lGl|&En&xL94XHsSKt)b!R>t0?N zBKMw%GT=E-W$^ba!ogQ7&p@xw^EuV$E=lWSi+~@QgB>f5Q2De7aML2-3BCw8y5cy< z?>q5C#TEgm&60to!TV@EJQkIW4(vPDmSxXZ8P2+ca3ja9tP6m{aTC8Park?}6~c*M z;4~l_Oy;1Zi*g7jI#+>-@(Cw8X$dPw@OP11k5BNmOB@S|F{}cr7_$_LG83~3*S(D` zE&JBAWvwzYl&fl6)+UZt5i?S2ILb5|Xgx=n_5;PDgo^4EVu{ptfzpztXGzv7RzG(< z44+HF!Ra!#C0R9;4kz46!uMp&O`nS)JVB%T1>H^re^q&g)`O=&?!u6uS-9gN3;b*a z(8VHHGbkt#gM1oO07{R(XE^dhWtIcr0zb@6UP4kx-{rUdpBs{Ej zEurm$wv6D}^<5esEi;^OfvOYkE6oYiY=<~TeU62@rQ!5%;|Udh31NMG`77zA1Cz3r zLblvbirC-qeZG!-{VG}tRU%S>ddl=u`j`ffS_P4mf}^GYeaPa(ll)=e^nkyc!6z)8 zfF+rqIs==a7m%E!J3x=ZPK!G3K$9#^J^WG?Sf5vML#Kr;!liX&eHiG8Q2@yMX@u+Vs@h+v@c@U1}) zjGF$9=D%6PyRVM#La5M{(8b(?&+lVk##i$v(7VXg5zX|U^YH%1^nXTJw$G^v zz8+(^+jrKw>D_%+lo0yx810M&`%93hDE%cG2TlT7pu>!+G`)k~xb&BlDZotkcA2J^ zb421LEMYH(G6G|EUFAr44)K&0UUpy34A&B0Cpt-r&F(kUQ;ECVPx z02KN=%p$f=A)#+#LUS&?BZHWeDFKpFmi-<1xC6V{j`2{jDBNU8XLpO;?@@{aNrife zQiAt}8%uIeY+oA4n`75Ux<^xdqejz~Um&aP;ZMW2p#=9~&2K+yM0d8C_R*JPq1y(j zzd*xwZL)?%m+=mL8g$KRG2fv(bBwp^^*Lr>=|#X&ZQ&r|D#g&| z>z~^7b5Mk<;Y7s;WdX%zi3^!|rb<%c2rN2Fj&PF$xdFBKG%mY&S%3Z;O$ zk9scD2R^b#w0%kDNX3iU+ippjxM2p`+;bIbPFVV=`Oof(R(1wM??g{iXxtGM`5`K zL0!hE$xmIc()(`HmJ3ML+_Ux&9wc=|PcNr1>5M;=^^*S@(&hE3bGaIeF+ zAFtV$v>?f8IFoz&`R%z@^v%1WDr6&(=;0_(Y*?JU5-giub@~^az81y!lfwYc%n^;= zqx(j0d*B|Yf62$7aYf?t_LgRzfz(d@v{OG9Dn*fw?r>sh$7!$8sho>@C zM<59sNXd^=*$?+}m7BE-7<#fl`5je-7z!p?pURAHujZe1D^m6T(4SZ#AHK8zT{V0q z=-&lBHyrw-psxZQHW9tG1oiU)J#{o3MoZ4mVkP_roDk=wA5v$(r|I3`oj%Un4_@(p zyPQl##}U((4w40m^P?%4LWv|T6NY3^^S{96D-2_DXfEWSI+x;e6Tp>2*MhLJcb~V&s=PE0x990|*ynzV zSm--UX6`@fbAa+PoBpPz=UE(j*x|p9+R$+|xI;IM;oY6Ib=za?TQ3s{tix#FgjmA9 zg_%6PlQmxp__%rVG4obDVcv>i-il$~iecW0VcvZ0C>l|iw_=#LVwg7{H*dZWOYw2@ z<~zVKZr*&mImXSKZ^FF!Sa08#$@JPx-Zhau`K(2Y_J^qb((vbhP|Ly zNPf*-?9}{umdv;Dm541+VF^3k=bQ#S%O5-Qp40uA^ZJn9V>r3E!e3_E_6TM0Cs}NU zzS`}9I)nZLt)J_JZ_I28>_D=w`*P+2-x#zN^w~_MeNOj?!*}6=*54mSaG3785Sn)c zgYAA5kJ#{YLV2U3eqAQ(9lsZMESpHf9lXL*!@fFm_x($;yah_kCXk*S*d>StTdZG z0>{veN7q?cWDWenc4ible3lW<>a)^mlV7DZ7(B5V9@$77XU^L6L%%q4K~J?{6A15X zPQ!@X^+Vx`S!4x9Y~gJdnpQAZFQltV=!V|y z?Er15oa6h9!~5dUdtlk_^jD2Ku$UpOabV5Q6r!NpwB_HB>R)kQ^L3wh;DJ=o+oo&2 zTIR3E_e`|Lc;^DTn|ZuaGyr3= zOh*#?UJJ&n4#mT$UBSlY!F>etrHv=QhUDlMc$)txjQHF7*A9J*_sJeqOk+VVsifodgDXn|zo&k725Ky+cKsCjOMEDh>hM2H-RoHV zEKSr>KWHrR-~U%lJS+lW0}&*bmeSDo0FPz9K?F<)~@PDHcmJ zZ5iPttVuton6FFKeEq5xK8O+?ItYAZs2vG-)P#Nb2{#Xqja!EwLj)Dwe)Z_uw+W{ZElgU;A6(@G>1VTKYN+`-i$nDjjd+(6Vl{3@fa& z2JiK_CwQ;GJyFx&Y0K9N(!AF)T}EUESL%MP;5rcXcZRQ~W@vi~g1=BtSd9gCZhU^3=aPNHQ z&^b#ngn@BF4JRBPCzNx-;5ea( z6GG#JTuxw$%$A2LCyb7Bm^gu+EP1sT&~~gSX|+UzDJzFQ2W4f%Gjp^(C+&leHnTOO z%jj6_GJEa#?yuvB_cG023{^>V_#ZN2t{ENrQowuR;p_2Go%tEQ2+@3lqgrOa=D%9g zzX+|Ny7kq{&=HakUf)>ipR4JIotYtefUvy?p&(V4(^N8;dqR7T7D5QuIShTKpLM|@ z;3M57_$1f#Pd&ySFnv_o@rf7PID=Yu2y~dBe?S3u1PYG>C?PkiE~sIj$?VMh%%Q*S z=)RDMXN<2YSu=##^QCu+=AWwRa0OpFA>&K0Lx%&KzJ~umtF_Ein*LtsKS0#FhmqGl z-=GCn1(ni{6ZG`*W$0fIVX>yqb?7c5Qjg=)xTTnCQKmoiOJtw$p?|r5Z_@(h=c$L2 z{l(|?L@iJZPlvvZ>eusHX0P3!iwQ(sdkLzr>ut)=yVNw^wKuY^Mdyqr*nU6JwV{Zv zz0F|=vabCMU3+Up*A7tw(X|+u5a{UI$51f3_P5k`AtQAN7LvSckMXWeK-WG?$>Uv{ z~AOSSR1l$XFDdzwRm-; z^(X&;-XJ0D+ddz5f&BUNmADPvi_*jp$j^_Kmt!}~>O>PNu?sW;n2&yY20Md$q3|!# z9?>VJr+Z2Yx@phoD^eEC^k@Z~9Ny|&5y(?)dLMQ$@EBmz-?HJw4nE?TVmoux^fWT+ zUzX^HZD-y;^P%m`;S$3Ml)4+6C0IUuLIn?M?$f38{{&60PJCP7bRWZr#C^=NBBaLTRk%Dz6RRh_k!wL5owZ(YNZR%J^or+3!`sfQKEpd~j%Z{#i#*VJp z!`J&r`NTk7Gm38cN;6bgLIqyvSjR3r3`Bg($Cf`6`vVjDiY1;_a0XqU2ZhWhJt`#C zwA%4DX*pl8t{VGmqQ!F^7Iy>mEyV=lF+I%vU;JpJUSrTTgcVOItYgeSC$9-((E8e5T=T zF}J8reEpQ>gl$!wXdT`t;3YgPc)oOr{AV7Ojw&pT*soD~jW9L7(Xcm_?|1u#jrd)P zpEkh4ZGT2q`tb(;?Mmp6uR<13deMz>feaf|X198?>aA}5LF?*8 zQ>U(+ojt#Hwc1cuUtibMsJb`0>piWPPVnRBmUY$j-@?f&o{Eq$J9}R32DP=>g?J%Dwjef$MdTvrZfhy^ zG!(Vf)_PjhLN#}xlAS%5b>7C-YZ&Bzoz z!#Xqi_rCON-$m+d3>NRaNm%$#f;m%5TcdYv)~)oz;kw4PR1toBTlYgxMy7hJnsZV0 zl9_zkYt)1bPm8&P6o4bmH%RP?1MTV2uA z#^`m8^rPrjk4lBCudZ+Nw2~whw(@1J?=3G`Ug9d&mfyL|RqnX2L`4~@^RA_?$y2>e z>H|$JKTxYNd|>pht7}#3>l*63)g)%Ns+Kpkv~0XyRZH+QWAt3M>~Hq7Y!FoTb8g-% zG`n1+9@(>N#ITIBq5rCz*P$7agEpQj#ul9!naswNI-Z7&Cvb(UrvWT!^g_Wn0iVVj zu1Q=>mU=p_nwP_2dKRvMxuU02@9}V$&PwR}Q#kw#!J%+C6_ddGSvX9qj=s->0hsztIQ$8q z1@JuKsGEY}hj%N64Q9hN(=IV?N4a*;(sAuM7!IE$hN2YnV+r<@v_}(*Qq(|FQA)

^7p$WD+R+3%bctwmn6~=1BtRdc}TP18u=LgLH+ZU zhL<@cD8z-(5FHK3x2IJ3F4;CEko;)UV~LM|kMda1AI4A74}c^;^4L;Tdy3VU^l0K^ zU_zksok$^h^J&!9b6b=_tyHv+y?Xp4GsCH4k=Sj{#hr?5Zyi_lhM|w93`aObvQ6Ejx4$4RU zszML%`Wy3YLB8G4_mlDR#f%B6V*2?a^5sC^e+|g>3X)E)pNWBli}Z<#VfvYb{@scB z@*xqJe#R}wp3-he*q@RQ&7V}=*?3lLnu&${OWWnav_i_Wy(13Ws*V>;x!TP3?SJu1vvXjF??`X zEwmCk9*9O^oFF1HZ!G+2Ogg(s z0T&8*yMXrySS#Rq0Ur_Y=K?+{;C~CaU%#wlLG#?fcphxl|sZ41E)u2s(IOY*|(_ka&qV8%+H;#X55YCkyh>H z^sM=rbmqte?=3hMMagd6*x;>R1L$qxaGi`{HR)+-Rf#jz$r2Isq;KoICnP1w&G+_XxJ(Ts!{IdNGG5Mu^)kOYIQ84AFevZxm z0BF=F(jQ2F5+Y^#{r@W_e^rT<<)|v*8R=OgGRX0hwEsXPvVK~QQ}{|_K&j+l{2p8X3rL~-vKo0# zqC7t_cKl=O?}^DzCvs3&=U`y`+$D}p=%2(7A`)3Yoyb99gUBCSUo8F_cu9Aq{L&xG za~!4q$@P!SFZJjhBcPM~IU-#GnSY5S2uQpX zB&26>SvN}%w_l0+*)wlMg3KpLgvW@;lG2t74q-=T0kz|j{U-HtDrl3{pH|KDH^~ef z9hY06-wG0mJHdJp51o4^{bXb!orGphutt(toX$Rzel0RF5ne(1t7!TZMf#y=`X%G* z-)Oo~k$xkZK6QM(9ZkP<{J9~T4x=yacSJ%WjJdSS5oEOXaz)zPXnKkw?PxUJG=9!$ zG~KL7yBAHL4m)OzB(ZoRPBN5sDl#zs`t769SIYu`tBQbi?&*&Gomkw~EdfTm5#3sTJqLcMAz2zqPH^j(K z148un82Tk>_+;gzk|{yG16Ns*6FU*Y5U&?>`l6r0g!-QzXt5boxe_0^2hJq&-k-7ky1lA%oo&M`5IsY!`pU3b&1NvnB-w!(VLz&pG zVS8Pu|0`C`#q;`7=2WgwoH6=#O&oeQ=!Qw`$t|F(Xl1N@C<6To)&tSE$~bh|^ST1^ z(p{mnS^FBpBAV!er% zPkXc!&c@+S!Gd$Lez+kHy$E!bl^A`yAN1)sv0D05RL4R?9R8nxZk|Nnc7Q&aJbM@& zc39}uhobxs;_#mWo%*>d-^w`Io;PBbVi8XIX`RdIw1-Y174+!|Y2)V_b1K@(_$4+0 z^WxC&5arN!D3Re_M*lmdIcA>HA?YONz#=PSXM5?0{f?DCO`&5FDd_C3C_RFnW3n=e zT(`c>=z=K%d?xt!2)j)mGEzvuX3Av!d==;<4}DQXK|Yrjf^LR?m+N+xh?~aB0e`lY zk$x)@z89zbwQ=Y_66KEwJ1L4$+T-v)3HoH`P(L4s|2(yfX-MmmDza@91s*48(zsjhdi4MSIT+Xe-jfp|CU z_0(i9%vm^Jp#)c5%?832D0~;Prmdl2qm-xQPCFCl_PcGRB`%4PI5Iv_3)`J!wTSN` z*t>k!ea;2!Y;2rh?VRsvtiegq?5l}vdao-H92f0rcDY<_jdk(=yXF_Wit#?#HNOOm ze+d(e(KXO(Q7OPzNW*_{?#!J>(w1**Ev2K)8Bd-A=bO`8YEbbypN=4R=H}J5cyOk;_*kNq(H79_cTY>X zd_B&H3#-{D649)A^vMGy-@;E?XZiCRE7-B?trNY+K3S0?nmbPnA{)0l8OoN`l;^nc zzpMtM+ft3QzGXA-y8ZU@k_vn-P~pi6Qj> 5)) __device__ static void stencil_step( int x0, int x1, diff --git a/examples_cuda/stencil/stencil.cubin b/examples_cuda/stencil/stencil.cubin index db1b1bca7c1d6730958b2611b10f4ee84930169f..5e8d9431465c12762f9fc2617af7ee718373025d 100644 GIT binary patch literal 3220 zcmeHJUrbw782?)S>=c+D>U5o6Sq(3jIar{iMxomMIV+j-v#Lt`?>_a-#mxzv>hqm}=+qwAb7xz((#G=CkXZKMGoax}q z#rQ}f8c%rlF#Ly{_Q>X8W!X0r9gbd*UETlLimEEGR%h34%9Pjc3e@Z=q)xhP&uXNi z+DXulnm3lr>j>CU9IyqT>PP^p4>y=b^|;3H`!hRy9eB%l2k2!y0e2aH05s$qcks2~ zD&uS5P#Iqhw2ZF;%%cQv$K6)S+n~CPH=s(#w^2-O;a8B~T;A56Rd9}lMk zG(`={9JRm+__Js2z@51BraC}7CiqW)<0c49bpki}mf}%7V6rQ2AMPb!>zo>m_;Iu@ zaXS>h{obx?9+daHd>U>BwvPUAm06zpaM6)}9=O0|iBH4L#MU!u3~v~u3A9?!VjF(h z1y?(0&x-gS&2@8>Ff$Mz{sQ-@oZ{O0NC#Rd`$YEC|P z)*}2g>AVH~zk>b`;Z}4`DY-yXgOM+QKaT!?MSES@>&ku$3wC+{@A z^iMfuILOBeH>+#g+h398FV|$Ao=rh*_r6}@k78-xyt9UQ?9aPv7*F(NhBBZA`5-NE z@2`nESpK6`X}6H(HdrGIKdE}Y%W`~4S^iG;DE1c;xOOycq#Qro4Z=DfOA+}_?^r+0R0TWhnb_)zQ^0<6>Rr`gHg-eD9L^ewA0J zUwcHFgegR!MlhV(|Hzu*-a-}2X^>OEyhx#j=FNf{4=&n)2kZ^b!vvONA+ zSk6CHZgbxSa|ZIRIr_^r zwvbsR;eeF)hTOOhCkmdhyGF#-rTuT$7DYcA`iyFX=GOM%gAJs3%}W> zq&O!`{rx{tZ&T+Lo=;DrT%1q0g#1AjAIJ0H?kb{i5I=}MfeUGI8})X-;MwHAQ33id z1J9;+b*IN^*;rDXMXmmcm?o|=aSLr-Hx(_#Rpx3z_M#T< EFDPGE_y7O^ literal 3156 zcmeHJUuc_E6hBGR?)K00OB^o6YmM~fTWn+KL_!HAlo~=ABlf9Ouu0dJZ0oYx#?isB zH&RQ5Oj}yS!lz52%9J>wRt@4L2nB^AKKNE2DxISbd)R{yd#J1DeD{9sm)QE^!(O~3 z-?``9-#O=Z?>*oB?(|D%qE3h7^f8dl3EzW5^+^;OrBbMuvbDAqA`pTad@@1?CR3A{ ziA-_`dzQR<$<#9=QyDalq|con8$>xim`P4vgh2XA;Cy=Y@)1LmspQ1S@DYRLX9hTT zIX#(4rZd4qfd9hj5H^pB<=}X7B6&%!>e0`39I63pcMfo~roHaKpn6ZC^k@h8NUa=j zc^Jk~eYdUOz`%*-h$8|u%@J^&Y%)QOxW{n+*?qkMf)#o_=oLBxUxj`QIO^_w|2pti z=(TX7LU(~yq1OQNu=zW&+U&XmTot-Wn(!?c2`A)w+KdP;&LD+@UbPM;?*GT5SphAd z*YbkZfx#eL9`=Lb!OENM0NzHrQ|EpFk!S}4`HAw2V3_t6FlvFPHgngny9+CM<_ltN zfmrtjITnFf*MhjoC|QtW(TMf1oSioS{5gvIahu@>Z#k)9cE}f5K;>E4y#cUV{>Hz< zu^1h9*V|;?*LH6>Ku0_9Q{>;G8>cY;D+LX`3xZ*MDlyL$HU23WCoul+h}RWfCp_Zr z;|b}(dHTt2AOSHa*cTbpZ?c!$DL>}7hB7#h69Rh<+kGgUP0n8KfYX%SuHoO<%@atL-`q8H^_Xg z8&8nm)$fdV0a*FlHU1>&ox(7szr+JjZdwSG43yqQE$7a;H?l0fB*%j~-`(5WTmNZX z`X_X`A1!N!Iscf>|D^H01jmC)Y|mv2?!1<(@7Lu%b%$hpjPrnS(U8nDT9ENW5pM4D zKtIQ?kJuKU=Vo$ZsUG8+`+Z?d`lrXF-_WqsJL4P=K(XBylX-(F8Ap^g;TAQ$?1qdp5Jx^ZpLxw#ruFOYk+^tF@s@SJ z9qp0$-WY$3=9N%<`a-;c{5LeQvL*yOdxkgD{6fkf*O%U#?a_Iew_B|v&dXfKuJ@(g z5QS_}%hyNM`gMjS4=YbsxHA7#NbViWDn#7`8kd~@^6&v2h2Vl(Wdj}myIomLVkNaw&Uu^0U_s0hY$M;8+i={mM zSc^gR?WOlS{%{&RuV)O3S8mDBIbqr#{DpRhI>+$*@}hmSq?Hf{bbkCo;}2_$X*?J1 zgn`xxyeN0~(C!SAF5aq^1o|%x&!YW1fIUt`DNA-E_6cuqbQQGj5O({oO8Lxg#D29u z9yaD(qY4Id3Ybs>bRVHVzT{@Zn}gXdN-?g~eX*nbEu6)Xio)JbOXVN_7O8s*O#XC( zs`k&IjZzUed%LF$!ZzW<{ON8+km_68m`Y^b)hIvyU#QMcesqno?&R35Xg8H+No)Nm ZC;3@CE1Xa%D?*mE`&DzT_OwLHGaw diff --git a/examples_cuda/stencil/stencil.ispc b/examples_cuda/stencil/stencil.ispc index f96ea8fa..d2e095b3 100644 --- a/examples_cuda/stencil/stencil.ispc +++ b/examples_cuda/stencil/stencil.ispc @@ -34,13 +34,14 @@ #ifdef __NVPTX__ #warning "emitting DEVICE code" #define taskIndex blockIndex0() +#define taskCount blockCount0() #define programIndex laneIndex() #define programCount warpSize() #else #warning "emitting HOST code" #endif -static void +static inline void stencil_step(uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int z1, @@ -50,29 +51,62 @@ stencil_step(uniform int x0, uniform int x1, const uniform int Nxy = Nx * Ny; // foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) +#if 0 +#define VER1 +#endif + +#ifdef VER1 + const uniform long x1o = 1; + const uniform long x2o = 2; + const uniform long x3o = 3; + const uniform long y1o = Nx; + const uniform long y2o = Nx*2; + const uniform long y3o = Nx*3; + const uniform long z1o = Nxy; + const uniform long z2o = Nxy*2; + const uniform long z3o = Nxy*3; +#endif for (uniform int z = z0; z < z1; z++) for (uniform int y = y0; y < y1; y++) - for (uniform int xb = x0; xb < x1; xb += programCount) { - const int x = xb + programIndex; - int index = (z * Nxy) + (y * Nx) + x; + const int index_base = (z * Nxy) + (y * Nx); + for (uniform int xb = x0; xb < x1; xb += programCount) + { + const int x = xb + programIndex; + int index = index_base + x; +#ifndef VER1 #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = coef[0] * A_cur(0, 0, 0) + + double div = coef[0] * A_cur(0, 0, 0) + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); +#else +#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] +#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + + A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + + A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + + coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + + A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + + A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + + coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + + A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + + A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); +#endif - if (x < x1) - A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - } + if (x < x1) + A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + } + } } @@ -83,6 +117,8 @@ stencil_step_task(uniform int x0, uniform int x1, uniform int Nx, uniform int Ny, uniform int Nz, uniform const double coef[4], uniform const double vsq[], uniform const double Ain[], uniform double Aout[]) { + if(taskIndex >= taskCount) return; + stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, Nx, Ny, Nz, coef, vsq, Ain, Aout); } diff --git a/examples_cuda/stencil/stencil_cu.cpp b/examples_cuda/stencil/stencil_cu.cpp index 3f06e841..9cdd1050 100644 --- a/examples_cuda/stencil/stencil_cu.cpp +++ b/examples_cuda/stencil/stencil_cu.cpp @@ -132,11 +132,12 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); } #define deviceLaunch(func,nbx,nby,nbz,params) \ + checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \ checkCudaErrors( \ cuLaunchKernel( \ (func), \ - (nbx), (nby), (nbz), \ - 32, 1, 1, \ + ((nbx-1)/(128/32)+1), (nby), (nbz), \ + 128, 1, 1, \ 0, NULL, (params), NULL \ )); @@ -144,6 +145,38 @@ typedef CUdeviceptr devicePtr; /**************/ +#include +std::vector readBinary(const char * filename) +{ + std::vector buffer; + FILE *fp = fopen(filename, "rb"); + if (!fp ) + { + fprintf(stderr, "file %s not found\n", filename); + assert(0); + } +#if 0 + char c; + while ((c = fgetc(fp)) != EOF) + buffer.push_back(c); +#else + fseek(fp, 0, SEEK_END); + const unsigned long long size = ftell(fp); /*calc the size needed*/ + fseek(fp, 0, SEEK_SET); + buffer.resize(size); + + if (fp == NULL){ /*ERROR detection if file == empty*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); + exit(1); + } + else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); + exit(1); + } +#endif + fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); + return buffer; +} extern "C" { @@ -155,15 +188,21 @@ extern "C" void CUDALaunch( void **handlePtr, const char * module_name, - const char * module, + const char * module_1, const char * func_name, void **func_args, int countx, int county, int countz) { assert(module_name != NULL); - assert(module != NULL); + assert(module_1 != NULL); assert(func_name != NULL); assert(func_args != NULL); +#if 1 + const char * module = module_1; +#else + const std::vector module_str = readBinary("kernel.cubin"); + const char * module = &module_str[0]; +#endif CUmodule cudaModule = loadModule(module); CUfunction cudaFunction = getFunction(cudaModule, func_name); deviceLaunch(cudaFunction, countx, county, countz, func_args); @@ -184,134 +223,134 @@ extern "C" extern void loop_stencil_serial(int t0, int t1, int x0, int x1, - int y0, int y1, int z0, int z1, - int Nx, int Ny, int Nz, - const double coef[5], - const double vsq[], - double Aeven[], double Aodd[]); + int y0, int y1, int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[5], + const double vsq[], + double Aeven[], double Aodd[]); void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { - int offset = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); - A[1][offset] = 0; - vsq[offset] = x*y*z / double(Nx * Ny * Nz); - } + int offset = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); + A[1][offset] = 0; + vsq[offset] = x*y*z / double(Nx * Ny * Nz); + } } int main() { - int Nx = 256, Ny = 256, Nz = 256; - int width = 4; - double *Aserial[2], *Aispc[2]; - Aserial[0] = new double [Nx * Ny * Nz]; - Aserial[1] = new double [Nx * Ny * Nz]; - Aispc[0] = new double [Nx * Ny * Nz]; - Aispc[1] = new double [Nx * Ny * Nz]; - double *vsq = new double [Nx * Ny * Nz]; + int Nx = 256, Ny = 256, Nz = 256; + int width = 4; + double *Aserial[2], *Aispc[2]; + Aserial[0] = new double [Nx * Ny * Nz]; + Aserial[1] = new double [Nx * Ny * Nz]; + Aispc[0] = new double [Nx * Ny * Nz]; + Aispc[1] = new double [Nx * Ny * Nz]; + double *vsq = new double [Nx * Ny * Nz]; - double coeff[4] = { 0.5, -.25, .125, -.0625 }; - - /*******************/ - createContext(); - /*******************/ + double coeff[4] = { 0.5, -.25, .125, -.0625 }; - const size_t bufsize = sizeof(double)*Nx*Ny*Nz; - devicePtr d_Aispc0 = deviceMalloc(bufsize); - devicePtr d_Aispc1 = deviceMalloc(bufsize); - devicePtr d_vsq = deviceMalloc(bufsize); - devicePtr d_coeff = deviceMalloc(4*sizeof(double)); + /*******************/ + createContext(); + /*******************/ + + const size_t bufsize = sizeof(double)*Nx*Ny*Nz; + devicePtr d_Aispc0 = deviceMalloc(bufsize); + devicePtr d_Aispc1 = deviceMalloc(bufsize); + devicePtr d_vsq = deviceMalloc(bufsize); + devicePtr d_coeff = deviceMalloc(4*sizeof(double)); - InitData(Nx, Ny, Nz, Aispc, vsq); + InitData(Nx, Ny, Nz, Aispc, vsq); - // - // Compute the image using the ispc implementation on one core; report - // the minimum time of three runs. - // - double minTimeISPC = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aispc[0], Aispc[1]); - double dt = get_elapsed_mcycles(); - minTimeISPC = std::min(minTimeISPC, dt); - } + // + // Compute the image using the ispc implementation on one core; report + // the minimum time of three runs. + // + double minTimeISPC = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = get_elapsed_mcycles(); + minTimeISPC = std::min(minTimeISPC, dt); + } - printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); - - InitData(Nx, Ny, Nz, Aispc, vsq); + printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); - memcpyH2D(d_Aispc0, Aispc[0], bufsize); - memcpyH2D(d_Aispc1, Aispc[1], bufsize); - memcpyH2D(d_vsq, vsq, bufsize); - memcpyH2D(d_coeff, coeff, 4*sizeof(double)); - // - // Compute the image using the ispc implementation with tasks; report - // the minimum time of three runs. - // - double minTimeISPCTasks = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq, - (double*)d_Aispc0, (double*)d_Aispc1); - double dt = get_elapsed_mcycles(); - minTimeISPCTasks = std::min(minTimeISPCTasks, dt); - } - memcpyD2H(Aispc[1], d_Aispc1, bufsize); - //memcpyD2H(Aispc[1], d_vsq, bufsize); + InitData(Nx, Ny, Nz, Aispc, vsq); - printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); + memcpyH2D(d_Aispc0, Aispc[0], bufsize); + memcpyH2D(d_Aispc1, Aispc[1], bufsize); + memcpyH2D(d_vsq, vsq, bufsize); + memcpyH2D(d_coeff, coeff, 4*sizeof(double)); + // + // Compute the image using the ispc implementation with tasks; report + // the minimum time of three runs. + // + double minTimeISPCTasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq, + (double*)d_Aispc0, (double*)d_Aispc1); + double dt = get_elapsed_mcycles(); + minTimeISPCTasks = std::min(minTimeISPCTasks, dt); + } + memcpyD2H(Aispc[1], d_Aispc1, bufsize); + //memcpyD2H(Aispc[1], d_vsq, bufsize); - InitData(Nx, Ny, Nz, Aserial, vsq); + printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); - // - // And run the serial implementation 3 times, again reporting the - // minimum time. - // - double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aserial[0], Aserial[1]); - double dt = get_elapsed_mcycles(); - minTimeSerial = std::min(minTimeSerial, dt); - } + InitData(Nx, Ny, Nz, Aserial, vsq); - printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minTimeSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aserial[0], Aserial[1]); + double dt = get_elapsed_mcycles(); + minTimeSerial = std::min(minTimeSerial, dt); + } - printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", - minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); - // Check for agreement - int offset = 0; - int nerr = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", + minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); - double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / - Aserial[1][offset]); - if (error > 1e-3) - { - if (nerr < 100) - printf("Error @ (%d,%d,%d): ispc = %g, serial = %g error= %g\n", - x, y, z, Aispc[1][offset], Aserial[1][offset], error); - nerr++; - } - } + // Check for agreement + int offset = 0; + int nerr = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { - fprintf(stderr, " nerr= %d frac= %g \n", nerr, 1.0*nerr/(1.0*Nx*Ny*Nz)); - - /*******************/ - destroyContext(); - /*******************/ + double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / + Aserial[1][offset]); + if (error > 1e-3) + { + if (nerr < 100) + printf("Error @ (%d,%d,%d): ispc = %g, serial = %g error= %g\n", + x, y, z, Aispc[1][offset], Aserial[1][offset], error); + nerr++; + } + } - return 0; + fprintf(stderr, " nerr= %d frac= %g \n", nerr, 1.0*nerr/(1.0*Nx*Ny*Nz)); + + /*******************/ + destroyContext(); + /*******************/ + + return 0; } diff --git a/stdlib.ispc b/stdlib.ispc index 2c6c3de0..b6be0cdb 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -63,47 +63,56 @@ // CUDA Specific primitives // #define CUDABLOCKSIZE 128 +#define WARPSIZE2 5 +#define WARPSIZE (1<> WARPSIZE2)) + (__tid_x() >> WARPSIZE2); } +/***************/ __declspec(safe,cost0) static inline uniform int blockIndex1() { return __ctaid_y(); } +/***************/ __declspec(safe,cost0) static inline uniform int blockIndex2() { return __ctaid_y(); } - +/***************/ __declspec(safe,cost0) static inline uniform int blockCount0() { - return __nctaid_x(); + return __nctaid_x() * (CUDABLOCKSIZE >> WARPSIZE2); } +/***************/ __declspec(safe,cost0) static inline uniform int blockCount1() { return __nctaid_y(); } +/***************/ __declspec(safe,cost0) static inline uniform int blockCount2() { return __nctaid_z(); } -__declspec(safe,cost0) - static inline uniform int warpSize() -{ - return __warpsize(); -} -__declspec(safe,cost0) - static inline uniform int laneIndex() -{ - return __tid_x() & (warpSize()-1); -} /////////////////////////////////////////////////////////////////////////// // Low level primitives