diff --git a/examples_cuda/options/Makefile_gpu b/examples_cuda/options/Makefile_gpu new file mode 100644 index 00000000..03c63d50 --- /dev/null +++ b/examples_cuda/options/Makefile_gpu @@ -0,0 +1,55 @@ +PROG=options_cu +ISPC_SRC=options.ispc +CXX_SRC=options_cu.cpp + +CXX=g++ +CXXFLAGS=-O3 -I$(CUDATK)/include +LD=g++ +LDFLAGS=-lcuda + +ISPC=ispc +ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math + +LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 +LLVM = $(HOME)/usr/local/llvm/bin-3.3 +PTXGEN = $(HOME)/ptxgen +PTXGEN += -opt=3 +PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1 + +LLVM32DIS=$(LLVM32)/bin/llvm-dis + +.SUFFIXES: .bc .o .ptx .cu _ispc_nvptx64.bc + + +ISPC_OBJ=$(ISPC_SRC:%.ispc=%_ispc.o) +ISPC_BC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.bc) +PTXSRC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.ptx) +CXX_OBJ=$(CXX_SRC:%.cpp=%.o) + +all: $(PROG) + + +$(CXX_OBJ) : kernel.ptx +$(PROG): $(CXX_OBJ) kernel.ptx + /bin/cp kernel.ptx __kernels.ptx + $(LD) -o $@ $(CXX_OBJ) $(LDFLAGS) + +%.o: %.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + + +%_ispc_nvptx64.bc: %.ispc + $(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc_nvptx64.bc -h `basename $< .ispc`_ispc.h $< --emit-llvm + +%.ptx: %.bc + $(LLVM32DIS) $< + $(PTXGEN) `basename $< .bc`.ll > $@ + +kernel.ptx: $(PTXSRC) + cat $^ > kernel.ptx + +clean: + /bin/rm -rf *.ptx *.bc *.ll $(PROG) + + + diff --git a/examples_cuda/options/options.cu b/examples_cuda/options/options.cu new file mode 100644 index 00000000..c14d6df3 --- /dev/null +++ b/examples_cuda/options/options.cu @@ -0,0 +1,150 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "options_defs.h" + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount (gridDim.x*4) +#define warpIdx (threadIdx.x >> 5) + +// Cumulative normal distribution function +// +__device__ +static inline float +CND(float X) { + float L = abs(X); + + float k = 1.0 / (1.0 + 0.2316419 * L); + float k2 = k*k; + float k3 = k2*k; + float k4 = k2*k2; + float k5 = k3*k2; + + const float invSqrt2Pi = 0.39894228040f; + float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + + -1.821255978f * k4 + 1.330274429f * k5); + w *= invSqrt2Pi * exp(-L * L * .5f); + + if (X > 0.f) + w = 1.0 - w; + return w; +} + +__global__ +void bs_task( float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + int first = taskIndex * (count/taskCount); + int last = min(count, (int)((taskIndex+1) * (count/taskCount))); + + for (int i = programIndex + first; i < last; i += programCount) + if (i < last) + { + float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; + + float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); + float d2 = d1 - v * sqrt(T); + + result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2); + } +} + +extern "C" +__global__ void +black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); + bs_task<<>>(Sa, Xa, Ta, ra, va, result, count); +} + +/********/ + + +__device__ +static inline float +binomial_put(float S, float X, float T, float r, float v) { + float V[BINOMIAL_NUM]; + + float dt = T / BINOMIAL_NUM; + float u = exp(v * sqrt(dt)); + float d = 1. / u; + float disc = exp(r * dt); + float Pu = (disc - d) / (u - d); + +#pragma unroll + for ( int j = 0; j < BINOMIAL_NUM; ++j) { + float upow = pow(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = max(0., X - S * upow); + } + +#pragma unroll + for ( int j = BINOMIAL_NUM-1; j >= 0; --j) +#pragma unroll + for ( int k = 0; k < j; ++k) + V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; + return V[0]; +} + + + +__global__ void +binomial_task( float Sa[], float Xa[], + float Ta[], float ra[], + float va[], float result[], + int count) { + int first = taskIndex * (count/taskCount); + int last = min(count, (int)((taskIndex+1) * (count/taskCount))); + + for (int i = programIndex + first; i < last; i += programCount) + if (i < last) + { + float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; + result[i] = binomial_put(S, X, T, r, v); + } +} + + +extern "C" __global__ void +binomial_put_ispc_tasks( float Sa[], float Xa[], + float Ta[], float ra[], + float va[], float result[], + int count) { + int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); + if (programIndex == 0) + binomial_task<<>>(Sa, Xa, Ta, ra, va, result, count); + cudaDeviceSynchronize(); +} diff --git a/examples_cuda/options/options_cu.cpp b/examples_cuda/options/options_cu.cpp new file mode 100644 index 00000000..0898d9fd --- /dev/null +++ b/examples_cuda/options/options_cu.cpp @@ -0,0 +1,184 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define NOMINMAX + +#include +#include +#include +#include +#include +#include +using std::max; + +#include "options_defs.h" +#include "../timing.h" + +#include "options_ispc.h" +using namespace ispc; +#include +#include "../cuda_ispc.h" + +static void usage() { + printf("usage: options [--count=]\n"); +} + + +int main(int argc, char *argv[]) { + int nOptions = 128*1024; + + for (int i = 1; i < argc; ++i) { + if (strncmp(argv[i], "--count=", 8) == 0) { + nOptions = atoi(argv[i] + 8); + if (nOptions <= 0) { + usage(); + exit(1); + } + } + } + + float *S = new float[nOptions]; + float *X = new float[nOptions]; + float *T = new float[nOptions]; + float *r = new float[nOptions]; + float *v = new float[nOptions]; + float *result = new float[nOptions]; + + for (int i = 0; i < nOptions; ++i) { + S[i] = 100; // stock price + X[i] = 98; // option strike price + T[i] = 2; // time (years) + r[i] = .02; // risk-free interest rate + v[i] = 5; // volatility + } + + /*******************/ + createContext(); + /*******************/ + devicePtr d_S = deviceMalloc(nOptions*sizeof(float)); + devicePtr d_X = deviceMalloc(nOptions*sizeof(float)); + devicePtr d_T = deviceMalloc(nOptions*sizeof(float)); + devicePtr d_r = deviceMalloc(nOptions*sizeof(float)); + devicePtr d_v = deviceMalloc(nOptions*sizeof(float)); + devicePtr d_result = deviceMalloc(nOptions*sizeof(float)); + + memcpyH2D(d_S, S, nOptions*sizeof(float)); + memcpyH2D(d_X, X, nOptions*sizeof(float)); + memcpyH2D(d_T, T, nOptions*sizeof(float)); + memcpyH2D(d_r, r, nOptions*sizeof(float)); + memcpyH2D(d_v, v, nOptions*sizeof(float)); + + double sum; + + // + // Binomial options pricing model, ispc implementation + // + const bool print_log = false; + const int nreg = 32; + double binomial_ispc = 1e30; +#if 0 + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + const double t0 = rtc(); + const char * func_name = "binomial_put_ispc"; + void *func_args[] = {&d_S, &d_X, &d_T, &d_r, &d_v, &d_result, &nOptions}; + double dt = CUDALaunch(NULL, func_name, func_args, print_log, nreg); + dt *= 1e3; + sum = 0.; + for (int i = 0; i < nOptions; ++i) + sum += result[i]; + binomial_ispc = std::min(binomial_ispc, dt); + } + printf("[binomial ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n", + binomial_ispc, sum / nOptions); +#endif + + // + // Binomial options pricing model, ispc implementation, tasks + // + double binomial_tasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + const char * func_name = "binomial_put_ispc_tasks"; + void *func_args[] = {&d_S, &d_X, &d_T, &d_r, &d_v, &d_result, &nOptions}; + double dt = CUDALaunch(NULL, func_name, func_args, print_log, nreg); + dt *= 1e3; + sum = 0.; + for (int i = 0; i < nOptions; ++i) + sum += result[i]; + binomial_tasks = std::min(binomial_tasks, dt); + } + printf("[binomial ispc, tasks]:\t\t[%.3f] million cycles (avg %f)\n", + binomial_tasks, sum / nOptions); + + // + // Black-Scholes options pricing model, ispc implementation, 1 thread + // + double bs_ispc = 1e30; +#if 0 + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + const char * func_name = "black_scholes_ispc"; + void *func_args[] = {&d_S, &d_X, &d_T, &d_r, &d_v, &d_result, &nOptions}; + double dt = CUDALaunch(NULL, func_name, func_args, print_log, nreg); + dt *= 1e3; + sum = 0.; + for (int i = 0; i < nOptions; ++i) + sum += result[i]; + bs_ispc = std::min(bs_ispc, dt); + } + printf("[black-scholes ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n", + bs_ispc, sum / nOptions); +#endif + + // + // Black-Scholes options pricing model, ispc implementation, tasks + // + double bs_ispc_tasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + const char * func_name = "black_scholes_ispc_tasks"; + void *func_args[] = {&d_S, &d_X, &d_T, &d_r, &d_v, &d_result, &nOptions}; + double dt = CUDALaunch(NULL, func_name, func_args, print_log, nreg); + dt *= 1e3; + sum = 0.; + for (int i = 0; i < nOptions; ++i) + sum += result[i]; + bs_ispc_tasks = std::min(bs_ispc_tasks, dt); + } + printf("[black-scholes ispc, tasks]:\t[%.3f] million cycles (avg %f)\n", + bs_ispc_tasks, sum / nOptions); + + + return 0; +}