diff --git a/examples_cuda/cuda_ispc.h b/examples_cuda/cuda_ispc.h index 3060693f..cbb9272a 100644 --- a/examples_cuda/cuda_ispc.h +++ b/examples_cuda/cuda_ispc.h @@ -268,6 +268,7 @@ static double CUDALaunch( const char * module = &module_str[0]; CUmodule cudaModule = loadModule(module, maxrregcount, cudadevrt_lib, log_size, print_log); CUfunction cudaFunction = getFunction(cudaModule, func_name); + checkCudaErrors(cuStreamSynchronize(0)); const double t0 = rtc(); deviceLaunch(cudaFunction, func_args); checkCudaErrors(cuStreamSynchronize(0)); diff --git a/examples_cuda/options/options.cu b/examples_cuda/options/options.cu index f02ab904..e778dcce 100644 --- a/examples_cuda/options/options.cu +++ b/examples_cuda/options/options.cu @@ -39,6 +39,142 @@ #define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) #define taskCount (gridDim.x*4) #define warpIdx (threadIdx.x >> 5) +__device__ static inline void __range_reduce_log(float input, float * reduced, + int * exponent) { + int int_version = __float_as_int(input); //intbits(input); + // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM + // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 + // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 + // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 + // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF + + //const int exponent_mask(0x7F800000) + const int nonexponent_mask = 0x807FFFFF; + + // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126 + const int exponent_neg1 = (126l << 23); + // NOTE(boulos): We don't need to mask anything out since we know + // the sign bit has to be 0. If it's 1, we need to return infinity/nan + // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). + int biased_exponent = int_version >> 23; // This number is [0, 255] but it means [-127, 128] + + int offset_exponent = biased_exponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 + *exponent = offset_exponent - 127; // get the real value + + // Blend the offset_exponent with the original input (do this in + // int for now, until I decide if float can have & and ¬) + int blended = (int_version & nonexponent_mask) | (exponent_neg1); + *reduced = __int_as_float(blended); //floatbits(blended); +} + + +__device__ static inline float __Logf(const float x_full) +{ +#if 1 + return __logf(x_full); +#else + float reduced; + int exponent; + + const int NaN_bits = 0x7fc00000; + const int Neg_Inf_bits = 0xFF800000; + const float NaN = __int_as_float(NaN_bits); //floatbits(NaN_bits); + const float neg_inf = __int_as_float(Neg_Inf_bits); //floatbits(Neg_Inf_bits); + bool use_nan = x_full < 0.f; + bool use_inf = x_full == 0.f; + bool exceptional = use_nan || use_inf; + const float one = 1.0f; + + float patched = exceptional ? one : x_full; + __range_reduce_log(patched, &reduced, &exponent); + + const float ln2 = 0.693147182464599609375f; + + float x1 = one - reduced; + const float c1 = 0.50000095367431640625f; + const float c2 = 0.33326041698455810546875f; + const float c3 = 0.2519190013408660888671875f; + const float c4 = 0.17541764676570892333984375f; + const float c5 = 0.3424419462680816650390625f; + const float c6 = -0.599632322788238525390625f; + const float c7 = +1.98442304134368896484375f; + const float c8 = -2.4899270534515380859375f; + const float c9 = +1.7491014003753662109375f; + + float result = x1 * c9 + c8; + result = x1 * result + c7; + result = x1 * result + c6; + result = x1 * result + c5; + result = x1 * result + c4; + result = x1 * result + c3; + result = x1 * result + c2; + result = x1 * result + c1; + result = x1 * result + one; + + // Equation was for -(ln(red)/(1-red)) + result *= -x1; + result += (float)(exponent) * ln2; + + return exceptional ? (use_nan ? NaN : neg_inf) : result; +#endif +} + +__device__ static inline float __Expf(const float x_full) +{ +#if 1 + return __expf(x_full); +#else + const float ln2_part1 = 0.6931457519f; + const float ln2_part2 = 1.4286067653e-6f; + const float one_over_ln2 = 1.44269502162933349609375f; + + float scaled = x_full * one_over_ln2; + float k_real = floor(scaled); + int k = (int)k_real; + + // Reduced range version of x + float x = x_full - k_real * ln2_part1; + x -= k_real * ln2_part2; + + // These coefficients are for e^x in [0, ln(2)] + const float one = 1.f; + const float c2 = 0.4999999105930328369140625f; + const float c3 = 0.166668415069580078125f; + const float c4 = 4.16539050638675689697265625e-2f; + const float c5 = 8.378830738365650177001953125e-3f; + const float c6 = 1.304379315115511417388916015625e-3f; + const float c7 = 2.7555381529964506626129150390625e-4f; + + float result = x * c7 + c6; + result = x * result + c5; + result = x * result + c4; + result = x * result + c3; + result = x * result + c2; + result = x * result + one; + result = x * result + one; + + // Compute 2^k (should differ for float and double, but I'll avoid + // it for now and just do floats) + const int fpbias = 127; + int biased_n = k + fpbias; + bool overflow = k > fpbias; + // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) + // we've got underflow. -127 * ln(2) -> -88.02. So the most + // negative float input that doesn't result in zero is like -88. + bool underflow = (biased_n <= 0); + const int InfBits = 0x7f800000; + biased_n <<= 23; + // Reinterpret this thing as float + float two_to_the_n = __int_as_float(biased_n); //floatbits(biased_n); + // Handle both doubles and floats (hopefully eliding the copy for float) + float elemtype_2n = two_to_the_n; + result *= elemtype_2n; +// result = overflow ? floatbits(InfBits) : result; + result = overflow ? __int_as_float(InfBits) : result; + result = underflow ? 0.0f : result; + return result; +#endif +} // Cumulative normal distribution function // @@ -56,7 +192,7 @@ CND(float X) { const float invSqrt2Pi = 0.39894228040f; float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + -1.821255978f * k4 + 1.330274429f * k5); - w *= invSqrt2Pi * expf(-L * L * .5f); + w *= invSqrt2Pi * __Expf(-L * L * .5f); if (X > 0.f) w = 1.0f - w; @@ -67,6 +203,7 @@ __global__ void bs_task( float Sa[], float Xa[], float Ta[], float ra[], float va[], float result[], int count) { + if (taskIndex >= taskCount) return; int first = taskIndex * (count/taskCount); int last = min(count, (int)((taskIndex+1) * (count/taskCount))); @@ -75,10 +212,10 @@ void bs_task( float Sa[], float Xa[], float Ta[], { float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; - float d1 = (logf(S/X) + (r + v * v * .5f) * T) / (v * sqrtf(T)); + float d1 = (__Logf(S/X) + (r + v * v * .5f) * T) / (v * sqrtf(T)); float d2 = d1 - v * sqrtf(T); - result[i] = S * CND(d1) - X * expf(-r * T) * CND(d2); + result[i] = S * CND(d1) - X * __Expf(-r * T) * CND(d2); } } @@ -94,34 +231,59 @@ black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[], /********/ +template +struct loop +{ + __device__ static void op1(float V[], const float u, const float X, const float S) + { + const int j = NBEG; + float upow = powf(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = max(0.0f, X - S * upow); + loop::op1(V,u,X,S); + } + __device__ static void op2(float V[], const float Pu, const float disc) + { + const int j = NBEG; +#pragma unroll + for ( int k = 0; k < j; ++k) + V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc; + loop::op2(V, Pu,disc); + } +}; + +template +struct loop +{ + __device__ static void op1(float V[], const float u, const float X, const float S) {} + __device__ static void op2(float V[], const float Pu, const float disc) {} +}; + __device__ static inline float -binomial_put(float S, float X, float T, float r, float v) { -#if 0 - float V[BINOMIAL_NUM]; -#else - __shared__ float VSH[BINOMIAL_NUM*4]; - float *V = VSH + warpIdx*BINOMIAL_NUM; +binomial_put(float S, float X, float T, float r, float v) +{ + + float V[BINOMIAL_NUM]; + + float dt = T / BINOMIAL_NUM; + float u = exp(v * sqrt(dt)); + float d = 1.f / u; + float disc = exp(r * dt); + float Pu = (disc - d) / (u - d); + +#if 0 /* slow */ + for ( int j = 0; j < BINOMIAL_NUM; ++j) { + float upow = powf(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = max(0.0f, X - S * upow); + } + for ( int j = BINOMIAL_NUM-1; j >= 0; --j) + for ( int k = 0; k < j; ++k) + V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc; +#else /* with loop unrolling, stores resutls in registers */ + loop<0,BINOMIAL_NUM,1>::op1(V,u,X,S); + loop::op2(V, Pu, disc); #endif - - float dt = T / BINOMIAL_NUM; - float u = exp(v * sqrt(dt)); - float d = 1.f / u; - float disc = exp(r * dt); - float Pu = (disc - d) / (u - d); - -#pragma unroll - for ( int j = 0; j < BINOMIAL_NUM; ++j) { - float upow = pow(u, (float)(2*j-BINOMIAL_NUM)); - V[j] = max(0., X - S * upow); - } - -#pragma unroll - for ( int j = BINOMIAL_NUM-1; j >= 0; --j) -#pragma unroll - for ( int k = 0; k < j; ++k) - V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; - return V[0]; + return V[0]; } @@ -130,15 +292,16 @@ __global__ void binomial_task( float Sa[], float Xa[], float Ta[], float ra[], float va[], float result[], - int count) { - int first = taskIndex * (count/taskCount); - int last = min(count, (int)((taskIndex+1) * (count/taskCount))); + int count) +{ + int first = taskIndex * (count/taskCount); + int last = min(count, (int)((taskIndex+1) * (count/taskCount))); - for (int i = programIndex + first; i < last; i += programCount) - if (i < last) - { - float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; - result[i] = binomial_put(S, X, T, r, v); + for (int i = programIndex + first; i < last; i += programCount) + if (i < last) + { + float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; + result[i] = binomial_put(S, X, T, r, v); } } diff --git a/examples_cuda/options/options_cu.cpp b/examples_cuda/options/options_cu.cpp index 07ba5992..7fd8a6ce 100644 --- a/examples_cuda/options/options_cu.cpp +++ b/examples_cuda/options/options_cu.cpp @@ -104,7 +104,7 @@ int main(int argc, char *argv[]) { // Binomial options pricing model, ispc implementation // const bool print_log = false; - const int nreg = 32; + const int nreg = 64; double binomial_ispc = 1e30; #if 0 for (int i = 0; i < 3; ++i) { @@ -122,6 +122,9 @@ int main(int argc, char *argv[]) { } printf("[binomial ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n", binomial_ispc, sum / nOptions); + for (int i = 0; i < nOptions; ++i) + result[i] = 0.0; + memcpyH2D(d_result, result, nOptions*sizeof(float)); #endif // @@ -142,6 +145,10 @@ int main(int argc, char *argv[]) { } printf("[binomial ispc, tasks]:\t\t[%.3f] million cycles (avg %f)\n", binomial_tasks, sum / nOptions); + for (int i = 0; i < nOptions; ++i) + result[i] = 0.0; + memcpyH2D(d_result, result, nOptions*sizeof(float)); + // // Black-Scholes options pricing model, ispc implementation, 1 thread @@ -162,6 +169,9 @@ int main(int argc, char *argv[]) { } printf("[black-scholes ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n", bs_ispc, sum / nOptions); + for (int i = 0; i < nOptions; ++i) + result[i] = 0.0; + memcpyH2D(d_result, result, nOptions*sizeof(float)); #endif // @@ -179,6 +189,9 @@ int main(int argc, char *argv[]) { for (int i = 0; i < nOptions; ++i) sum += result[i]; bs_ispc_tasks = std::min(bs_ispc_tasks, dt); + for (int i = 0; i < nOptions; ++i) + result[i] = 0.0; + memcpyH2D(d_result, result, nOptions*sizeof(float)); } printf("[black-scholes ispc, tasks]:\t[%.3f] million cycles (avg %f)\n", bs_ispc_tasks, sum / nOptions);