Tune some examples

This commit is contained in:
Evghenii
2014-01-13 09:52:34 +01:00
parent 8f17468fa3
commit 66faf8b4e4
6 changed files with 11 additions and 9 deletions

View File

@@ -4,7 +4,7 @@ CU_SRC=ao.cu
CXX_SRC=ao.cpp ao_serial.cpp CXX_SRC=ao.cpp ao_serial.cpp
PTXCC_REGMAX=64 PTXCC_REGMAX=64
# LLVM_GPU=1 LLVM_GPU=1
NVVM_GPU=1 NVVM_GPU=1
include ../common_gpu.mk include ../common_gpu.mk

View File

@@ -133,7 +133,7 @@ struct Uniform
data[chunkIdx] = shdata[programIndex]; data[chunkIdx] = shdata[programIndex];
} }
}; };
#elif 0 #elif 1
template<typename T, int N> template<typename T, int N>
struct Uniform struct Uniform
{ {

View File

@@ -35,10 +35,10 @@
__device__ __device__
static inline int static inline int
mandel(float c_re, float c_im, int count) { mandel(float c_re, float c_im, float count) {
float z_re = c_re, z_im = c_im; float z_re = c_re, z_im = c_im;
int i; float i = 0.0f;
for (i = 0; i < count; ++i) { for (; i < count; ++i) {
if (z_re * z_re + z_im * z_im > 4.0f) if (z_re * z_re + z_im * z_im > 4.0f)
break; break;

View File

@@ -32,10 +32,10 @@
*/ */
static inline int static inline int
mandel(float c_re, float c_im, int count) { mandel(float c_re, float c_im, float count) {
float z_re = c_re, z_im = c_im; float z_re = c_re, z_im = c_im;
int i; float i = 0.0f;
for (i = 0; i < count; ++i) { for (;i < count; ++i) {
if (z_re * z_re + z_im * z_im > 4.0f) if (z_re * z_re + z_im * z_im > 4.0f)
break; break;

View File

@@ -2,7 +2,7 @@ PROG=options
ISPC_SRC=options.ispc ISPC_SRC=options.ispc
CU_SRC=options.cu CU_SRC=options.cu
CXX_SRC=options.cpp options_serial.cpp CXX_SRC=options.cpp options_serial.cpp
PTXCC_REGMAX=32 PTXCC_REGMAX=128
LLVM_GPU=1 LLVM_GPU=1

View File

@@ -327,6 +327,8 @@ __host__ void
binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[], binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[],
float ra[], float va[], float ra[], float va[],
float result[], int count) { float result[], int count) {
cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
binomial_put_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count); binomial_put_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
} }