diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index 063fdc7b..ae8cb604 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -510,18 +510,21 @@ declare double @llvm.nvvm.sqrt.d(double %f) nounwind readonly alwaysinline define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { ; uniform float iv = extract(__rcp_u(v), 0); ; return iv * (2. - v * iv); - %r = fdiv float 1.,%0 - ret float %r +; %ret = fdiv float 1.,%0 + %ret = tail call float asm sideeffect "rcp.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline + ret float %ret } ;; declare float @__sqrt_uniform_float(float) nounwind readnone define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { - %ret = call float @llvm.nvvm.sqrt.f(float %0) + ;;%ret = call float @llvm.nvvm.sqrt.f(float %0) + %ret = tail call float asm sideeffect "sqrt.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline ret float %ret } ;; declare float @__rsqrt_uniform_float(float) nounwind readnone define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { - %ret = call float @llvm.nvvm.rsqrt.approx.f(float %0) +;; %ret = call float @llvm.nvvm.rsqrt.approx.f(float %0) + %ret = tail call float asm sideeffect "rsqrt.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline ret float %ret } diff --git a/examples_cuda/aobench/Makefile_gpu b/examples_cuda/aobench/Makefile_gpu index 6a11cbc5..74618d08 100644 --- a/examples_cuda/aobench/Makefile_gpu +++ b/examples_cuda/aobench/Makefile_gpu @@ -8,7 +8,7 @@ LD=g++ LDFLAGS=-lcuda ISPC=ispc -ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 +ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 LLVM = $(HOME)/usr/local/llvm/bin-3.3 diff --git a/examples_cuda/aobench/ao_cu.cpp b/examples_cuda/aobench/ao_cu.cpp index 7ec9c1a3..a8ba598b 100755 --- a/examples_cuda/aobench/ao_cu.cpp +++ b/examples_cuda/aobench/ao_cu.cpp @@ -191,7 +191,7 @@ CUmodule loadModule(const char * module) optionVals[5] = (void*) 1; // Max # of registers/pthread options[6] = CU_JIT_MAX_REGISTERS; - int jitRegCount = 48; + int jitRegCount = 64; optionVals[6] = (void *)(size_t)jitRegCount; // Create a pending linker invocation