Added approximate rcp/sqrt/rsqrt with ftz=true
This commit is contained in:
@@ -510,18 +510,21 @@ declare double @llvm.nvvm.sqrt.d(double %f) nounwind readonly alwaysinline
|
|||||||
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; uniform float iv = extract(__rcp_u(v), 0);
|
; uniform float iv = extract(__rcp_u(v), 0);
|
||||||
; return iv * (2. - v * iv);
|
; return iv * (2. - v * iv);
|
||||||
%r = fdiv float 1.,%0
|
; %ret = fdiv float 1.,%0
|
||||||
ret float %r
|
%ret = tail call float asm sideeffect "rcp.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline
|
||||||
|
ret float %ret
|
||||||
}
|
}
|
||||||
;; declare float @__sqrt_uniform_float(float) nounwind readnone
|
;; declare float @__sqrt_uniform_float(float) nounwind readnone
|
||||||
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
%ret = call float @llvm.nvvm.sqrt.f(float %0)
|
;;%ret = call float @llvm.nvvm.sqrt.f(float %0)
|
||||||
|
%ret = tail call float asm sideeffect "sqrt.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline
|
||||||
ret float %ret
|
ret float %ret
|
||||||
}
|
}
|
||||||
;; declare float @__rsqrt_uniform_float(float) nounwind readnone
|
;; declare float @__rsqrt_uniform_float(float) nounwind readnone
|
||||||
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline
|
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline
|
||||||
{
|
{
|
||||||
%ret = call float @llvm.nvvm.rsqrt.approx.f(float %0)
|
;; %ret = call float @llvm.nvvm.rsqrt.approx.f(float %0)
|
||||||
|
%ret = tail call float asm sideeffect "rsqrt.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline
|
||||||
ret float %ret
|
ret float %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ LD=g++
|
|||||||
LDFLAGS=-lcuda
|
LDFLAGS=-lcuda
|
||||||
|
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O3 --math-lib=default --target=nvptx64
|
ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math
|
||||||
|
|
||||||
LLVM32 = $(HOME)/usr/local/llvm/bin-3.2
|
LLVM32 = $(HOME)/usr/local/llvm/bin-3.2
|
||||||
LLVM = $(HOME)/usr/local/llvm/bin-3.3
|
LLVM = $(HOME)/usr/local/llvm/bin-3.3
|
||||||
|
|||||||
@@ -191,7 +191,7 @@ CUmodule loadModule(const char * module)
|
|||||||
optionVals[5] = (void*) 1;
|
optionVals[5] = (void*) 1;
|
||||||
// Max # of registers/pthread
|
// Max # of registers/pthread
|
||||||
options[6] = CU_JIT_MAX_REGISTERS;
|
options[6] = CU_JIT_MAX_REGISTERS;
|
||||||
int jitRegCount = 48;
|
int jitRegCount = 64;
|
||||||
optionVals[6] = (void *)(size_t)jitRegCount;
|
optionVals[6] = (void *)(size_t)jitRegCount;
|
||||||
|
|
||||||
// Create a pending linker invocation
|
// Create a pending linker invocation
|
||||||
|
|||||||
Reference in New Issue
Block a user