From fcbdd93043b6bf2ec1a0d2575fbff90a4a844a03 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Sat, 25 Jan 2014 16:43:33 +0100
Subject: [PATCH] half/scan for 64 bit/clock/num_cores and other additions

---
 builtins/builtins.c                         |   4 +
 builtins/target-nvptx.ll                    | 151 +++++++++++++++++---
 builtins/util-nvptx.m4                      |   7 -
 module.cpp                                  |   4 +-
 stdlib.ispc                                 |   3 +
 tests/array-mixed-unif-vary-indexing-3.ispc |  10 ++
 tests/launch-8.ispc                         |  20 +--
 tests/launch-9.ispc                         |   4 +-
 tests/test-141.ispc                         |   2 +-
 tests/test-142.ispc                         |   2 +-
 tests/test-144.ispc                         |   2 +-
 11 files changed, 162 insertions(+), 47 deletions(-)

diff --git a/builtins/builtins.c b/builtins/builtins.c
index ee34ff54..e7becf90 100644
--- a/builtins/builtins.c
+++ b/builtins/builtins.c
@@ -189,6 +189,7 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
 int __puts_nvptx(const char *);
 void __do_print_nvptx(const char *format, const char *types, int width,
                       uint64_t mask, void **args) {
+#if 0
     char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
     char *bufp = &printString[0];
     char tmpBuf[256];
@@ -254,6 +255,9 @@ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t
     *bufp = '\n';
     bufp++;
     *bufp = '\0';
     __puts_nvptx(printString);
+#else
+    __puts_nvptx("---nvptx printing is not supported---\n");
+#endif
 }
 
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index dd3cbb5c..4d485a04 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -387,16 +387,21 @@ define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline {
 ;;  %res = call float @llvm.convert.from.fp16(i16 %v)
   %res = tail call float asm sideeffect
-      "{ .reg .b16 %tmp;
-         mov.b16 %tmp, $1;
-         cvt.f32.f16 $0, %tmp;
-       }", "=f,h"(i16 %v) nounwind readnone alwaysinline
+      "{ .reg .f16 tmp;
+         mov.b16 tmp, $1;
+         cvt.f32.f16 $0, tmp;
+       }", "=f,h"(i16 %v) nounwind readnone alwaysinline
   ret float %res
 }
 
 define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline {
 ;; this will break the compiler, use inline asm similarly to above case
-  %half = call i16 @llvm.convert.to.fp16(float %v)
+;;  %half = call i16 @llvm.convert.to.fp16(float %v)
+  %half = tail call i16 asm sideeffect
+      "{ .reg .f16 tmp;
+         cvt.rn.f16.f32 tmp, $1;
+         mov.b16 $0, tmp;
+       }", "=h,f"(float %v) nounwind readnone alwaysinline
   ret i16 %half
 }
 
 define <1 x float> @__half_to_float_varying(<1 x i16> %v) nounwind readnone alwaysinline
@@ -421,7 +426,7 @@ declare void @__fastmath() nounwind
 
 ;; round/floor/ceil
 
-define float @__round_uniform_float(float) nounwind readnone alwaysinline
+define float @__round_uniform_float_ptx(float) nounwind readnone alwaysinline
 {
   %2 = tail call float asm sideeffect
       "{ .reg .pred p<3>; .reg .s32 r<4>; .reg .f32 f<10>;
@@ -436,11 +441,25 @@
         setp.gt.f32 p1, f5, 0f4B000000;
         selp.f32 f9, f4, f8, p1;
         setp.geu.f32 p2, f5, 0f3F000000;
-        @!p2 cvt.rzi.f32.f32 f9, f4;
+        @p2 bra BB2_2;
+        cvt.rzi.f32.f32 f9, f4;
+BB2_2:
         mov.f32 $0, f9;
       }", "=f,f"(float %0) nounwind readnone alwaysinline
   ret float %2
 }
+define float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  %float_to_int_bitcast.i.i.i.i = bitcast float %0 to <1 x i32>
+  %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
+  %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
+  %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
+  %binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
+  %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
+  %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to float
+  ret float %int_to_float_bitcast.i.i.i
+}
 define float @__floor_uniform_float(float) nounwind readnone alwaysinline
 {
   %2 = tail call float asm sideeffect
       "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone
@@ -501,18 +520,6 @@ define double @__ceil_uniform_double(double) nounwind readnone alwaysinline
   ret double %2
 }
-define <1 x float> @__round_varying_floatX(<1 x float>) nounwind readonly alwaysinline {
-  %float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
-  %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
-  %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
-  %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
-  %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
-  %binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
-  %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
-  %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
-  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
-  ret <1 x float> %int_to_float_bitcast.i.i.i
-}
 define <1 x float> @__floor_varying_floatX(<1 x float>) nounwind readonly alwaysinline {
   %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
   %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
@@ -1363,10 +1370,92 @@ define <1 x float> @__exclusive_scan_add_float(<1 x float>, <1 x i1>) nounwind r
   %retv = insertelement <1 x float> undef, float %rets, i32 0
   ret <1 x float> %retv
 }
-declare <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline
-declare <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
-declare <1 x i64> @__exclusive_scan_and_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
-declare <1 x i64> @__exclusive_scan_or_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
+define double @__shfl_scan_add_step_double(double %partial, i32 %up_offset) nounwind readnone alwaysinline
+{
+  %result = tail call double asm sideeffect
+      "{.reg .s32 r<10>;
+        .reg .f64 fd0;
+        .reg .pred p;
+        .reg .b32 temp;
+        mov.b64 {r1,temp}, $1;
+        mov.b64 {temp,r2}, $1;
+        shfl.up.b32 r3, r1, $2, 0;
+        shfl.up.b32 r4|p, r2, $2, 0;
+        mov.b64 fd0, {r3,r4};
+    @p  add.f64 fd0, fd0, $3;
+        mov.f64 $0, fd0;
+       }", "=d,d,r,d"(double %partial, i32 %up_offset, double %partial) nounwind readnone alwaysinline
+  ret double %result;
+}
+define <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %v0 = extractelement <1 x double> %0, i32 0
+  %mask = extractelement <1 x i1> %1, i32 0
+  %v = select i1 %mask, double %v0, double zeroinitializer
+
+  %s1 = tail call double @__shfl_scan_add_step_double(double %v,  i32  1);
+  %s2 = tail call double @__shfl_scan_add_step_double(double %s1, i32  2);
+  %s3 = tail call double @__shfl_scan_add_step_double(double %s2, i32  4);
+  %s4 = tail call double @__shfl_scan_add_step_double(double %s3, i32  8);
+  %s5 = tail call double @__shfl_scan_add_step_double(double %s4, i32 16);
+  %rets = fsub double %s5, %v
+  %retv = bitcast double %rets to <1 x double>
+  ret <1 x double> %retv
+}
+
+define i64 @__shfl_scan_add_step_i64(i64 %partial, i32 %up_offset) nounwind readnone alwaysinline
+{
+  %result = tail call i64 asm sideeffect
+      "{.reg .s32 r<10>;
+        .reg .s64 rl0;
+        .reg .pred p;
+        .reg .b32 temp;
+        mov.b64 {r1,temp}, $1;
+        mov.b64 {temp,r2}, $1;
+        shfl.up.b32 r3, r1, $2, 0;
+        shfl.up.b32 r4|p, r2, $2, 0;
+        mov.b64 rl0, {r3,r4};
+    @p  add.s64 rl0, rl0, $3;
+        mov.s64 $0, rl0;
+       }", "=l,l,r,l"(i64 %partial, i32 %up_offset, i64 %partial) nounwind readnone alwaysinline
+  ret i64 %result;
+}
+define <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %v0 = extractelement <1 x i64> %0, i32 0
+  %mask = extractelement <1 x i1> %1, i32 0
+  %v = select i1 %mask, i64 %v0, i64 zeroinitializer
+
+  %s1 = tail call i64 @__shfl_scan_add_step_i64(i64 %v,  i32  1);
+  %s2 = tail call i64 @__shfl_scan_add_step_i64(i64 %s1, i32  2);
+  %s3 = tail call i64 @__shfl_scan_add_step_i64(i64 %s2, i32  4);
+  %s4 = tail call i64 @__shfl_scan_add_step_i64(i64 %s3, i32  8);
+  %s5 = tail call i64 @__shfl_scan_add_step_i64(i64 %s4, i32 16);
+  %rets = sub i64 %s5, %v
+  %retv = bitcast i64 %rets to <1 x i64>
+  ret <1 x i64> %retv
+}
+
+define(`exclusive_scan_i64',`
+define <1 x i64> @__exclusive_scan_$1_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %v = bitcast <1 x i64> %0 to <2 x i32>
+  %v0 = extractelement <2 x i32> %v, i32 0
+  %v1 = extractelement <2 x i32> %v, i32 1
+  %inp0 = bitcast i32 %v0 to <1 x i32>
+  %inp1 = bitcast i32 %v1 to <1 x i32>
+  %res0 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp0, <1 x i1> %1);
+  %res1 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp1, <1 x i1> %1);
+  %r0 = bitcast <1 x i32> %res0 to i32
+  %r1 = bitcast <1 x i32> %res1 to i32
+  %ret0 = insertelement <2 x i32> undef, i32 %r0, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %r1, i32 1
+  %ret = bitcast <2 x i32> %ret1 to <1 x i64>
+  ret <1 x i64> %ret
+}
+')
+exclusive_scan_i64(or)
+exclusive_scan_i64(and)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
@@ -1530,9 +1619,19 @@ define void @__do_assert_uniform(i8 *%str, i1 %test, <1 x i1> %mask) {
   br i1 %test, label %ok, label %fail
 
 fail:
+  %lane = call i32 @__laneidx()
+  %cmp = icmp eq i32 %lane, 0
+  br i1 %cmp, label %fail_print, label %fail_void;
+
+
+
+fail_print:
   call void @__abort_nvptx(i8* %str) noreturn
   unreachable
 
+fail_void:
+  unreachable
+
 ok:
   ret void
 }
@@ -1554,3 +1653,9 @@ fail:
 ok:
   ret void
 }
+
+define i64 @__clock() nounwind alwaysinline {
+  %r = call i64 asm sideeffect "mov.b64 $0, %clock64;", "=l"();
+  ret i64 %r
+}
+
diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4
index 65577454..7bb1014b 100644
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
@@ -2372,13 +2372,6 @@ m4exit(`1')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; read hw clock
 
-declare i64 @llvm.readcyclecounter()
-
-define i64 @__clock() nounwind {
-  %r = call i64 @llvm.readcyclecounter()
-  ret i64 %r
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stdlib transcendentals
 ;;
diff --git a/module.cpp b/module.cpp
index 3536c10f..8a3f3507 100644
--- a/module.cpp
+++ b/module.cpp
@@ -2134,14 +2134,14 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
         opts.addMacroDef("cwhile=while");
         opts.addMacroDef("ccontinue=continue");
         opts.addMacroDef("cdo=do");
-        opts.addMacroDef("taskIndex=blockIndex0()");
-        opts.addMacroDef("taskCount=blockCount0()");
         opts.addMacroDef("taskIndex0=blockIndex0()");
         opts.addMacroDef("taskCount0=blockCount0()");
         opts.addMacroDef("taskIndex1=blockIndex1()");
         opts.addMacroDef("taskCount1=blockCount1()");
opts.addMacroDef("taskIndex2=blockIndex2()"); opts.addMacroDef("taskCount2=blockCount2()"); + opts.addMacroDef("taskIndex=(taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2))"); + opts.addMacroDef("taskCount=(taskCount0*taskCount1*taskCount2)"); } #if defined(LLVM_3_1) diff --git a/stdlib.ispc b/stdlib.ispc index 6ce656fe..2d79bf33 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1301,6 +1301,9 @@ packed_store_active2(uniform int a[], int vals) { // System information static inline uniform int num_cores() { + if (__is_nvptx_target) + return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx) + else return __num_cores(); } diff --git a/tests/array-mixed-unif-vary-indexing-3.ispc b/tests/array-mixed-unif-vary-indexing-3.ispc index ab3a7a7c..c6623cf6 100644 --- a/tests/array-mixed-unif-vary-indexing-3.ispc +++ b/tests/array-mixed-unif-vary-indexing-3.ispc @@ -5,7 +5,13 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; assert(programCount <= 64); +#ifdef __NVPTX__ + uniform float * uniform xarr = uniform new uniform float[70*70]; + uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr; +#define _SHMALLOC +#else uniform float x[70][70]; +#endif for (uniform int i = 0; i < 70; ++i) for (uniform int j = 0; j < 70; ++j) x[i][j] = 2+b-5; @@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { else x[b-1][a-1] = 1; RET[programIndex] = x[4][a]; + +#ifdef _SHMALLOC + delete xarr; +#endif } export void result(uniform float RET[]) { diff --git a/tests/launch-8.ispc b/tests/launch-8.ispc index eacba673..dcc2b6b9 100644 --- a/tests/launch-8.ispc +++ b/tests/launch-8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } -#define N0 10 +#define N0 12 #define N1 20 #define N2 50 static uniform float array[N2][N1][N0]; @@ -10,14 +10,14 @@ static uniform float array[N2][N1][N0]; task void x(const float f) { uniform int j; - assert(taskCount == (int32)N0*N1*N2); - assert(taskCount0 == (int32)N0); - assert(taskCount1 == (int32)N1); - assert(taskCount2 == (int32)N2); - assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2)); - assert(taskIndex0 < (int32)N0); - assert(taskIndex1 < (int32)N1); - assert(taskIndex2 < (int32)N2); + assert(taskCount == (uniform int32)N0*N1*N2); + assert(taskCount0 == (uniform int32)N0); + assert(taskCount1 == (uniform int32)N1); + assert(taskCount2 == (uniform int32)N2); + assert(taskIndex == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2)); + assert(taskIndex0 < (uniform int32)N0); + assert(taskIndex1 < (uniform int32)N1); + assert(taskIndex2 < (uniform int32)N2); const uniform int i0 = taskIndex0; const uniform int i1 = taskIndex1; @@ -38,5 +38,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) { export void result(uniform float RET[]) { - RET[programIndex] = 9999.000000; + RET[programIndex] = 11999.000000; } diff --git a/tests/launch-9.ispc b/tests/launch-9.ispc index 1952e8e7..be9e4881 100644 --- a/tests/launch-9.ispc +++ b/tests/launch-9.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } -#define N0 10 +#define N0 12 #define N1 20 #define N2 50 static uniform float array[N2][N1][N0]; @@ -38,5 +38,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) { export void result(uniform float RET[]) { - RET[programIndex] = 9999.000000; + RET[programIndex] = 
 }
diff --git a/tests/test-141.ispc b/tests/test-141.ispc
index b69be1fa..9045c081 100644
--- a/tests/test-141.ispc
+++ b/tests/test-141.ispc
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
     // calculation error 1e-6 is the same as in icc
-    RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0;
+    RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 1 : 0;
 }
 
 export void result(uniform float RET[4]) {
diff --git a/tests/test-142.ispc b/tests/test-142.ispc
index 18053402..9ab8ff9f 100644
--- a/tests/test-142.ispc
+++ b/tests/test-142.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
-    RET[programIndex] = round(a+.499999);
+    RET[programIndex] = round(a+.49999);
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/test-144.ispc b/tests/test-144.ispc
index 568bdc10..64e1817a 100644
--- a/tests/test-144.ispc
+++ b/tests/test-144.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
-    RET[programIndex] = floor(a+.999999);
+    RET[programIndex] = floor(a+.99999);
 }
 
 export void result(uniform float RET[]) {
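
-- 
Notes (outside the patch; git-am ignores text after the last hunk):

On the __round_uniform_float bit trick above: 8.388608e+06 is 2^23. A float
has 23 mantissa bits, so for 0 <= x < 2^23 the sum x + 2^23 has no room left
for fraction bits, and (x + 2^23) - 2^23 therefore yields x rounded to an
integer. The sign bit is masked off before the adds and xor'ed back in at the
end, which extends the trick to negative inputs.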
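
The 64-bit scans added to target-nvptx.ll back the existing stdlib entry
points (exclusive_scan_add and friends), so no new user-facing API appears.
A minimal ISPC sketch that would exercise the int64 path (the function and
array names here are hypothetical):

export void scan64(uniform int64 out[], const uniform int64 in[]) {
    int64 v = in[programIndex];
    // Exclusive scan: lane i receives the sum of lanes 0..i-1; lane 0 gets 0.
    out[programIndex] = exclusive_scan_add(v);
}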
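
The module.cpp hunk redefines taskIndex as a row-major flattening of the
three launch dimensions instead of an alias for taskIndex0, and taskCount as
the total task count. A minimal ISPC sketch of the new semantics (the kernel
and array names are hypothetical):

task void fill(uniform float out[]) {
    // Same row-major flattening the new taskIndex macro expands to.
    uniform int flat = taskIndex0
                     + taskCount0 * (taskIndex1 + taskCount1 * taskIndex2);
    assert(flat == taskIndex);   // holds by construction
    out[taskIndex] = taskIndex;  // one slot per flattened task
}

export void run(uniform float out[]) {
    launch[4, 2, 3] fill(out);   // inside fill(), taskCount == 4*2*3 == 24
}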