From fcbdd93043b6bf2ec1a0d2575fbff90a4a844a03 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Sat, 25 Jan 2014 16:43:33 +0100
Subject: [PATCH] half/scan for 64 bit/clock/num_cores and other additions

---
 builtins/builtins.c                         |   4 +
 builtins/target-nvptx.ll                    | 151 +++++++++++++++++---
 builtins/util-nvptx.m4                      |   7 -
 module.cpp                                  |   4 +-
 stdlib.ispc                                 |   3 +
 tests/array-mixed-unif-vary-indexing-3.ispc |  10 ++
 tests/launch-8.ispc                         |  20 +--
 tests/launch-9.ispc                         |   4 +-
 tests/test-141.ispc                         |   2 +-
 tests/test-142.ispc                         |   2 +-
 tests/test-144.ispc                         |   2 +-
 11 files changed, 162 insertions(+), 47 deletions(-)

diff --git a/builtins/builtins.c b/builtins/builtins.c
index ee34ff54..e7becf90 100644
--- a/builtins/builtins.c
+++ b/builtins/builtins.c
@@ -189,6 +189,7 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
 int __puts_nvptx(const char *);
 void __do_print_nvptx(const char *format, const char *types, int width,
                       uint64_t mask, void **args) {
+#if 0
     char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
     char *bufp = &printString[0];
     char tmpBuf[256];
@@ -254,6 +255,9 @@ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t
     *bufp = '\n';
     bufp++;
     *bufp = '\0';
     __puts_nvptx(printString);
+#else
+    __puts_nvptx("---nvptx printing is not supported---\n");
+#endif
 }
 
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index dd3cbb5c..4d485a04 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -387,16 +387,21 @@ define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline {
 ;;  %res = call float @llvm.convert.from.fp16(i16 %v)
   %res = tail call float asm sideeffect
-      "{ .reg .b16 %tmp;
-         mov.b16 %tmp, $1;
-         cvt.f32.f16 $0, %tmp;
-       }", "=f,h"(i16 %v) nounwind readnone alwaysinline
+      "{ .reg .f16 tmp;
+         mov.b16 tmp, $1;
+         cvt.f32.f16 $0, tmp;
+       }", "=f,h"(i16 %v) nounwind readnone alwaysinline
   ret float %res
 }
 
 define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline {
 ;; this will break the compiler, use inline asm similarly to above case
-  %half = call i16 @llvm.convert.to.fp16(float %v)
+;;  %half = call i16 @llvm.convert.to.fp16(float %v)
+  %half = tail call i16 asm sideeffect
+      "{ .reg .f16 tmp;
+         cvt.rn.f16.f32 tmp, $1;
+         mov.b16 $0, tmp;
+       }", "=h,f"(float %v) nounwind readnone alwaysinline
   ret i16 %half
 }
 
 define <1 x float> @__half_to_float_varying(<1 x i16> %v) nounwind readnone alwaysinline
@@ -421,7 +426,7 @@ declare void @__fastmath() nounwind
 
 ;; round/floor/ceil
 
-define float @__round_uniform_float(float) nounwind readnone alwaysinline
+define float @__round_uniform_float_ptx(float) nounwind readnone alwaysinline
 {
   %2 = tail call float asm sideeffect
       "{ .reg .pred p<3>; .reg .s32 r<4>; .reg .f32 f<10>;
@@ -436,11 +441,25 @@
         setp.gt.f32 p1, f5, 0f4B000000;
         selp.f32 f9, f4, f8, p1;
         setp.geu.f32 p2, f5, 0f3F000000;
-        @!p2 cvt.rzi.f32.f32 f9, f4;
+        @p2 bra BB2_2;
+        cvt.rzi.f32.f32 f9, f4;
+BB2_2:
         mov.f32 $0, f9;
       }", "=f,f"(float %0) nounwind readnone alwaysinline
   ret float %2
 }
+define float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  %float_to_int_bitcast.i.i.i.i = bitcast float %0 to <1 x i32>
+  %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
+  %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
+  %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
+  %binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
+  %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
+  %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to float
+  ret float %int_to_float_bitcast.i.i.i
+}
 define float @__floor_uniform_float(float) nounwind readnone alwaysinline
 {
   %2 = tail call float asm sideeffect
       "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone
@@ -501,18 +520,6 @@ define double @__ceil_uniform_double(double) nounwind readnone alwaysinline
   ret double %2
 }
-define <1 x float> @__round_varying_floatX(<1 x float>) nounwind readonly alwaysinline {
-  %float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
-  %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
-  %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
-  %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
-  %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
-  %binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
-  %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
-  %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
-  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
-  ret <1 x float> %int_to_float_bitcast.i.i.i
-}
 define <1 x float> @__floor_varying_floatX(<1 x float>) nounwind readonly alwaysinline {
   %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
   %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
@@ -1363,10 +1370,92 @@ define <1 x float> @__exclusive_scan_add_float(<1 x float>, <1 x i1>) nounwind r
   %retv = insertelement <1 x float> undef, float %rets, i32 0
   ret <1 x float> %retv
 }
-declare <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline
-declare <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
-declare <1 x i64> @__exclusive_scan_and_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
-declare <1 x i64> @__exclusive_scan_or_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
+define double @__shfl_scan_add_step_double(double %partial, i32 %up_offset) nounwind readnone alwaysinline
+{
+  %result = tail call double asm sideeffect
+      "{.reg .s32 r<10>;
+        .reg .f64 fd0;
+        .reg .pred p;
+        .reg .b32 temp;
+        mov.b64 {r1,temp}, $1;
+        mov.b64 {temp,r2}, $1;
+        shfl.up.b32 r3, r1, $2, 0;
+        shfl.up.b32 r4|p, r2, $2, 0;
+        mov.b64 fd0, {r3,r4};
+    @p  add.f64 fd0, fd0, $3;
+        mov.f64 $0, fd0;
+       }", "=d,d,r,d"(double %partial, i32 %up_offset, double %partial) nounwind readnone alwaysinline
+  ret double %result;
+}
+define <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %v0 = extractelement <1 x double> %0, i32 0
+  %mask = extractelement <1 x i1> %1, i32 0
+  %v = select i1 %mask, double %v0, double zeroinitializer
+
+  %s1 = tail call double @__shfl_scan_add_step_double(double %v,  i32  1);
+  %s2 = tail call double @__shfl_scan_add_step_double(double %s1, i32  2);
+  %s3 = tail call double @__shfl_scan_add_step_double(double %s2, i32  4);
+  %s4 = tail call double @__shfl_scan_add_step_double(double %s3, i32  8);
+  %s5 = tail call double @__shfl_scan_add_step_double(double %s4, i32 16);
+  %rets = fsub double %s5, %v
+  %retv = bitcast double %rets to <1 x double>
+  ret <1 x double> %retv
+}
+
+define i64 @__shfl_scan_add_step_i64(i64 %partial, i32 %up_offset) nounwind readnone alwaysinline
+{
+  %result = tail call i64 asm sideeffect
+      "{.reg .s32 r<10>;
+        .reg .s64 rl0;
+        .reg .pred p;
+        .reg .b32 temp;
+        mov.b64 {r1,temp}, $1;
+        mov.b64 {temp,r2}, $1;
+        shfl.up.b32 r3, r1, $2, 0;
+        shfl.up.b32 r4|p, r2, $2, 0;
+        mov.b64 rl0, {r3,r4};
+    @p  add.s64 rl0, rl0, $3;
+        mov.s64 $0, rl0;
+       }", "=l,l,r,l"(i64 %partial, i32 %up_offset, i64 %partial) nounwind readnone alwaysinline
+  ret i64 %result;
+}
+define <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %v0 = extractelement <1 x i64> %0, i32 0
+  %mask = extractelement <1 x i1> %1, i32 0
+  %v = select i1 %mask, i64 %v0, i64 zeroinitializer
+
+  %s1 = tail call i64 @__shfl_scan_add_step_i64(i64 %v,  i32  1);
+  %s2 = tail call i64 @__shfl_scan_add_step_i64(i64 %s1, i32  2);
+  %s3 = tail call i64 @__shfl_scan_add_step_i64(i64 %s2, i32  4);
+  %s4 = tail call i64 @__shfl_scan_add_step_i64(i64 %s3, i32  8);
+  %s5 = tail call i64 @__shfl_scan_add_step_i64(i64 %s4, i32 16);
+  %rets = sub i64 %s5, %v
+  %retv = bitcast i64 %rets to <1 x i64>
+  ret <1 x i64> %retv
+}
+
+define(`exclusive_scan_i64',`
+define <1 x i64> @__exclusive_scan_$1_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %v = bitcast <1 x i64> %0 to <2 x i32>
+  %v0 = extractelement <2 x i32> %v, i32 0
+  %v1 = extractelement <2 x i32> %v, i32 1
+  %inp0 = bitcast i32 %v0 to <1 x i32>
+  %inp1 = bitcast i32 %v1 to <1 x i32>
+  %res0 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp0, <1 x i1> %1);
+  %res1 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp1, <1 x i1> %1);
+  %r0 = bitcast <1 x i32> %res0 to i32
+  %r1 = bitcast <1 x i32> %res1 to i32
+  %ret0 = insertelement <2 x i32> undef, i32 %r0, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %r1, i32 1
+  %ret = bitcast <2 x i32> %ret1 to <1 x i64>
+  ret <1 x i64> %ret
+}
+')
+exclusive_scan_i64(or)
+exclusive_scan_i64(and)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
@@ -1530,9 +1619,19 @@ define void @__do_assert_uniform(i8 *%str, i1 %test, <1 x i1> %mask) {
   br i1 %test, label %ok, label %fail
 
 fail:
+  %lane = call i32 @__laneidx()
+  %cmp = icmp eq i32 %lane, 0
+  br i1 %cmp, label %fail_print, label %fail_void;
+
+
+
+fail_print:
   call void @__abort_nvptx(i8* %str) noreturn
   unreachable
 
+fail_void:
+  unreachable
+
 ok:
   ret void
 }
@@ -1554,3 +1653,9 @@ fail:
 ok:
   ret void
 }
+
+define i64 @__clock() nounwind alwaysinline {
+  %r = call i64 asm sideeffect "mov.b64 $0, %clock64;", "=l"();
+  ret i64 %r
+}
+
diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4
index 65577454..7bb1014b 100644
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
@@ -2372,13 +2372,6 @@ m4exit(`1')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; read hw clock
 
-declare i64 @llvm.readcyclecounter()
-
-define i64 @__clock() nounwind {
-  %r = call i64 @llvm.readcyclecounter()
-  ret i64 %r
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stdlib transcendentals
 ;;
diff --git a/module.cpp b/module.cpp
index 3536c10f..8a3f3507 100644
--- a/module.cpp
+++ b/module.cpp
@@ -2134,14 +2134,14 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
         opts.addMacroDef("cwhile=while");
         opts.addMacroDef("ccontinue=continue");
         opts.addMacroDef("cdo=do");
-        opts.addMacroDef("taskIndex=blockIndex0()");
-        opts.addMacroDef("taskCount=blockCount0()");
         opts.addMacroDef("taskIndex0=blockIndex0()");
         opts.addMacroDef("taskCount0=blockCount0()");
         opts.addMacroDef("taskIndex1=blockIndex1()");
         opts.addMacroDef("taskCount1=blockCount1()");
opts.addMacroDef("taskIndex2=blockIndex2()"); opts.addMacroDef("taskCount2=blockCount2()"); + opts.addMacroDef("taskIndex=(taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2))"); + opts.addMacroDef("taskCount=(taskCount0*taskCount1*taskCount2)"); } #if defined(LLVM_3_1) diff --git a/stdlib.ispc b/stdlib.ispc index 6ce656fe..2d79bf33 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1301,6 +1301,9 @@ packed_store_active2(uniform int a[], int vals) { // System information static inline uniform int num_cores() { + if (__is_nvptx_target) + return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx) + else return __num_cores(); } diff --git a/tests/array-mixed-unif-vary-indexing-3.ispc b/tests/array-mixed-unif-vary-indexing-3.ispc index ab3a7a7c..c6623cf6 100644 --- a/tests/array-mixed-unif-vary-indexing-3.ispc +++ b/tests/array-mixed-unif-vary-indexing-3.ispc @@ -5,7 +5,13 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; assert(programCount <= 64); +#ifdef __NVPTX__ + uniform float * uniform xarr = uniform new uniform float[70*70]; + uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr; +#define _SHMALLOC +#else uniform float x[70][70]; +#endif for (uniform int i = 0; i < 70; ++i) for (uniform int j = 0; j < 70; ++j) x[i][j] = 2+b-5; @@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { else x[b-1][a-1] = 1; RET[programIndex] = x[4][a]; + +#ifdef _SHMALLOC + delete xarr; +#endif } export void result(uniform float RET[]) { diff --git a/tests/launch-8.ispc b/tests/launch-8.ispc index eacba673..dcc2b6b9 100644 --- a/tests/launch-8.ispc +++ b/tests/launch-8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } -#define N0 10 +#define N0 12 #define N1 20 #define N2 50 static uniform float array[N2][N1][N0]; @@ -10,14 +10,14 @@ static uniform float array[N2][N1][N0]; task void x(const float f) { uniform int j; - assert(taskCount == (int32)N0*N1*N2); - assert(taskCount0 == (int32)N0); - assert(taskCount1 == (int32)N1); - assert(taskCount2 == (int32)N2); - assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2)); - assert(taskIndex0 < (int32)N0); - assert(taskIndex1 < (int32)N1); - assert(taskIndex2 < (int32)N2); + assert(taskCount == (uniform int32)N0*N1*N2); + assert(taskCount0 == (uniform int32)N0); + assert(taskCount1 == (uniform int32)N1); + assert(taskCount2 == (uniform int32)N2); + assert(taskIndex == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2)); + assert(taskIndex0 < (uniform int32)N0); + assert(taskIndex1 < (uniform int32)N1); + assert(taskIndex2 < (uniform int32)N2); const uniform int i0 = taskIndex0; const uniform int i1 = taskIndex1; @@ -38,5 +38,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) { export void result(uniform float RET[]) { - RET[programIndex] = 9999.000000; + RET[programIndex] = 11999.000000; } diff --git a/tests/launch-9.ispc b/tests/launch-9.ispc index 1952e8e7..be9e4881 100644 --- a/tests/launch-9.ispc +++ b/tests/launch-9.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } -#define N0 10 +#define N0 12 #define N1 20 #define N2 50 static uniform float array[N2][N1][N0]; @@ -38,5 +38,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) { export void result(uniform float RET[]) { - RET[programIndex] = 9999.000000; + RET[programIndex] = 
 }
diff --git a/tests/test-141.ispc b/tests/test-141.ispc
index b69be1fa..9045c081 100644
--- a/tests/test-141.ispc
+++ b/tests/test-141.ispc
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
     // calculation error 1e-6 is the same as in icc
-    RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0;
+    RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 1 : 0;
 }
 
 export void result(uniform float RET[4]) {
diff --git a/tests/test-142.ispc b/tests/test-142.ispc
index 18053402..9ab8ff9f 100644
--- a/tests/test-142.ispc
+++ b/tests/test-142.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
-    RET[programIndex] = round(a+.499999);
+    RET[programIndex] = round(a+.49999);
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/test-144.ispc b/tests/test-144.ispc
index 568bdc10..64e1817a 100644
--- a/tests/test-144.ispc
+++ b/tests/test-144.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
-    RET[programIndex] = floor(a+.999999);
+    RET[programIndex] = floor(a+.99999);
 }
 
 export void result(uniform float RET[]) {
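
-- 
Notes (outside the patch; git-am ignores text after the last hunk):

On the __round_uniform_float bit trick above: 8.388608e+06 is 2^23. A float
has 23 mantissa bits, so for 0 <= x < 2^23 the sum x + 2^23 has no room left
for fraction bits, and (x + 2^23) - 2^23 therefore yields x rounded to an
integer. The sign bit is masked off before the adds and xor'ed back in at the
end, which extends the trick to negative inputs.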
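
The 64-bit scans added to target-nvptx.ll back the existing stdlib entry
points (exclusive_scan_add and friends), so no new user-facing API appears.
A minimal ISPC sketch that would exercise the int64 path (the function and
array names here are hypothetical):

export void scan64(uniform int64 out[], const uniform int64 in[]) {
    int64 v = in[programIndex];
    // Exclusive scan: lane i receives the sum of lanes 0..i-1; lane 0 gets 0.
    out[programIndex] = exclusive_scan_add(v);
}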
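
The module.cpp hunk redefines taskIndex as a row-major flattening of the
three launch dimensions instead of an alias for taskIndex0, and taskCount as
the total task count. A minimal ISPC sketch of the new semantics (the kernel
and array names are hypothetical):

task void fill(uniform float out[]) {
    // Same row-major flattening the new taskIndex macro expands to.
    uniform int flat = taskIndex0
                     + taskCount0 * (taskIndex1 + taskCount1 * taskIndex2);
    assert(flat == taskIndex);   // holds by construction
    out[taskIndex] = taskIndex;  // one slot per flattened task
}

export void run(uniform float out[]) {
    launch[4, 2, 3] fill(out);   // inside fill(), taskCount == 4*2*3 == 24
}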