+fixed some example, found some bugs, and bugs in ptxas/cuda

2014-01-21 14:51:27 +01:00
parent 5a773ed62a
commit bc99897fbb
22 changed files with 98 additions and 58 deletions
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -653,9 +653,19 @@ define  i1 @__none(<1 x i1>) nounwind readnone alwaysinline {
 declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
 declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone

-define  float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
-  %r = extractelement <1 x float> %v, i32 0
-  ret float %r
+define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
+  %value = extractelement <1 x float> %v, i32 0
+  %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16)
+  %call1 = fadd float %call, %value 
+  %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8)
+  %call1.1 = fadd float %call1, %call.1 
+  %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4)
+  %call1.2 = fadd float %call1.1, %call.2
+  %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2)
+  %call1.3 = fadd float %call1.2, %call.3 
+  %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1)
+  %call1.4 = fadd float %call1.3, %call.4 
+  ret float %call1.4
 }

 define  float @__reduce_min_float(<1 x float>) nounwind readnone {