diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index a2fdacd6..d631f35d 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -1141,6 +1141,120 @@ broadcast(i64)
 broadcast(float)
 broadcast(double)
 
+define i32 @__shfl_scan_add_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline
+{
+  %result = tail call i32 asm sideeffect
+      "{.reg .u32 r0;
+        .reg .pred p;
+        shfl.up.b32 r0|p, $1, $2, 0;
+        @p add.u32 r0, r0, $3;
+        mov.u32 $0, r0;
+       }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline
+  ret i32 %result;
+}
+define <1 x i32> @__exclusive_scan_add_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %v0 = extractelement <1 x i32> %0, i32 0
+  %mask = extractelement <1 x i1 > %1, i32 0
+  %v = select i1 %mask, i32 %v0, i32 0
+
+  %s1 = tail call i32 @__shfl_scan_add_step_i32(i32 %v,  i32  1);
+  %s2 = tail call i32 @__shfl_scan_add_step_i32(i32 %s1, i32  2);
+  %s3 = tail call i32 @__shfl_scan_add_step_i32(i32 %s2, i32  4);
+  %s4 = tail call i32 @__shfl_scan_add_step_i32(i32 %s3, i32  8);
+  %s5 = tail call i32 @__shfl_scan_add_step_i32(i32 %s4, i32 16);
+  %rets = sub i32 %s5, %v
+  %retv = insertelement <1 x i32> undef, i32 %rets, i32 0
+  ret <1 x i32> %retv
+}
+;;
+define i32 @__shfl_scan_or_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline
+{
+  %result = tail call i32 asm sideeffect
+      "{.reg .u32 r0;
+        .reg .pred p;
+        shfl.up.b32 r0|p, $1, $2, 0;
+        @p or.b32 r0, r0, $3;
+        mov.u32 $0, r0;
+       }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline
+  ret i32 %result;
+}
+define <1 x i32> @__exclusive_scan_or_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %shft = tail call <1 x i32> @__shift_i32(<1 x i32> %0, i32 -1)
+  %v0 = extractelement <1 x i32> %shft, i32 0
+  %mask = extractelement <1 x i1 > %1, i32 0
+  %v = select i1 %mask, i32 %v0, i32 0
+
+  %s1 = tail call i32 @__shfl_scan_or_step_i32(i32 %v,  i32  1);
+  %s2 = tail call i32 @__shfl_scan_or_step_i32(i32 %s1, i32  2);
+  %s3 = tail call i32 @__shfl_scan_or_step_i32(i32 %s2, i32  4);
+  %s4 = tail call i32 @__shfl_scan_or_step_i32(i32 %s3, i32  8);
+  %s5 = tail call i32 @__shfl_scan_or_step_i32(i32 %s4, i32 16);
+  %retv = insertelement <1 x i32> undef, i32 %s5, i32 0
+  ret <1 x i32> %retv
+}
+;;
+define i32 @__shfl_scan_and_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline
+{
+  %result = tail call i32 asm sideeffect
+      "{.reg .u32 r0;
+        .reg .pred p;
+        shfl.up.b32 r0|p, $1, $2, 0;
+        @p and.b32 r0, r0, $3;
+        mov.u32 $0, r0;
+       }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline
+  ret i32 %result;
+}
+define <1 x i32> @__exclusive_scan_and_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %shft = tail call <1 x i32> @__shift_i32(<1 x i32> %0, i32 -1)
+  %v0 = extractelement <1 x i32> %shft, i32 0
+  %m0 = extractelement <1 x i1 > %1, i32 0
+
+  %tid = tail call i32 @__tid_x()
+  %lane = and i32 %tid, 31
+  %m1 = icmp ne i32 %lane, 0
+
+  %mask = and i1 %m0, %m1
+  %v = select i1 %mask, i32 %v0, i32 -1
+
+  %s1 = tail call i32 @__shfl_scan_and_step_i32(i32 %v,  i32  1);
+  %s2 = tail call i32 @__shfl_scan_and_step_i32(i32 %s1, i32  2);
+  %s3 = tail call i32 @__shfl_scan_and_step_i32(i32 %s2, i32  4);
+  %s4 = tail call i32 @__shfl_scan_and_step_i32(i32 %s3, i32  8);
+  %s5 = tail call i32 @__shfl_scan_and_step_i32(i32 %s4, i32 16);
+  %retv = insertelement <1 x i32> undef, i32 %s5, i32 0
+  ret <1 x i32> %retv
+}
+
+define float @__shfl_scan_add_step_float(float %partial, i32 %up_offset) nounwind readnone alwaysinline
+{
+  %result = tail call float asm sideeffect
+      "{.reg .f32 f0;
+        .reg .pred p;
+        shfl.up.b32 f0|p, $1, $2, 0;
+        @p add.f32 f0, f0, $3;
+        mov.f32 $0, f0;
+       }", "=f,f,r,f"(float %partial, i32 %up_offset, float %partial) nounwind readnone alwaysinline
+  ret float %result;
+}
+define <1 x float> @__exclusive_scan_add_float(<1 x float>, <1 x i1>) nounwind readnone alwaysinline
+{
+  %v0 = extractelement <1 x float> %0, i32 0
+  %mask = extractelement <1 x i1 > %1, i32 0
+  %v = select i1 %mask, float %v0, float zeroinitializer
+
+  %s1 = tail call float @__shfl_scan_add_step_float(float %v,  i32  1);
+  %s2 = tail call float @__shfl_scan_add_step_float(float %s1, i32  2);
+  %s3 = tail call float @__shfl_scan_add_step_float(float %s2, i32  4);
+  %s4 = tail call float @__shfl_scan_add_step_float(float %s3, i32  8);
+  %s5 = tail call float @__shfl_scan_add_step_float(float %s4, i32 16);
+  %rets = fsub float %s5, %v
+  %retv = insertelement <1 x float> undef, float %rets, i32 0
+  ret <1 x float> %retv
+}
+
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4
index ef7ce3f8..11bd0997 100644
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
@@ -3825,15 +3825,11 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v,
 ')
 
 define(`scans', `
-exclusive_scan(WIDTH, i32, 32, add, 0, add_i32)
-exclusive_scan(WIDTH, float, 32, fadd, zeroinitializer, add_float)
 exclusive_scan(WIDTH, i64, 64, add, 0, add_i64)
 exclusive_scan(WIDTH, double, 64, fadd, zeroinitializer, add_double)
-exclusive_scan(WIDTH, i32, 32, and, -1, and_i32)
 exclusive_scan(WIDTH, i64, 64, and, -1, and_i64)
-exclusive_scan(WIDTH, i32, 32, or, 0, or_i32)
 exclusive_scan(WIDTH, i64, 64, or, 0, or_i64)
 ')
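For reference (not part of the patch itself): each of the __exclusive_scan_* builtins added
above runs a Kogge-Stone scan across the 32 lanes of a warp, using shfl.up.b32 with strides
1, 2, 4, 8 and 16 plus a predicated accumulate. The add variants compute the inclusive scan
and then subtract the lane's own contribution to make it exclusive; the or/and variants
instead shift each lane's input to the next lane via __shift_i32 before scanning. A minimal
CUDA sketch of the same pattern follows, assuming a 32-lane warp and CUDA 9+ (__shfl_up_sync
stands in for the plain shfl.up used in the PTX); the helper name warp_exclusive_scan_add is
illustrative only, not an ispc or CUDA API.

// Hypothetical CUDA equivalent of the shfl.up-based exclusive add scan above.
__device__ int warp_exclusive_scan_add(int value, bool active) {
    int v = active ? value : 0;                // inactive lanes contribute the identity (0)
    int sum = v;
    // Kogge-Stone inclusive scan over the warp: strides 1, 2, 4, 8, 16.
    for (int offset = 1; offset < 32; offset <<= 1) {
        int up = __shfl_up_sync(0xffffffffu, sum, offset);
        if ((threadIdx.x & 31) >= offset)      // lanes below 'offset' have no source lane,
            sum += up;                         // matching the @p-predicated add in the PTX
    }
    return sum - v;                            // inclusive -> exclusive, like "sub i32 %s5, %v"
}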