diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index 4dab86c4..e93c65fe 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -1062,6 +1062,69 @@ shuffle1(i64)
 shuffle1(float)
 shuffle1(double)
 
+define(`shuffle2',`;; Two-source permute over a 32-lane warp: requested positions 0..31 read %0, positions 32..63 read %1.
+define <1 x $1> @__shuffle2_$1(<1 x $1>, <1 x $1>, <1 x i32>) nounwind readnone alwaysinline {
+  %val1 = extractelement <1 x $1> %0, i32 0
+  %val2 = extractelement <1 x $1> %1, i32 0
+  %lane = extractelement <1 x i32> %2, i32 0
+  %lane_mask = and i32 %lane, 31              ;; source lane id inside the warp
+  %ret1 = tail call $1 @__shfl_$1_nvptx($1 %val1, i32 %lane_mask)  ;; first source as held by the source lane
+  %ret2 = tail call $1 @__shfl_$1_nvptx($1 %val2, i32 %lane_mask)  ;; second source as held by the source lane
+  %c    = icmp slt i32 %lane, 32              ;; decided by the position THIS lane asked for, not by the source lane
+  %rets = select i1 %c, $1 %ret1, $1 %ret2
+  %retv = insertelement <1 x $1> undef, $1 %rets, i32 0
+  ret <1 x $1> %retv
+}
+')
+shuffle2(i8)
+shuffle2(i16)
+shuffle2(i32)
+shuffle2(i64)
+shuffle2(float)
+shuffle2(double)
+
+define(`shift',`
+;; Each lane returns the operand held by the lane %1 positions away, or zero when that lane id is outside 0..31.
+define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline {
+  %thread   = tail call i32 @__tid_x()
+  %self     = and i32 %thread, 31
+  %from     = add i32 %self, %1
+  %in_lo    = icmp sge i32 %from, 0
+  %in_hi    = icmp slt i32 %from, 32
+  %in_range = and i1 %in_lo, %in_hi
+  %mine     = extractelement <1 x $1> %0, i32 0
+  %fetched  = tail call $1 @__shfl_$1_nvptx($1 %mine, i32 %from)
+  %picked   = select i1 %in_range, $1 %fetched, $1 zeroinitializer
+  %result   = insertelement <1 x $1> undef, $1 %picked, i32 0
+  ret <1 x $1> %result
+}
+')
+shift(i8)
+shift(i16)
+shift(i32)
+shift(i64)
+shift(float)
+shift(double)
+
+define(`rotate', `
+;; Cyclic variant: each lane returns the operand held by lane (tid.x + %1) mod 32, so nothing is zero-filled.
+define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline {
+  %thread  = tail call i32 @__tid_x()
+  %moved   = add i32 %thread, %1
+  %from    = and i32 %moved, 31
+  %mine    = extractelement <1 x $1> %0, i32 0
+  %fetched = tail call $1 @__shfl_$1_nvptx($1 %mine, i32 %from)
+  %result  = insertelement <1 x $1> undef, $1 %fetched, i32 0
+  ret <1 x $1> %result
+}
+')
+rotate(i8)
+rotate(i16)
+rotate(i32)
+rotate(i64)
+rotate(float)
+rotate(double)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4
index 76fc7f2b..aa3d4e82 100644
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
@@ -764,97 +764,6 @@ define <WIDTH x $1> @__broadcast_$1(<WIDTH x $1>, i32) nounwind readnone alwaysi
   ret <WIDTH x $1> %broadcast
 }
 
-define <WIDTH x $1> @__rotate_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
-  %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
-  br i1 %isc, label %is_const, label %not_const
-
-is_const:
-  ; though verbose, this turms into tight code if %1 is a constant
-forloop(i, 0, eval(WIDTH-1), `  
-  %delta_`'i = add i32 %1, i
-  %delta_clamped_`'i = and i32 %delta_`'i, eval(WIDTH-1)
-  %v_`'i = extractelement <WIDTH x $1> %0, i32 %delta_clamped_`'i')
-
-  %ret_0 = insertelement <WIDTH x $1> undef, $1 %v_0, i32 0
-forloop(i, 1, eval(WIDTH-1), `  %ret_`'i = insertelement <WIDTH x $1> %ret_`'eval(i-1), $1 %v_`'i, i32 i
-')
-  ret <WIDTH x $1> %ret_`'eval(WIDTH-1)
-
-not_const:
-  ; store two instances of the vector into memory
-  %ptr = alloca <WIDTH x $1>, i32 2
-  %ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
-  store <WIDTH x $1> %0, <WIDTH x $1> * %ptr0
-  %ptr1 = getelementptr <WIDTH x $1> * %ptr, i32 1
-  store <WIDTH x $1> %0, <WIDTH x $1> * %ptr1
-
-  ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector
-  %offset = and i32 %1, eval(WIDTH-1)
-  %ptr_as_elt_array = bitcast <WIDTH x $1> * %ptr to [eval(2*WIDTH) x $1] *
-  %load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
-  %load_ptr_vec = bitcast $1 * %load_ptr to <WIDTH x $1> *
-  %result = load <WIDTH x $1> * %load_ptr_vec, align $2
-  ret <WIDTH x $1> %result
-}
-
-define <WIDTH x $1> @__shift_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
-  %ptr = alloca <WIDTH x $1>, i32 3
-  %ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
-  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr0
-  %ptr1 = getelementptr <WIDTH x $1> * %ptr, i32 1
-  store <WIDTH x $1> %0, <WIDTH x $1> * %ptr1
-  %ptr2 = getelementptr <WIDTH x $1> * %ptr, i32 2
-  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr2
-
-  %offset = add i32 %1, WIDTH
-  %ptr_as_elt_array = bitcast <WIDTH x $1> * %ptr to [eval(3*WIDTH) x $1] *
-  %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
-  %load_ptr_vec = bitcast $1 * %load_ptr to <WIDTH x $1> *
-  %result = load <WIDTH x $1> * %load_ptr_vec, align $2
-  ret <WIDTH x $1> %result
-}
-
-
-define <WIDTH x $1> @__shuffle2_$1(<WIDTH x $1>, <WIDTH x $1>, <WIDTH x i32>) nounwind readnone alwaysinline {
-  %v2 = shufflevector <WIDTH x $1> %0, <WIDTH x $1> %1, <eval(2*WIDTH) x i32> <
-      forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1)
-  >
-forloop(i, 0, eval(WIDTH-1), `  
-  %index_`'i = extractelement <WIDTH x i32> %2, i32 i')
-
-  %isc = call i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32> %2)
-  br i1 %isc, label %is_const, label %not_const
-
-is_const:
-  ; extract from the requested lanes and insert into the result; LLVM turns
-  ; this into good code in the end
-forloop(i, 0, eval(WIDTH-1), `  
-  %v_`'i = extractelement <eval(2*WIDTH) x $1> %v2, i32 %index_`'i')
-
-  %ret_0 = insertelement <WIDTH x $1> undef, $1 %v_0, i32 0
-forloop(i, 1, eval(WIDTH-1), `  %ret_`'i = insertelement <WIDTH x $1> %ret_`'eval(i-1), $1 %v_`'i, i32 i
-')
-  ret <WIDTH x $1> %ret_`'eval(WIDTH-1)
-
-not_const:
-  ; otherwise store the two vectors onto the stack and then use the given
-  ; permutation vector to get indices into that array...
-  %ptr = alloca <eval(2*WIDTH) x $1>
-  store <eval(2*WIDTH) x $1> %v2, <eval(2*WIDTH) x $1> * %ptr
-  %baseptr = bitcast <eval(2*WIDTH) x $1> * %ptr to $1 *
-
-  %ptr_0 = getelementptr $1 * %baseptr, i32 %index_0
-  %val_0 = load $1 * %ptr_0
-  %result_0 = insertelement <WIDTH x $1> undef, $1 %val_0, i32 0
-
-forloop(i, 1, eval(WIDTH-1), `  
-  %ptr_`'i = getelementptr $1 * %baseptr, i32 %index_`'i
-  %val_`'i = load $1 * %ptr_`'i
-  %result_`'i = insertelement <WIDTH x $1> %result_`'eval(i-1), $1 %val_`'i, i32 i
-')
-
-  ret <WIDTH x $1> %result_`'eval(WIDTH-1)
-}
 ')
 
 define(`define_shuffles',`
diff --git a/opt.cpp b/opt.cpp
index 8702ca38..b85e171f 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -586,7 +586,10 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createGlobalOptimizerPass());
         optPM.add(llvm::createReassociatePass());
         optPM.add(llvm::createIPConstantPropagationPass());
-        optPM.add(CreateReplaceStdlibShiftPass(),229);
+
+        if (g->target->getISA() != Target::NVPTX)
+          optPM.add(CreateReplaceStdlibShiftPass(),229);
+
         optPM.add(llvm::createDeadArgEliminationPass(),230);
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createCFGSimplificationPass());