# Patch summary (preamble text placed before the first "diff --git" header):
#
# builtins/target-nvptx.ll
#   Adds the m4 macro `shuffle1`, which emits `__shuffle_$1` operating on
#   1-wide vectors: it extracts element 0 of the value vector (%0) and of the
#   i32 index vector (%1), passes both scalars to `__shfl_$1_nvptx`, and
#   inserts the scalar result into element 0 of an otherwise-undef <1 x $1>
#   vector that is returned.  The macro is instantiated for i8, i16, i32,
#   i64, float and double.
#   NOTE(review): `__shfl_$1_nvptx` is not defined in the hunks shown here;
#   presumably it is provided elsewhere for every instantiated type -- TODO
#   confirm, in particular for i8/i16/i64/double.
#
# builtins/util-nvptx.m4
#   Removes the `forloop`-based `__shuffle_$1`, which extracted each lane
#   index from %1, used it to extract the matching element of %0, and rebuilt
#   the result with a chain of insertelement instructions.
#   NOTE(review): the vector type text in this hunk looks incomplete (e.g.
#   `define @__shuffle_$1(, )`); presumably the angle-bracketed types were
#   lost when this text was extracted -- verify against the real file before
#   trying to apply the patch.
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll index 5334f882..4dab86c4 100644 --- a/builtins/target-nvptx.ll +++ b/builtins/target-nvptx.ll @@ -1044,6 +1044,24 @@ define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue, } +;;;;;;;;;;; shuffle +define(`shuffle1', ` +define <1 x $1> @__shuffle_$1(<1 x $1>, <1 x i32>) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %lane = extractelement <1 x i32> %1, i32 0 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shuffle1(i8) +shuffle1(i16) +shuffle1(i32) +shuffle1(i64) +shuffle1(float) +shuffle1(double) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4 index c948cfc3..76fc7f2b 100644 --- a/builtins/util-nvptx.m4 +++ b/builtins/util-nvptx.m4 @@ -815,18 +815,6 @@ define @__shift_$1(, i32) nounwind readnone alwaysinlin } -define @__shuffle_$1(, ) nounwind readnone alwaysinline { -forloop(i, 0, eval(WIDTH-1), ` - %index_`'i = extractelement %1, i32 i') -forloop(i, 0, eval(WIDTH-1), ` - %v_`'i = extractelement %0, i32 %index_`'i') - - %ret_0 = insertelement undef, $1 %v_0, i32 0 -forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i -') - ret %ret_`'eval(WIDTH-1) -} - define @__shuffle2_$1(, , ) nounwind readnone alwaysinline { %v2 = shufflevector %0, %1, < forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1)