diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index 4dab86c4..e93c65fe 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -1062,6 +1062,82 @@ shuffle1(i64)
 shuffle1(float)
 shuffle1(double)
 
+;; Two-operand permute.  The i32 in %2 picks the element returned to the
+;; calling lane: values below 32 read the first operand, all other values
+;; read the second, and the low five bits name the source lane.
+define(`shuffle2',`
+define <1 x $1> @__shuffle2_$1(<1 x $1>, <1 x $1>, <1 x i32>) nounwind readnone alwaysinline
+{
+  %val1 = extractelement <1 x $1> %0, i32 0
+  %val2 = extractelement <1 x $1> %1, i32 0
+  %lane = extractelement <1 x i32> %2, i32 0
+  %lane_mask = and i32 %lane, 31
+  ;; Fetch both operands from the source lane.  The requesting lane owns
+  ;; the selector, so the choice between them must happen after the
+  ;; exchange instead of being made by the source lane beforehand.
+  %ret1 = tail call $1 @__shfl_$1_nvptx($1 %val1, i32 %lane_mask)
+  %ret2 = tail call $1 @__shfl_$1_nvptx($1 %val2, i32 %lane_mask)
+  %c = icmp slt i32 %lane, 32
+  %rets = select i1 %c, $1 %ret1, $1 %ret2
+  %retv = insertelement <1 x $1> undef, $1 %rets, i32 0
+  ret <1 x $1> %retv
+}
+')
+shuffle2(i8)
+shuffle2(i16)
+shuffle2(i32)
+shuffle2(i64)
+shuffle2(float)
+shuffle2(double)
+
+;; Lane slide: lane n receives the value held by lane n + %1, or zero when
+;; that source falls outside 0..31.
+define(`shift',`
+define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline
+{
+  %val = extractelement <1 x $1> %0, i32 0
+  %tid = tail call i32 @__tid_x()
+  %lane = and i32 %tid, 31
+  %src = add i32 %lane, %1
+  %ret = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %src)
+  ;; %src may lie outside 0..31 here; the fetched value is discarded
+  ;; below in that case.
+  %c1 = icmp sge i32 %src, 0
+  %c2 = icmp slt i32 %src, 32
+  %c = and i1 %c1, %c2
+  %rets = select i1 %c, $1 %ret, $1 zeroinitializer
+  %retv = insertelement <1 x $1> undef, $1 %rets, i32 0
+  ret <1 x $1> %retv
+}
+')
+shift(i8)
+shift(i16)
+shift(i32)
+shift(i64)
+shift(float)
+shift(double)
+
+;; Cyclic variant: lane n receives the value held by lane n + %1, taken modulo 32.
+define(`rotate', `
+define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline
+{
+  %val = extractelement <1 x $1> %0, i32 0
+  %tid = tail call i32 @__tid_x()
+  %src = add i32 %tid, %1
+  ;; Masking after the add wraps the source lane into 0..31.
+  %lane = and i32 %src, 31
+  %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane)
+  %retv = insertelement <1 x $1> undef, $1 %rets, i32 0
+  ret <1 x $1> %retv
+}
+')
+rotate(i8)
+rotate(i16)
+rotate(i32)
+rotate(i64)
+rotate(float)
+rotate(double)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
diff --git
a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4 index 76fc7f2b..aa3d4e82 100644 --- a/builtins/util-nvptx.m4 +++ b/builtins/util-nvptx.m4 @@ -764,97 +764,6 @@ define @__broadcast_$1(, i32) nounwind readnone alwaysi ret %broadcast } -define @__rotate_$1(, i32) nounwind readnone alwaysinline { - %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1) - br i1 %isc, label %is_const, label %not_const - -is_const: - ; though verbose, this turms into tight code if %1 is a constant -forloop(i, 0, eval(WIDTH-1), ` - %delta_`'i = add i32 %1, i - %delta_clamped_`'i = and i32 %delta_`'i, eval(WIDTH-1) - %v_`'i = extractelement %0, i32 %delta_clamped_`'i') - - %ret_0 = insertelement undef, $1 %v_0, i32 0 -forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i -') - ret %ret_`'eval(WIDTH-1) - -not_const: - ; store two instances of the vector into memory - %ptr = alloca , i32 2 - %ptr0 = getelementptr * %ptr, i32 0 - store %0, * %ptr0 - %ptr1 = getelementptr * %ptr, i32 1 - store %0, * %ptr1 - - ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector - %offset = and i32 %1, eval(WIDTH-1) - %ptr_as_elt_array = bitcast * %ptr to [eval(2*WIDTH) x $1] * - %load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset - %load_ptr_vec = bitcast $1 * %load_ptr to * - %result = load * %load_ptr_vec, align $2 - ret %result -} - -define @__shift_$1(, i32) nounwind readnone alwaysinline { - %ptr = alloca , i32 3 - %ptr0 = getelementptr * %ptr, i32 0 - store zeroinitializer, * %ptr0 - %ptr1 = getelementptr * %ptr, i32 1 - store %0, * %ptr1 - %ptr2 = getelementptr * %ptr, i32 2 - store zeroinitializer, * %ptr2 - - %offset = add i32 %1, WIDTH - %ptr_as_elt_array = bitcast * %ptr to [eval(3*WIDTH) x $1] * - %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset - %load_ptr_vec = bitcast $1 * %load_ptr to * - %result = load * %load_ptr_vec, align $2 - ret %result -} - - 
-define @__shuffle2_$1(, , ) nounwind readnone alwaysinline { - %v2 = shufflevector %0, %1, < - forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1) - > -forloop(i, 0, eval(WIDTH-1), ` - %index_`'i = extractelement %2, i32 i') - - %isc = call i1 @__is_compile_time_constant_varying_int32( %2) - br i1 %isc, label %is_const, label %not_const - -is_const: - ; extract from the requested lanes and insert into the result; LLVM turns - ; this into good code in the end -forloop(i, 0, eval(WIDTH-1), ` - %v_`'i = extractelement %v2, i32 %index_`'i') - - %ret_0 = insertelement undef, $1 %v_0, i32 0 -forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i -') - ret %ret_`'eval(WIDTH-1) - -not_const: - ; otherwise store the two vectors onto the stack and then use the given - ; permutation vector to get indices into that array... - %ptr = alloca - store %v2, * %ptr - %baseptr = bitcast * %ptr to $1 * - - %ptr_0 = getelementptr $1 * %baseptr, i32 %index_0 - %val_0 = load $1 * %ptr_0 - %result_0 = insertelement undef, $1 %val_0, i32 0 - -forloop(i, 1, eval(WIDTH-1), ` - %ptr_`'i = getelementptr $1 * %baseptr, i32 %index_`'i - %val_`'i = load $1 * %ptr_`'i - %result_`'i = insertelement %result_`'eval(i-1), $1 %val_`'i, i32 i -') - - ret %result_`'eval(WIDTH-1) -} ') define(`define_shuffles',` diff --git a/opt.cpp b/opt.cpp index 8702ca38..b85e171f 100644 --- a/opt.cpp +++ b/opt.cpp @@ -586,7 +586,10 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createGlobalOptimizerPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createIPConstantPropagationPass()); - optPM.add(CreateReplaceStdlibShiftPass(),229); + + if (g->target->getISA() != Target::NVPTX) + optPM.add(CreateReplaceStdlibShiftPass(),229); + optPM.add(llvm::createDeadArgEliminationPass(),230); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass());