added shift
This commit is contained in:
@@ -1062,6 +1062,69 @@ shuffle1(i64)
|
||||
shuffle1(float)
|
||||
shuffle1(double)
|
||||
|
||||
;; Two-source shuffles for the NVPTX target, where varying values are
;; <1 x T> vectors and data moves between lanes through __shfl_T_nvptx.
;; A requested lane number below 32 reads from the first operand and any
;; other value reads from the second operand; only the low five bits of
;; the request choose the source lane.
define(`shuffle2',`
define <1 x $1> @__shuffle2_$1(<1 x $1>, <1 x $1>, <1 x i32>) nounwind readnone alwaysinline
{
  %val1 = extractelement <1 x $1> %0, i32 0
  %val2 = extractelement <1 x $1> %1, i32 0
  %lane = extractelement <1 x i32> %2, i32 0

  ;; Fetch both operands from the requested lane.  The choice between them
  ;; must be made by the receiving thread after the exchange.  Choosing
  ;; before the exchange would publish whichever operand this thread wants
  ;; for itself, so a reader would receive the operand picked by the source
  ;; lane instead of the one named by its own request.
  %lane_mask = and i32 %lane, 31
  %ret1 = tail call $1 @__shfl_$1_nvptx($1 %val1, i32 %lane_mask)
  %ret2 = tail call $1 @__shfl_$1_nvptx($1 %val2, i32 %lane_mask)

  ;; Now pick the operand that this thread actually asked for.
  %c = icmp slt i32 %lane, 32
  %rets = select i1 %c, $1 %ret1, $1 %ret2
  %retv = insertelement <1 x $1> undef, $1 %rets, i32 0
  ret <1 x $1> %retv
}
')
shuffle2(i8)
shuffle2(i16)
shuffle2(i32)
shuffle2(i64)
shuffle2(float)
shuffle2(double)
|
||||
|
||||
;; Lane shifting for the NVPTX target: every thread reads the value held
;; by the lane that sits %1 positions away from its own lane within a
;; 32-lane group, and receives zero when that position is outside 0..31.
define(`shift',`
define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline
{
  ;; Own lane number is the low five bits of the thread id.
  %thread = tail call i32 @__tid_x()
  %self = and i32 %thread, 31
  %from = add i32 %self, %1

  ;; One unsigned compare is equivalent to 0 <= from and from < 32.
  %in_group = icmp ult i32 %from, 32

  ;; The exchange is issued unconditionally; a result fetched with an
  ;; out-of-range source is thrown away by the select below.
  %mine = extractelement <1 x $1> %0, i32 0
  %fetched = tail call $1 @__shfl_$1_nvptx($1 %mine, i32 %from)
  %picked = select i1 %in_group, $1 %fetched, $1 zeroinitializer

  %packed = insertelement <1 x $1> undef, $1 %picked, i32 0
  ret <1 x $1> %packed
}
')
shift(i8)
shift(i16)
shift(i32)
shift(i64)
shift(float)
shift(double)
|
||||
|
||||
;; Lane rotation for the NVPTX target: every thread reads the value held
;; by the lane that sits %1 positions away from it, wrapping around
;; within a group of 32 lanes.
define(`rotate', `
define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline
{
  %thread = tail call i32 @__tid_x()
  %moved = add i32 %1, %thread
  ;; Keeping only the low five bits wraps the source into 0..31.
  %from = and i32 %moved, 31

  %mine = extractelement <1 x $1> %0, i32 0
  %fetched = tail call $1 @__shfl_$1_nvptx($1 %mine, i32 %from)

  %packed = insertelement <1 x $1> undef, $1 %fetched, i32 0
  ret <1 x $1> %packed
}
')
rotate(i8)
rotate(i16)
rotate(i32)
rotate(i64)
rotate(float)
rotate(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
|
||||
@@ -764,97 +764,6 @@ define <WIDTH x $1> @__broadcast_$1(<WIDTH x $1>, i32) nounwind readnone alwaysi
|
||||
ret <WIDTH x $1> %broadcast
|
||||
}
|
||||
|
||||
define <WIDTH x $1> @__rotate_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
  ; Choose the lowering depending on whether the amount in %1 is known
  ; at compile time.
  %known = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
  br i1 %known, label %fixed_amount, label %dynamic_amount

fixed_amount:
  ; One wrapped extract per output lane.  Verbose, but this turns into
  ; tight code once %1 has been folded to a constant.  The mask assumes
  ; the vector width is a power of two.
forloop(i, 0, eval(WIDTH-1), `
  %from_`'i = add i32 %1, i
  %from_wrapped_`'i = and i32 %from_`'i, eval(WIDTH-1)
  %elt_`'i = extractelement <WIDTH x $1> %0, i32 %from_wrapped_`'i')

  %acc_0 = insertelement <WIDTH x $1> undef, $1 %elt_0, i32 0
forloop(i, 1, eval(WIDTH-1), `  %acc_`'i = insertelement <WIDTH x $1> %acc_`'eval(i-1), $1 %elt_`'i, i32 i
')
  ret <WIDTH x $1> %acc_`'eval(WIDTH-1)

dynamic_amount:
  ; Lay two copies of the input back to back in memory so that a window
  ; starting anywhere inside the first copy sees the wrapped-around lanes.
  %pair = alloca <WIDTH x $1>, i32 2
  %first = getelementptr <WIDTH x $1> * %pair, i32 0
  %second = getelementptr <WIDTH x $1> * %pair, i32 1
  store <WIDTH x $1> %0, <WIDTH x $1> * %first
  store <WIDTH x $1> %0, <WIDTH x $1> * %second

  ; Wrap the amount into the range of valid lanes, then load one whole
  ; vector beginning at that element of the doubled-up array.
  %start = and i32 %1, eval(WIDTH-1)
  %elts = bitcast <WIDTH x $1> * %pair to [eval(2*WIDTH) x $1] *
  %start_ptr = getelementptr [eval(2*WIDTH) x $1] * %elts, i32 0, i32 %start
  %window = bitcast $1 * %start_ptr to <WIDTH x $1> *
  %rotated = load <WIDTH x $1> * %window, align $2
  ret <WIDTH x $1> %rotated
}
|
||||
|
||||
define <WIDTH x $1> @__shift_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
  ; Build a scratch array of three vectors laid out as zeros, input, zeros.
  ; Loading one vector that starts one vector width plus %1 elements into
  ; the array slides the input by %1 lanes, with zeros filling the gap.
  %ptr = alloca <WIDTH x $1>, i32 3
  %ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr0
  %ptr1 = getelementptr <WIDTH x $1> * %ptr, i32 1
  store <WIDTH x $1> %0, <WIDTH x $1> * %ptr1
  %ptr2 = getelementptr <WIDTH x $1> * %ptr, i32 2
  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr2

  ; Clamp the amount to plus or minus one full vector before forming the
  ; start position.  Any amount farther out yields all zeros anyway, and an
  ; unclamped value would make the load below run outside the scratch array.
  %too_low = icmp slt i32 %1, eval(0-WIDTH)
  %amt_lo = select i1 %too_low, i32 eval(0-WIDTH), i32 %1
  %too_high = icmp sgt i32 %amt_lo, WIDTH
  %amt = select i1 %too_high, i32 WIDTH, i32 %amt_lo

  %offset = add i32 %amt, WIDTH
  %ptr_as_elt_array = bitcast <WIDTH x $1> * %ptr to [eval(3*WIDTH) x $1] *
  %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
  %load_ptr_vec = bitcast $1 * %load_ptr to <WIDTH x $1> *
  ; The window is only element-aligned, hence the explicit alignment.
  %result = load <WIDTH x $1> * %load_ptr_vec, align $2
  ret <WIDTH x $1> %result
}
|
||||
|
||||
|
||||
define <WIDTH x $1> @__shuffle2_$1(<WIDTH x $1>, <WIDTH x $1>, <WIDTH x i32>) nounwind readnone alwaysinline {
  ; Concatenate both inputs into a single double-length vector.
  %both = shufflevector <WIDTH x $1> %0, <WIDTH x $1> %1, <eval(2*WIDTH) x i32> <
      forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1)
  >
  ; Pull every lane of the permutation in %2 out as a scalar.
forloop(i, 0, eval(WIDTH-1), `
  %sel_`'i = extractelement <WIDTH x i32> %2, i32 i')

  %known = call i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32> %2)
  br i1 %known, label %fixed_perm, label %dynamic_perm

fixed_perm:
  ; Permutation known at compile time: extract each requested element and
  ; insert it into the result.  LLVM turns this into good code in the end.
forloop(i, 0, eval(WIDTH-1), `
  %elt_`'i = extractelement <eval(2*WIDTH) x $1> %both, i32 %sel_`'i')

  %acc_0 = insertelement <WIDTH x $1> undef, $1 %elt_0, i32 0
forloop(i, 1, eval(WIDTH-1), `  %acc_`'i = insertelement <WIDTH x $1> %acc_`'eval(i-1), $1 %elt_`'i, i32 i
')
  ret <WIDTH x $1> %acc_`'eval(WIDTH-1)

dynamic_perm:
  ; Permutation only known at run time: spill the concatenated vector to
  ; the stack and gather from it one element at a time.
  %slab = alloca <eval(2*WIDTH) x $1>
  store <eval(2*WIDTH) x $1> %both, <eval(2*WIDTH) x $1> * %slab
  %elts = bitcast <eval(2*WIDTH) x $1> * %slab to $1 *

  %addr_0 = getelementptr $1 * %elts, i32 %sel_0
  %got_0 = load $1 * %addr_0
  %out_0 = insertelement <WIDTH x $1> undef, $1 %got_0, i32 0

forloop(i, 1, eval(WIDTH-1), `
  %addr_`'i = getelementptr $1 * %elts, i32 %sel_`'i
  %got_`'i = load $1 * %addr_`'i
  %out_`'i = insertelement <WIDTH x $1> %out_`'eval(i-1), $1 %got_`'i, i32 i
')

  ret <WIDTH x $1> %out_`'eval(WIDTH-1)
}
|
||||
')
|
||||
|
||||
define(`define_shuffles',`
|
||||
|
||||
5
opt.cpp
5
opt.cpp
@@ -586,7 +586,10 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
optPM.add(llvm::createGlobalOptimizerPass());
|
||||
optPM.add(llvm::createReassociatePass());
|
||||
optPM.add(llvm::createIPConstantPropagationPass());
|
||||
optPM.add(CreateReplaceStdlibShiftPass(),229);
|
||||
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
optPM.add(CreateReplaceStdlibShiftPass(),229);
|
||||
|
||||
optPM.add(llvm::createDeadArgEliminationPass(),230);
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
|
||||
Reference in New Issue
Block a user