added shift

This commit is contained in:
Evghenii
2014-01-22 20:43:53 +01:00
parent 39962623cc
commit 7d0aa7a336
3 changed files with 67 additions and 92 deletions

View File

@@ -1062,6 +1062,69 @@ shuffle1(i64)
shuffle1(float)
shuffle1(double)
define(`shuffle2',`
;; __shuffle2_$1: two-source permute for the target where every thread holds
;; one element.  %0 and %1 are the calling thread elements of the two source
;; vectors and %2 is the requested position: values below 32 name a lane of
;; the first source, anything else a lane of the second source.
;; Both sources are fetched from lane (position and 31) through
;; @__shfl_$1_nvptx, which presumably returns the value held by that lane,
;; and the choice between them is made afterwards with the position held by
;; the requesting thread.  Choosing before the exchange would apply the
;; position held by the source lane instead.
define <1 x $1> @__shuffle2_$1(<1 x $1>, <1 x $1>, <1 x i32>) nounwind readnone alwaysinline
{
  %val1 = extractelement <1 x $1> %0, i32 0
  %val2 = extractelement <1 x $1> %1, i32 0
  %lane = extractelement <1 x i32> %2, i32 0
  ;; lane number inside the 32-wide group
  %lane_mask = and i32 %lane, 31
  ;; fetch the candidate from each source
  %ret1 = tail call $1 @__shfl_$1_nvptx($1 %val1, i32 %lane_mask)
  %ret2 = tail call $1 @__shfl_$1_nvptx($1 %val2, i32 %lane_mask)
  ;; keep the one that the requesting thread asked for
  %c = icmp slt i32 %lane, 32
  %rets = select i1 %c, $1 %ret1, $1 %ret2
  %retv = insertelement <1 x $1> undef, $1 %rets, i32 0
  ret <1 x $1> %retv
}
')
shuffle2(i8)
shuffle2(i16)
shuffle2(i32)
shuffle2(i64)
shuffle2(float)
shuffle2(double)
define(`shift',`
;; __shift_$1: every thread returns the element held by the lane that sits
;; %1 positions above its own lane (own lane = @__tid_x result and 31).
;; When that lane number falls outside 0..31 the result is zero.
define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline
{
  %thread_id = tail call i32 @__tid_x()
  %elt = extractelement <1 x $1> %0, i32 0
  %own_lane = and i32 %thread_id, 31
  %src_lane = add i32 %own_lane, %1
  ;; the exchange is issued unconditionally; an out of range result is
  ;; discarded by the select below
  %fetched = tail call $1 @__shfl_$1_nvptx($1 %elt, i32 %src_lane)
  %not_below = icmp sge i32 %src_lane, 0
  %not_above = icmp slt i32 %src_lane, 32
  %in_range = and i1 %not_below, %not_above
  %picked = select i1 %in_range, $1 %fetched, $1 zeroinitializer
  %packed = insertelement <1 x $1> undef, $1 %picked, i32 0
  ret <1 x $1> %packed
}
')
shift(i8)
shift(i16)
shift(i32)
shift(i64)
shift(float)
shift(double)
define(`rotate', `
;; __rotate_$1: every thread returns the element held by the lane numbered
;; (@__tid_x result + %1) and 31, so the lane number wraps around within
;; 0..31 instead of producing zeros.
define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline
{
  %thread_id = tail call i32 @__tid_x()
  %elt = extractelement <1 x $1> %0, i32 0
  %unwrapped = add i32 %thread_id, %1
  %src_lane = and i32 %unwrapped, 31
  %fetched = tail call $1 @__shfl_$1_nvptx($1 %elt, i32 %src_lane)
  %packed = insertelement <1 x $1> undef, $1 %fetched, i32 0
  ret <1 x $1> %packed
}
')
rotate(i8)
rotate(i16)
rotate(i32)
rotate(i64)
rotate(float)
rotate(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

View File

@@ -764,97 +764,6 @@ define <WIDTH x $1> @__broadcast_$1(<WIDTH x $1>, i32) nounwind readnone alwaysi
ret <WIDTH x $1> %broadcast
}
;; Rotated copy of the vector %0: result element k is the input element at
;; position (k + %1) masked with the vector width minus one.  The masking
;; presumes a power-of-two width - TODO confirm.
define <WIDTH x $1> @__rotate_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
; %isc is the answer of @__is_compile_time_constant_uniform_int32 for the
; amount %1 and picks one of the two code paths below
%isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
br i1 %isc, label %is_const, label %not_const
is_const:
; though verbose, this turns into tight code if %1 is a constant
; per result lane: add the lane number to %1, mask it to the vector width
; and extract the input element found at that position
forloop(i, 0, eval(WIDTH-1), `
%delta_`'i = add i32 %1, i
%delta_clamped_`'i = and i32 %delta_`'i, eval(WIDTH-1)
%v_`'i = extractelement <WIDTH x $1> %0, i32 %delta_clamped_`'i')
; assemble the extracted scalars into the result vector, lane by lane
%ret_0 = insertelement <WIDTH x $1> undef, $1 %v_0, i32 0
forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement <WIDTH x $1> %ret_`'eval(i-1), $1 %v_`'i, i32 i
')
ret <WIDTH x $1> %ret_`'eval(WIDTH-1)
not_const:
; store two instances of the vector into memory
%ptr = alloca <WIDTH x $1>, i32 2
%ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
store <WIDTH x $1> %0, <WIDTH x $1> * %ptr0
%ptr1 = getelementptr <WIDTH x $1> * %ptr, i32 1
store <WIDTH x $1> %0, <WIDTH x $1> * %ptr1
; compute offset in [0,vectorwidth-1], then index into the doubled-up vector
%offset = and i32 %1, eval(WIDTH-1)
%ptr_as_elt_array = bitcast <WIDTH x $1> * %ptr to [eval(2*WIDTH) x $1] *
%load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
; view the chosen element address as a whole vector and load from there;
; the load is given alignment $2 since the address is in general not vector
; aligned (presumably $2 is the element alignment - TODO confirm)
%load_ptr_vec = bitcast $1 * %load_ptr to <WIDTH x $1> *
%result = load <WIDTH x $1> * %load_ptr_vec, align $2
ret <WIDTH x $1> %result
}
;; Shifted copy of the vector %0: result element k is the input element at
;; position k + %1, or zero when that position lies outside the input.
;; NOTE(review): the amount %1 is used unchecked; presumably callers keep it
;; within -WIDTH..WIDTH so the load stays inside the scratch area - confirm.
define <WIDTH x $1> @__shift_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
  ; scratch area of three vectors laid out back to back: zeros, input, zeros
  %scratch_ptr = alloca <WIDTH x $1>, i32 3
  %zeros_lo_ptr = getelementptr <WIDTH x $1> * %scratch_ptr, i32 0
  %input_ptr = getelementptr <WIDTH x $1> * %scratch_ptr, i32 1
  %zeros_hi_ptr = getelementptr <WIDTH x $1> * %scratch_ptr, i32 2
  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %zeros_lo_ptr
  store <WIDTH x $1> %0, <WIDTH x $1> * %input_ptr
  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %zeros_hi_ptr
  ; the input starts WIDTH elements into the scratch area, so the window to
  ; read starts at element %1 + WIDTH
  %start_elt = add i32 %1, WIDTH
  %elt_array_ptr = bitcast <WIDTH x $1> * %scratch_ptr to [eval(3*WIDTH) x $1] *
  %start_ptr = getelementptr [eval(3*WIDTH) x $1] * %elt_array_ptr, i32 0, i32 %start_elt
  ; reinterpret the element address as a vector address and load the window
  %window_ptr = bitcast $1 * %start_ptr to <WIDTH x $1> *
  %shifted_vec = load <WIDTH x $1> * %window_ptr, align $2
  ret <WIDTH x $1> %shifted_vec
}
;; Two-source permute: lane k of the result is the element at position
;; %2[k] of the concatenation of %0 followed by %1.
;; NOTE(review): the positions in %2 are used unchecked; presumably callers
;; keep them within 0 .. 2*WIDTH-1 - confirm.
define <WIDTH x $1> @__shuffle2_$1(<WIDTH x $1>, <WIDTH x $1>, <WIDTH x i32>) nounwind readnone alwaysinline {
; concatenate the two inputs into one double-width vector
%v2 = shufflevector <WIDTH x $1> %0, <WIDTH x $1> %1, <eval(2*WIDTH) x i32> <
forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1)
>
; pull the requested positions out of %2, one scalar per lane
forloop(i, 0, eval(WIDTH-1), `
%index_`'i = extractelement <WIDTH x i32> %2, i32 i')
; %isc is the answer of @__is_compile_time_constant_varying_int32 for the
; permutation %2 and picks one of the two code paths below
%isc = call i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32> %2)
br i1 %isc, label %is_const, label %not_const
is_const:
; extract from the requested lanes and insert into the result; LLVM turns
; this into good code in the end
forloop(i, 0, eval(WIDTH-1), `
%v_`'i = extractelement <eval(2*WIDTH) x $1> %v2, i32 %index_`'i')
%ret_0 = insertelement <WIDTH x $1> undef, $1 %v_0, i32 0
forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement <WIDTH x $1> %ret_`'eval(i-1), $1 %v_`'i, i32 i
')
ret <WIDTH x $1> %ret_`'eval(WIDTH-1)
not_const:
; otherwise store the two vectors onto the stack and then use the given
; permutation vector to get indices into that array...
%ptr = alloca <eval(2*WIDTH) x $1>
store <eval(2*WIDTH) x $1> %v2, <eval(2*WIDTH) x $1> * %ptr
; address the stored vector as a flat run of elements
%baseptr = bitcast <eval(2*WIDTH) x $1> * %ptr to $1 *
; lane 0 is handled outside the loop because it starts the result from undef
%ptr_0 = getelementptr $1 * %baseptr, i32 %index_0
%val_0 = load $1 * %ptr_0
%result_0 = insertelement <WIDTH x $1> undef, $1 %val_0, i32 0
; remaining lanes: one scalar load per lane, chained into the result
forloop(i, 1, eval(WIDTH-1), `
%ptr_`'i = getelementptr $1 * %baseptr, i32 %index_`'i
%val_`'i = load $1 * %ptr_`'i
%result_`'i = insertelement <WIDTH x $1> %result_`'eval(i-1), $1 %val_`'i, i32 i
')
ret <WIDTH x $1> %result_`'eval(WIDTH-1)
}
')
define(`define_shuffles',`

View File

@@ -586,7 +586,10 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(llvm::createGlobalOptimizerPass());
optPM.add(llvm::createReassociatePass());
optPM.add(llvm::createIPConstantPropagationPass());
optPM.add(CreateReplaceStdlibShiftPass(),229);
if (g->target->getISA() != Target::NVPTX)
optPM.add(CreateReplaceStdlibShiftPass(),229);
optPM.add(llvm::createDeadArgEliminationPass(),230);
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createCFGSimplificationPass());