packed_load and packed_store2 added

This commit is contained in:
Evghenii
2014-01-23 14:34:00 +01:00
parent e734bbc0cc
commit 0091973bca

View File

@@ -3516,58 +3516,24 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
define(`packed_load_and_store', `
define i32 @__packed_load_active(i32 * %startptr, <WIDTH x i32> * %val_ptr,
<WIDTH x MASK> %full_mask) nounwind alwaysinline {
define i32 @__packed_load_active(i32 * %startptr, <1 x i32> * %val_ptr,
<1 x i1> %full_mask) nounwind alwaysinline {
entry:
%mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask
%active = extractelement <1 x i1> %full_mask, i32 0
%call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active)
%res.sroa.0.0.extract.trunc = trunc i64 %call to i32
br i1 %active, label %if.then, label %if.end
known_mask:
%allon = icmp eq i64 %mask, ALL_ON_MASK
br i1 %allon, label %all_on, label %unknown_mask
if.then: ; preds = %entry
%idxprom = ashr i64 %call, 32
%arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom
%val = load i32* %arrayidx, align 4
%valvec = insertelement <1 x i32> undef, i32 %val, i32 0
store <1 x i32> %valvec, <1 x i32>* %val_ptr, align 4
br label %if.end
all_on:
;; everyone wants to load, so just load an entire vector width in a single
;; vector load
%vecptr = bitcast i32 *%startptr to <WIDTH x i32> *
%vec_load = load <WIDTH x i32> *%vecptr, align 4
store <WIDTH x i32> %vec_load, <WIDTH x i32> * %val_ptr, align 4
ret i32 WIDTH
unknown_mask:
br label %loop
loop:
%lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
%lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
%offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
; is the current lane on?
%and = and i64 %mask, %lanemask
%do_load = icmp eq i64 %and, %lanemask
br i1 %do_load, label %load, label %loopend
load:
%loadptr = getelementptr i32 *%startptr, i32 %offset
%loadval = load i32 *%loadptr
%val_ptr_i32 = bitcast <WIDTH x i32> * %val_ptr to i32 *
%storeptr = getelementptr i32 *%val_ptr_i32, i32 %lane
store i32 %loadval, i32 *%storeptr
%offset1 = add i32 %offset, 1
br label %loopend
loopend:
%nextoffset = phi i32 [ %offset1, %load ], [ %offset, %loop ]
%nextlane = add i32 %lane, 1
%nextlanemask = mul i64 %lanemask, 2
; are we done yet?
%test = icmp ne i32 %nextlane, WIDTH
br i1 %test, label %loop, label %done
done:
ret i32 %nextoffset
if.end: ; preds = %if.then, %entry
ret i32 %res.sroa.0.0.extract.trunc
}
define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
@@ -3589,10 +3555,16 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
ret i32 %res.sroa.0.0.extract.trunc
}
define i32 @__packed_store_active2(i32 * %startptr, <1 x i32> %vals,
<1 x i1> %full_mask) nounwind alwaysinline
{
%ret = call i32 @__packed_store_active(i32* %startptr,
<1 x i32> %vals, <1 x i1> %full_mask);
ret i32 %ret
}
')
declare i32 @__packed_store_active2(i32 * %startptr, <WIDTH x i32> %vals,
<WIDTH x i1> %full_mask) nounwind alwaysinline ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reduce_equal