added reduce_min/max_float, packed_store_active for CUDA, and now kerenls1.ispc just work :)

This commit is contained in:
Evghenii
2013-11-11 12:33:39 +01:00
parent 9c7a842163
commit a91c8e15e2
6 changed files with 193 additions and 106 deletions

View File

@@ -3768,52 +3768,23 @@ done:
}
define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
<WIDTH x MASK> %full_mask) nounwind alwaysinline {
<WIDTH x MASK> %full_mask) nounwind alwaysinline
{
entry:
%mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask
%active = extractelement <1 x i1> %full_mask, i32 0
%call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active)
%res.sroa.0.0.extract.trunc = trunc i64 %call to i32
br i1 %active, label %if.then, label %if.end
known_mask:
%allon = icmp eq i64 %mask, ALL_ON_MASK
br i1 %allon, label %all_on, label %unknown_mask
if.then: ; preds = %entry
%idxprom = ashr i64 %call, 32
%arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom
%val = extractelement <1 x i32> %vals, i32 0
store i32 %val, i32* %arrayidx, align 4
br label %if.end
all_on:
%vecptr = bitcast i32 *%startptr to <WIDTH x i32> *
store <WIDTH x i32> %vals, <WIDTH x i32> * %vecptr, align 4
ret i32 WIDTH
unknown_mask:
br label %loop
loop:
%lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
%lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
%offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
; is the current lane on?
%and = and i64 %mask, %lanemask
%do_store = icmp eq i64 %and, %lanemask
br i1 %do_store, label %store, label %loopend
store:
%storeval = extractelement <WIDTH x i32> %vals, i32 %lane
%storeptr = getelementptr i32 *%startptr, i32 %offset
store i32 %storeval, i32 *%storeptr
%offset1 = add i32 %offset, 1
br label %loopend
loopend:
%nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ]
%nextlane = add i32 %lane, 1
%nextlanemask = mul i64 %lanemask, 2
; are we done yet?
%test = icmp ne i32 %nextlane, WIDTH
br i1 %test, label %loop, label %done
done:
ret i32 %nextoffset
if.end: ; preds = %if.then, %entry
ret i32 %res.sroa.0.0.extract.trunc
}
')