added reduce_min/max_float, packed_store_active for CUDA, and now kerenls1.ispc just work :)
This commit is contained in:
@@ -3768,52 +3768,23 @@ done:
|
||||
}
|
||||
|
||||
define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
|
||||
<WIDTH x MASK> %full_mask) nounwind alwaysinline {
|
||||
<WIDTH x MASK> %full_mask) nounwind alwaysinline
|
||||
{
|
||||
entry:
|
||||
%mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
|
||||
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
|
||||
br i1 %mask_known, label %known_mask, label %unknown_mask
|
||||
%active = extractelement <1 x i1> %full_mask, i32 0
|
||||
%call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active)
|
||||
%res.sroa.0.0.extract.trunc = trunc i64 %call to i32
|
||||
br i1 %active, label %if.then, label %if.end
|
||||
|
||||
known_mask:
|
||||
%allon = icmp eq i64 %mask, ALL_ON_MASK
|
||||
br i1 %allon, label %all_on, label %unknown_mask
|
||||
if.then: ; preds = %entry
|
||||
%idxprom = ashr i64 %call, 32
|
||||
%arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom
|
||||
%val = extractelement <1 x i32> %vals, i32 0
|
||||
store i32 %val, i32* %arrayidx, align 4
|
||||
br label %if.end
|
||||
|
||||
all_on:
|
||||
%vecptr = bitcast i32 *%startptr to <WIDTH x i32> *
|
||||
store <WIDTH x i32> %vals, <WIDTH x i32> * %vecptr, align 4
|
||||
ret i32 WIDTH
|
||||
|
||||
unknown_mask:
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
%lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
|
||||
%lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
|
||||
%offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
|
||||
|
||||
; is the current lane on?
|
||||
%and = and i64 %mask, %lanemask
|
||||
%do_store = icmp eq i64 %and, %lanemask
|
||||
br i1 %do_store, label %store, label %loopend
|
||||
|
||||
store:
|
||||
%storeval = extractelement <WIDTH x i32> %vals, i32 %lane
|
||||
%storeptr = getelementptr i32 *%startptr, i32 %offset
|
||||
store i32 %storeval, i32 *%storeptr
|
||||
%offset1 = add i32 %offset, 1
|
||||
br label %loopend
|
||||
|
||||
loopend:
|
||||
%nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ]
|
||||
%nextlane = add i32 %lane, 1
|
||||
%nextlanemask = mul i64 %lanemask, 2
|
||||
|
||||
; are we done yet?
|
||||
%test = icmp ne i32 %nextlane, WIDTH
|
||||
br i1 %test, label %loop, label %done
|
||||
|
||||
done:
|
||||
ret i32 %nextoffset
|
||||
if.end: ; preds = %if.then, %entry
|
||||
ret i32 %res.sroa.0.0.extract.trunc
|
||||
}
|
||||
')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user