Make all "if" statements "coherent" ifs. Workaround for issue #74.

Using blend to do masked stores is unsafe if all of the lanes are off:
it may read from or write to invalid memory.  For now, this workaround
transforms all 'if' statements into coherent 'if's, ensuring that an
instruction only runs if at least one program instance wants to be running
it.
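
For illustration only (not code from this commit), here is a minimal C++ sketch, assuming SSE4.1 intrinsics, of why a blend-based masked store is unsafe when every lane is off: the load and the store are unconditional, so the lane mask never prevents the memory access.

// Illustrative sketch only: a blend-based "masked" store with SSE4.1
// intrinsics.  Even when 'mask' is all zeros, the load and store below
// still touch *ptr, which may be an invalid address if no program
// instance is active.
#include <smmintrin.h>

static void masked_store_blend(float *ptr, __m128 val, __m128 mask) {
    __m128 old     = _mm_loadu_ps(ptr);              // unconditional read
    __m128 blended = _mm_blendv_ps(old, val, mask);  // per-lane select
    _mm_storeu_ps(ptr, blended);                     // unconditional write
}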

One nice thing about this change is that a number of implementations of
various builtins can be simplified, since they no longer need to confirm
that at least one program instance is running.
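
As a rough sketch (a simplified scalar emulation, not the generated code), the coherent 'if' adds an "any lane active?" guard around the masked body, which is why the builtins it calls no longer need to make that check themselves:

// Rough scalar emulation of a 4-wide coherent "if" (not the actual
// generated code).  The guard checks whether any program instance is
// active before the masked body runs, so nothing inside the body ever
// executes with an all-off mask.
#include <cstdio>

int main() {
    bool mask[4] = {false, false, false, false};   // all lanes off
    int  data[4] = {1, 2, 3, 4};

    bool anyOn = mask[0] || mask[1] || mask[2] || mask[3];
    if (anyOn) {                        // the "coherent" guard
        for (int i = 0; i < 4; ++i)     // per-lane masked body
            if (mask[i])
                data[i] *= 2;
    }
    std::printf("%d %d %d %d\n", data[0], data[1], data[2], data[3]);
    return 0;
}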

It might be nice to re-enable regular if statements in a future checkin,
but we'd want to make sure they don't have any masked loads or blended
masked stores in their statement lists.  There isn't a performance
impact for any of the examples with this change, so it's unclear if
this is important.
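
If regular 'if' statements were re-enabled along those lines, the check could look something like the sketch below; the statement types and fields here are made up for illustration and are not ispc's actual AST classes.

// Hypothetical sketch of the check described above: walk a statement
// list and report whether it contains a masked load or a blended masked
// store.  The types below are illustrative only, not ispc's AST.
#include <vector>

enum class Op { Arithmetic, MaskedLoad, BlendedMaskedStore, Other };

struct StmtNode {
    Op op;
    std::vector<StmtNode> children;    // nested statements
};

static bool hasMaskedMemoryOps(const StmtNode &s) {
    if (s.op == Op::MaskedLoad || s.op == Op::BlendedMaskedStore)
        return true;
    for (const StmtNode &child : s.children)
        if (hasMaskedMemoryOps(child))
            return true;
    return false;
}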

Note that this only impacts 'if' statements with a varying condition.
Matt Pharr
2011-09-12 16:07:08 -07:00
parent 9d4ff1bc06
commit 0c20483853
2 changed files with 13 additions and 63 deletions


@@ -1346,12 +1346,6 @@ i64minmax($1,max,uint64,ugt)
define(`load_and_broadcast', `
define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
; must not load if the mask is all off; the address may be invalid
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip
load:
%ptr = bitcast i8 * %0 to $2 *
%val = load $2 * %ptr
@@ -1359,9 +1353,6 @@ load:
forloop(i, 1, eval($1-1), `
%ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
ret <$1 x $2> %ret`'eval($1-1)
skip:
ret <$1 x $2> undef
}
')
@@ -1383,11 +1374,9 @@ entry:
%mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
%mm_not_zero = icmp ne i32 %mm, 0
%fast32 = call i32 @__fast_masked_vload()
%fast_i1 = trunc i32 %fast32 to i1
%vload_fast = and i1 %mm_not_zero, %fast_i1
%can_vload_maybe_fast = or i1 %vload_fast, %can_vload
%can_vload_maybe_fast = or i1 %fast_i1, %can_vload
; if we are not able to do a single vload, we will accumulate lanes in this memory..
%retptr = alloca <$1 x $2>
@@ -1594,7 +1583,7 @@ entry:
known_mask:
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
br i1 %allon, label %all_on, label %not_all_on
br i1 %allon, label %all_on, label %unknown_mask
all_on:
;; everyone wants to load, so just load an entire vector width in a single
@@ -1604,14 +1593,6 @@ all_on:
store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
ret i32 $1
not_all_on:
%alloff = icmp eq i32 %mask, 0
br i1 %alloff, label %all_off, label %unknown_mask
all_off:
;; no one wants to load
ret i32 0
unknown_mask:
br label %loop
@@ -1658,20 +1639,13 @@ entry:
known_mask:
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
br i1 %allon, label %all_on, label %not_all_on
br i1 %allon, label %all_on, label %unknown_mask
all_on:
%vecptr = bitcast i32 *%startptr to <$1 x i32> *
store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
ret i32 $1
not_all_on:
%alloff = icmp eq i32 %mask, 0
br i1 %alloff, label %all_off, label %unknown_mask
all_off:
ret i32 0
unknown_mask:
br label %loop
@@ -1721,14 +1695,6 @@ entry:
br i1 %allon, label %check_neighbors, label %domixed
domixed:
; the mask is mixed on/off. First see if the lanes are all off
%alloff = icmp eq i32 %mm, 0
br i1 %alloff, label %doalloff, label %actuallymixed
doalloff:
ret i1 false ;; this seems safest
actuallymixed:
; First, figure out which lane is the first active one
%first = call i32 @llvm.cttz.i32(i32 %mm)
%baseval = extractelement <$1 x $2> %v, i32 %first
@@ -1751,7 +1717,7 @@ actuallymixed:
br label %check_neighbors
check_neighbors:
%vec = phi <$1 x $2> [ %blendvec, %actuallymixed ], [ %v, %entry ]
%vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ]
ifelse($6, `32', `
; For 32-bit elements, we rotate once and compare with the vector, which ends
; up comparing each element to its neighbor on the right. Then see if
@@ -1883,7 +1849,7 @@ pl_known_mask:
;; the mask is known at compile time; see if it is something we can
;; handle more efficiently
%pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask
pl_all_on:
;; the mask is all on--just expand the code for each lane sequentially
@@ -1891,19 +1857,14 @@ pl_all_on:
`patsubst(`$3', `ID\|LANE', i)')
br label %pl_done
pl_not_all_on:
;; not all on--see if it is all off or mixed
;; for the mixed case, we just run the general case, though we could
pl_unknown_mask:
;; we just run the general case, though we could
;; try to be smart and just emit the code based on what it actually is,
;; for example by emitting the code straight-line without a loop and doing
;; the lane tests explicitly, leaving later optimization passes to eliminate
;; the stuff that is definitely not needed. Not clear if we will frequently
;; encounter a mask that is known at compile-time but is not either all on or
;; all off...
%pl_alloff = icmp eq i32 %pl_mask, 0
br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
pl_unknown_mask:
br label %pl_loop
pl_loop:
@@ -1959,20 +1920,6 @@ define internal <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x
define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
entry:
%mask = call i32 @__movmsk(<$1 x i32> %vecmask)
%maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
br i1 %maskKnown, label %known_mask, label %unknown_mask
known_mask:
%alloff = icmp eq i32 %mask, 0
br i1 %alloff, label %gather_all_off, label %unknown_mask
gather_all_off:
ret <$1 x $2> undef
unknown_mask:
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
; legal to read from (and we do indeed require that, given the benefits!)


@@ -402,9 +402,12 @@ DeclStmt::Print(int indent) const {
///////////////////////////////////////////////////////////////////////////
// IfStmt
IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool doUnif, SourcePos p)
IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool checkCoherence, SourcePos p)
: Stmt(p), test(t), trueStmts(ts), falseStmts(fs),
doCoherentCheck(doUnif && !g->opt.disableCoherentControlFlow) {
doCoherentCheck(checkCoherence &&
(test->GetType() != NULL) &&
test->GetType()->IsVaryingType() &&
!g->opt.disableCoherentControlFlow) {
}
@@ -439,7 +442,7 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
if (isUniform) {
ctx->StartUniformIf(ctx->GetMask());
if (doCoherentCheck)
Warning(test->pos, "Uniform condition supplied to cif statement.");
Warning(test->pos, "Uniform condition supplied to \"cif\" statement.");
// 'If' statements with uniform conditions are relatively
// straightforward. We evaluate the condition and then jump to