diff --git a/builtins.m4 b/builtins.m4
index 13295b9d..3c59a1f7 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -1346,12 +1346,6 @@ i64minmax($1,max,uint64,ugt)
 define(`load_and_broadcast', `
 define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
-  ; must not load if the mask is all off; the address may be invalid
-  %mm = call i32 @__movmsk(<$1 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
   %ptr = bitcast i8 * %0 to $2 *
   %val = load $2 * %ptr
@@ -1359,9 +1353,6 @@ load:
 forloop(i, 1, eval($1-1), `  %ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
   ret <$1 x $2> %ret`'eval($1-1)
-
-skip:
-  ret <$1 x $2> undef
 }
 ')
@@ -1383,11 +1374,9 @@ entry:
   %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
   %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
-  %mm_not_zero = icmp ne i32 %mm, 0
   %fast32 = call i32 @__fast_masked_vload()
   %fast_i1 = trunc i32 %fast32 to i1
-  %vload_fast = and i1 %mm_not_zero, %fast_i1
-  %can_vload_maybe_fast = or i1 %vload_fast, %can_vload
+  %can_vload_maybe_fast = or i1 %fast_i1, %can_vload
 
   ; if we are not able to do a singe vload, we will accumulate lanes in this memory..
   %retptr = alloca <$1 x $2>
@@ -1594,7 +1583,7 @@ entry:
 
 known_mask:
   %allon = icmp eq i32 %mask, eval((1 << $1) -1)
-  br i1 %allon, label %all_on, label %not_all_on
+  br i1 %allon, label %all_on, label %unknown_mask
 
 all_on:
   ;; everyone wants to load, so just load an entire vector width in a single
@@ -1604,14 +1593,6 @@ all_on:
   store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
   ret i32 $1
 
-not_all_on:
-  %alloff = icmp eq i32 %mask, 0
-  br i1 %alloff, label %all_off, label %unknown_mask
-
-all_off:
-  ;; no one wants to load
-  ret i32 0
-
 unknown_mask:
   br label %loop
@@ -1658,20 +1639,13 @@ entry:
 
 known_mask:
   %allon = icmp eq i32 %mask, eval((1 << $1) -1)
-  br i1 %allon, label %all_on, label %not_all_on
+  br i1 %allon, label %all_on, label %unknown_mask
 
 all_on:
   %vecptr = bitcast i32 *%startptr to <$1 x i32> *
   store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
   ret i32 $1
 
-not_all_on:
-  %alloff = icmp eq i32 %mask, 0
-  br i1 %alloff, label %all_off, label %unknown_mask
-
-all_off:
-  ret i32 0
-
 unknown_mask:
   br label %loop
@@ -1721,14 +1695,6 @@ entry:
   br i1 %allon, label %check_neighbors, label %domixed
 
 domixed:
-  ; the mask is mixed on/off. First see if the lanes are all off
-  %alloff = icmp eq i32 %mm, 0
-  br i1 %alloff, label %doalloff, label %actuallymixed
-
-doalloff:
-  ret i1 false ;; this seems safest
-
-actuallymixed:
   ; First, figure out which lane is the first active one
   %first = call i32 @llvm.cttz.i32(i32 %mm)
   %baseval = extractelement <$1 x $2> %v, i32 %first
@@ -1751,7 +1717,7 @@ actuallymixed:
   br label %check_neighbors
 
 check_neighbors:
-  %vec = phi <$1 x $2> [ %blendvec, %actuallymixed ], [ %v, %entry ]
+  %vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ]
 ifelse($6, `32', `
   ; For 32-bit elements, we rotate once and compare with the vector, which ends
   ; up comparing each element to its neighbor on the right.  Then see if
@@ -1883,7 +1849,7 @@ pl_known_mask:
   ;; the mask is known at compile time; see if it is something we can
  ;; handle more efficiently
   %pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
-  br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
+  br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask
 
 pl_all_on:
   ;; the mask is all on--just expand the code for each lane sequentially
@@ -1891,19 +1857,14 @@ pl_all_on:
 `patsubst(`$3', `ID\|LANE', i)')
   br label %pl_done
 
-pl_not_all_on:
-  ;; not all on--see if it is all off or mixed
-  ;; for the mixed case, we just run the general case, though we could
+pl_unknown_mask:
+  ;; we just run the general case, though we could
   ;; try to be smart and just emit the code based on what it actually is,
   ;; for example by emitting the code straight-line without a loop and doing
   ;; the lane tests explicitly, leaving later optimization passes to eliminate
   ;; the stuff that is definitely not needed. Not clear if we will frequently
   ;; encounter a mask that is known at compile-time but is not either all on or
   ;; all off...
-  %pl_alloff = icmp eq i32 %pl_mask, 0
-  br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
-
-pl_unknown_mask:
   br label %pl_loop
 
 pl_loop:
@@ -1959,20 +1920,6 @@ define internal <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x
 define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x i32> %vecmask) nounwind readonly alwaysinline {
-entry:
-  %mask = call i32 @__movmsk(<$1 x i32> %vecmask)
-
-  %maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
-  br i1 %maskKnown, label %known_mask, label %unknown_mask
-
-known_mask:
-  %alloff = icmp eq i32 %mask, 0
-  br i1 %alloff, label %gather_all_off, label %unknown_mask
-
-gather_all_off:
-  ret <$1 x $2> undef
-
-unknown_mask:
   ; We can be clever and avoid the per-lane stuff for gathers if we are willing
   ; to require that the 0th element of the array being gathered from is always
   ; legal to read from (and we do indeed require that, given the benefits!)
diff --git a/stmt.cpp b/stmt.cpp
index 20637553..c532b75c 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -402,9 +402,12 @@ DeclStmt::Print(int indent) const {
 ///////////////////////////////////////////////////////////////////////////
 // IfStmt
 
-IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool doUnif, SourcePos p)
+IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool checkCoherence, SourcePos p)
     : Stmt(p), test(t), trueStmts(ts), falseStmts(fs),
-      doCoherentCheck(doUnif && !g->opt.disableCoherentControlFlow) {
+      doCoherentCheck(checkCoherence &&
+                      (test->GetType() != NULL) &&
+                      test->GetType()->IsVaryingType() &&
+                      !g->opt.disableCoherentControlFlow) {
 }
 
 
@@ -439,7 +442,7 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
     if (isUniform) {
         ctx->StartUniformIf(ctx->GetMask());
         if (doCoherentCheck)
-            Warning(test->pos, "Uniform condition supplied to cif statement.");
+            Warning(test->pos, "Uniform condition supplied to \"cif\" statement.");
 
         // 'If' statements with uniform conditions are relatively
        // straightforward. We evaluate the condition and then jump to
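
For reference, here is a minimal sketch of what the trimmed-down load_and_broadcast macro expands to after this change, written out by hand for an assumed 4-wide float instantiation (so $1/$2/$3 become 4/float/float); the concrete symbol name @__load_and_broadcast_float below is illustrative, not taken from a generated target file. With the all-off-mask early-out removed, the scalar is loaded unconditionally and then broadcast into every lane:

define <4 x float> @__load_and_broadcast_float(i8 *, <4 x i32> %mask) nounwind alwaysinline {
  ; load the scalar once; there is no longer an all-off-mask branch guarding the load
  %ptr = bitcast i8 * %0 to float *
  %val = load float * %ptr
  ; broadcast the loaded value into all four lanes of the result vector
  %ret0 = insertelement <4 x float> undef, float %val, i32 0
  %ret1 = insertelement <4 x float> %ret0, float %val, i32 1
  %ret2 = insertelement <4 x float> %ret1, float %val, i32 2
  %ret3 = insertelement <4 x float> %ret2, float %val, i32 3
  ret <4 x float> %ret3
}

The other hunks follow the same pattern: the icmp-against-zero / all-off branches are dropped, leaving only the all-on fast path and the general per-lane path.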