Represent MOVMSK'ed masks with int64s rather than int32s.
This allows us to scale up to 64-wide execution.
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -70,7 +70,7 @@ typedef int Bool;
|
||||
putchar('['); \
|
||||
for (int i = 0; i < width; ++i) { \
|
||||
/* only print the value if the current lane is executing */ \
|
||||
if (mask & (1<<i)) \
|
||||
if (mask & (1ull<<i)) \
|
||||
printf(fmt, ((type *)ptr)[i]); \
|
||||
else \
|
||||
printf("((" fmt "))", ((type *)ptr)[i]); \
|
||||
@@ -89,7 +89,7 @@ typedef int Bool;
|
||||
@param mask Current lane mask when the print statement is called
|
||||
@param args Array of pointers to the values to be printed
|
||||
*/
|
||||
void __do_print(const char *format, const char *types, int width, int mask,
|
||||
void __do_print(const char *format, const char *types, int width, uint64_t mask,
|
||||
void **args) {
|
||||
if (mask == 0)
|
||||
return;
|
||||
@@ -113,7 +113,7 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
case 'B': {
|
||||
putchar('[');
|
||||
for (int i = 0; i < width; ++i) {
|
||||
if (mask & (1<<i))
|
||||
if (mask & (1ull << i))
|
||||
printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
|
||||
else
|
||||
printf("_________");
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -175,7 +175,7 @@ define <16 x float> @__min_varying_float(<16 x float>,
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
@@ -186,7 +186,8 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -175,10 +175,11 @@ define <8 x float> @__min_varying_float(<8 x float>,
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
@@ -186,14 +186,14 @@ define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
|
||||
%item = extractelement <1 x i32> %0, i32 0
|
||||
%v = lshr i32 %item, 31
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -201,7 +201,7 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reductions
|
||||
|
||||
declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone
|
||||
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
|
||||
|
||||
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
|
||||
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -295,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
@@ -309,7 +309,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define <4 x float> @__vec4_add_float(<4 x float> %v0,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -239,10 +239,11 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -237,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
@@ -251,7 +251,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -271,10 +271,11 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
@@ -38,6 +38,18 @@ declare i1 @__is_compile_time_constant_uniform_int32(i32)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; m4 integer arithmetic is 32-bit signed, so the 32 and 64-wide constants are written out by hand...
|
||||
define(`ALL_ON_MASK',
|
||||
`ifelse(WIDTH, `64', `-1',
|
||||
WIDTH, `32', `4294967295',
|
||||
`eval((1<<WIDTH)-1)')')
|
||||
|
||||
define(`MASK_HIGH_BIT_ON',
|
||||
`ifelse(WIDTH, `64', `-9223372036854775808',
|
||||
WIDTH, `32', `2147483648',
|
||||
`eval(1<<(WIDTH-1))')')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Helper macro for calling various SSE instructions for scalar values
|
||||
;; but where the instruction takes a vector parameter.
|
||||
@@ -1529,7 +1541,7 @@ declare i32 @__fast_masked_vload()
|
||||
declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
|
||||
declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
|
||||
declare void @ISPCSync(i8*) nounwind
|
||||
declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind
|
||||
declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind
|
||||
|
||||
declare i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %mask)
|
||||
declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
|
||||
@@ -2096,12 +2108,12 @@ ok:
|
||||
|
||||
|
||||
define void @__do_assert_varying(i8 *%str, <WIDTH x MASK> %test,
|
||||
<WIDTH x MASK> %mask) {
|
||||
<WIDTH x MASK> %mask) {
|
||||
%nottest = xor <WIDTH x MASK> %test,
|
||||
< forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 >
|
||||
%nottest_and_mask = and <WIDTH x MASK> %nottest, %mask
|
||||
%mm = call i32 @__movmsk(<WIDTH x MASK> %nottest_and_mask)
|
||||
%all_ok = icmp eq i32 %mm, 0
|
||||
%mm = call i64 @__movmsk(<WIDTH x MASK> %nottest_and_mask)
|
||||
%all_ok = icmp eq i64 %mm, 0
|
||||
br i1 %all_ok, label %ok, label %fail
|
||||
|
||||
fail:
|
||||
@@ -2505,12 +2517,16 @@ define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwa
|
||||
define(`masked_load', `
|
||||
define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mm = call i32 @__movmsk(<$1 x MASK> %mask)
|
||||
%mm = call i64 @__movmsk(<$1 x MASK> %mask)
|
||||
|
||||
; if the first lane and the last lane are on, then it is safe to do a vector load
|
||||
; of the whole thing--what the lanes in the middle want turns out to not matter...
|
||||
%mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
|
||||
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
|
||||
%mm_and_low = and i64 %mm, 1
|
||||
%mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON
|
||||
%mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1)
|
||||
%mm_and_low_i1 = trunc i64 %mm_and_low to i1
|
||||
%mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1
|
||||
%can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1
|
||||
|
||||
%fast32 = call i32 @__fast_masked_vload()
|
||||
%fast_i1 = trunc i32 %fast32 to i1
|
||||
@@ -2529,9 +2545,10 @@ load:
|
||||
loop:
|
||||
; loop over the lanes and see if each one is on...
|
||||
%lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ]
|
||||
%lanemask = shl i32 1, %lane
|
||||
%mask_and = and i32 %mm, %lanemask
|
||||
%do_lane = icmp ne i32 %mask_and, 0
|
||||
%lane64 = zext i32 %lane to i64
|
||||
%lanemask = shl i64 1, %lane64
|
||||
%mask_and = and i64 %mm, %lanemask
|
||||
%do_lane = icmp ne i64 %mask_and, 0
|
||||
br i1 %do_lane, label %load_lane, label %lane_done
|
||||
|
||||
load_lane:
|
||||
@@ -2743,12 +2760,12 @@ define(`packed_load_and_store', `
|
||||
define i32 @__packed_load_active(i32 * %startptr, <WIDTH x i32> * %val_ptr,
|
||||
<WIDTH x i32> %full_mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mask = call i32 @__movmsk(<WIDTH x i32> %full_mask)
|
||||
%mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
|
||||
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
|
||||
br i1 %mask_known, label %known_mask, label %unknown_mask
|
||||
|
||||
known_mask:
|
||||
%allon = icmp eq i32 %mask, eval((1 << WIDTH) -1)
|
||||
%allon = icmp eq i64 %mask, ALL_ON_MASK
|
||||
br i1 %allon, label %all_on, label %unknown_mask
|
||||
|
||||
all_on:
|
||||
@@ -2764,12 +2781,12 @@ unknown_mask:
|
||||
|
||||
loop:
|
||||
%lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
|
||||
%lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
|
||||
%lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
|
||||
%offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
|
||||
|
||||
; is the current lane on?
|
||||
%and = and i32 %mask, %lanemask
|
||||
%do_load = icmp eq i32 %and, %lanemask
|
||||
%and = and i64 %mask, %lanemask
|
||||
%do_load = icmp eq i64 %and, %lanemask
|
||||
br i1 %do_load, label %load, label %loopend
|
||||
|
||||
load:
|
||||
@@ -2784,7 +2801,7 @@ load:
|
||||
loopend:
|
||||
%nextoffset = phi i32 [ %offset1, %load ], [ %offset, %loop ]
|
||||
%nextlane = add i32 %lane, 1
|
||||
%nextlanemask = mul i32 %lanemask, 2
|
||||
%nextlanemask = mul i64 %lanemask, 2
|
||||
|
||||
; are we done yet?
|
||||
%test = icmp ne i32 %nextlane, WIDTH
|
||||
@@ -2795,14 +2812,14 @@ done:
|
||||
}
|
||||
|
||||
define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
|
||||
<WIDTH x i32> %full_mask) nounwind alwaysinline {
|
||||
<WIDTH x i32> %full_mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mask = call i32 @__movmsk(<WIDTH x i32> %full_mask)
|
||||
%mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
|
||||
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
|
||||
br i1 %mask_known, label %known_mask, label %unknown_mask
|
||||
|
||||
known_mask:
|
||||
%allon = icmp eq i32 %mask, eval((1 << WIDTH) -1)
|
||||
%allon = icmp eq i64 %mask, ALL_ON_MASK
|
||||
br i1 %allon, label %all_on, label %unknown_mask
|
||||
|
||||
all_on:
|
||||
@@ -2815,12 +2832,12 @@ unknown_mask:
|
||||
|
||||
loop:
|
||||
%lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
|
||||
%lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
|
||||
%lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
|
||||
%offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
|
||||
|
||||
; is the current lane on?
|
||||
%and = and i32 %mask, %lanemask
|
||||
%do_store = icmp eq i32 %and, %lanemask
|
||||
%and = and i64 %mask, %lanemask
|
||||
%do_store = icmp eq i64 %and, %lanemask
|
||||
br i1 %do_store, label %store, label %loopend
|
||||
|
||||
store:
|
||||
@@ -2833,7 +2850,7 @@ store:
|
||||
loopend:
|
||||
%nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ]
|
||||
%nextlane = add i32 %lane, 1
|
||||
%nextlanemask = mul i32 %lanemask, 2
|
||||
%nextlanemask = mul i64 %lanemask, 2
|
||||
|
||||
; are we done yet?
|
||||
%test = icmp ne i32 %nextlane, WIDTH
|
||||
@@ -2857,14 +2874,15 @@ define(`reduce_equal_aux', `
|
||||
define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue,
|
||||
<$1 x MASK> %mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mm = call i32 @__movmsk(<$1 x MASK> %mask)
|
||||
%allon = icmp eq i32 %mm, eval((1<<$1)-1)
|
||||
%mm = call i64 @__movmsk(<$1 x MASK> %mask)
|
||||
%allon = icmp eq i64 %mm, ALL_ON_MASK
|
||||
br i1 %allon, label %check_neighbors, label %domixed
|
||||
|
||||
domixed:
|
||||
; First, figure out which lane is the first active one
|
||||
%first = call i32 @llvm.cttz.i32(i32 %mm)
|
||||
%baseval = extractelement <$1 x $2> %v, i32 %first
|
||||
%first = call i64 @llvm.cttz.i64(i64 %mm)
|
||||
%first32 = trunc i64 %first to i32
|
||||
%baseval = extractelement <$1 x $2> %v, i32 %first32
|
||||
%basev1 = bitcast $2 %baseval to <1 x $2>
|
||||
; get a vector that is that value smeared across all elements
|
||||
%basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef,
|
||||
@@ -2895,9 +2913,9 @@ check_neighbors:
|
||||
%eq = $5 eq <$1 x $2> %vec, %vr
|
||||
ifelse(MASK,i32, `
|
||||
%eq32 = sext <$1 x i1> %eq to <$1 x i32>
|
||||
%eqmm = call i32 @__movmsk(<$1 x i32> %eq32)', `
|
||||
%eqmm = call i32 @__movmsk(<$1 x MASK> %eq)')
|
||||
%alleq = icmp eq i32 %eqmm, eval((1<<$1)-1)
|
||||
%eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', `
|
||||
%eqmm = call i64 @__movmsk(<$1 x MASK> %eq)')
|
||||
%alleq = icmp eq i64 %eqmm, ALL_ON_MASK
|
||||
br i1 %alleq, label %all_equal, label %not_all_equal
|
||||
', `
|
||||
; But for 64-bit elements, it turns out to be more efficient to just
|
||||
@@ -3010,14 +3028,14 @@ define(`per_lane', `
|
||||
br label %pl_entry
|
||||
|
||||
pl_entry:
|
||||
%pl_mask = call i32 @__movmsk($2)
|
||||
%pl_mask = call i64 @__movmsk($2)
|
||||
%pl_mask_known = call i1 @__is_compile_time_constant_mask($2)
|
||||
br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask
|
||||
|
||||
pl_known_mask:
|
||||
;; the mask is known at compile time; see if it is something we can
|
||||
;; handle more efficiently
|
||||
%pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
|
||||
%pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK
|
||||
br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask
|
||||
|
||||
pl_all_on:
|
||||
@@ -3039,11 +3057,11 @@ pl_unknown_mask:
|
||||
pl_loop:
|
||||
;; Loop over each lane and see if we want to do the work for this lane
|
||||
%pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ]
|
||||
%pl_lanemask = phi i32 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]
|
||||
%pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]
|
||||
|
||||
; is the current lane on? if so, goto do work, otherwise to end of loop
|
||||
%pl_and = and i32 %pl_mask, %pl_lanemask
|
||||
%pl_doit = icmp eq i32 %pl_and, %pl_lanemask
|
||||
%pl_and = and i64 %pl_mask, %pl_lanemask
|
||||
%pl_doit = icmp eq i64 %pl_and, %pl_lanemask
|
||||
br i1 %pl_doit, label %pl_dolane, label %pl_loopend
|
||||
|
||||
pl_dolane:
|
||||
@@ -3054,7 +3072,7 @@ pl_dolane:
|
||||
|
||||
pl_loopend:
|
||||
%pl_nextlane = add i32 %pl_lane, 1
|
||||
%pl_nextlanemask = mul i32 %pl_lanemask, 2
|
||||
%pl_nextlanemask = mul i64 %pl_lanemask, 2
|
||||
|
||||
; are we done yet?
|
||||
%pl_test = icmp ne i32 %pl_nextlane, $1
|
||||
|
||||
Reference in New Issue
Block a user