Represent MOVMSK'ed masks with int64s rather than int32s.

This allows us to scale up to 64-wide execution.
Matt Pharr
2012-05-25 11:48:08 -07:00
parent 38cea6dc71
commit 90db01d038
20 changed files with 137 additions and 107 deletions
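The underlying constraint, sketched in C++ (illustrative, not from the patch): shifting a 32-bit one by a lane index of 32 or more is undefined behavior, so 1 << i lane tests cap the representable execution mask at 32 lanes. Widening all movmsk results to 64 bits lifts that cap.

    #include <cstdint>
    #include <cstdio>

    int main() {
        int lane = 40;                    // a lane only a 64-wide target has
        uint64_t bit64 = 1ull << lane;    // well-defined: bit 40 set
        // uint32_t bit32 = 1u << lane;   // undefined: shift >= type width
        printf("0x%llx\n", (unsigned long long)bit64);
        return 0;
    }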


@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -70,7 +70,7 @@ typedef int Bool;
putchar('['); \
for (int i = 0; i < width; ++i) { \
/* only print the value if the current lane is executing */ \
if (mask & (1<<i)) \
if (mask & (1ull<<i)) \
printf(fmt, ((type *)ptr)[i]); \
else \
printf("((" fmt "))", ((type *)ptr)[i]); \
@@ -89,7 +89,7 @@ typedef int Bool;
@param mask Current lane mask when the print statement is called
@param args Array of pointers to the values to be printed
*/
void __do_print(const char *format, const char *types, int width, int mask,
void __do_print(const char *format, const char *types, int width, uint64_t mask,
void **args) {
if (mask == 0)
return;
@@ -113,7 +113,7 @@ void __do_print(const char *format, const char *types, int width, int mask,
case 'B': {
putchar('[');
for (int i = 0; i < width; ++i) {
if (mask & (1<<i))
if (mask & (1ull << i))
printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
else
printf("_________");


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -175,7 +175,7 @@ define <16 x float> @__min_varying_float(<16 x float>,
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -186,7 +186,8 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
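In intrinsics terms, the widened 16-wide __movmsk corresponds to roughly the following sketch (assumes AVX via immintrin.h, compile with -mavx; not the generated code itself):

    #include <immintrin.h>
    #include <cstdint>

    // Combine two 8-lane movmsk results into bits 0..15, then widen,
    // mirroring the "zext i32 %v to i64" added in the IR above.
    static uint64_t movmsk16(__m256 lo, __m256 hi) {
        uint32_t v0 = (uint32_t)_mm256_movemask_ps(lo);   // lanes 0..7
        uint32_t v1 = (uint32_t)_mm256_movemask_ps(hi);   // lanes 8..15
        return (uint64_t)((v1 << 8) | v0);
    }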


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -175,10 +175,11 @@ define <8 x float> @__min_varying_float(<8 x float>,
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


@@ -186,14 +186,14 @@ define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
ret void
}
define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -201,7 +201,7 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -295,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -309,7 +309,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -239,10 +239,11 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -237,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>,
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -251,7 +251,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -271,10 +271,11 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone


@@ -38,6 +38,18 @@ declare i1 @__is_compile_time_constant_uniform_int32(i32)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; It is a bit of a pain to compute this in m4 for 32 and 64-wide targets...
define(`ALL_ON_MASK',
`ifelse(WIDTH, `64', `-1',
WIDTH, `32', `4294967295',
`eval((1<<WIDTH)-1)')')
define(`MASK_HIGH_BIT_ON',
`ifelse(WIDTH, `64', `-9223372036854775808',
WIDTH, `32', `2147483648',
`eval(1<<(WIDTH-1))')')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Helper macro for calling various SSE instructions for scalar values
;; but where the instruction takes a vector parameter.
@@ -1529,7 +1541,7 @@ declare i32 @__fast_masked_vload()
declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
declare void @ISPCSync(i8*) nounwind
declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind
declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind
declare i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %mask)
declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
@@ -2096,12 +2108,12 @@ ok:
define void @__do_assert_varying(i8 *%str, <WIDTH x MASK> %test,
<WIDTH x MASK> %mask) {
<WIDTH x MASK> %mask) {
%nottest = xor <WIDTH x MASK> %test,
< forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 >
%nottest_and_mask = and <WIDTH x MASK> %nottest, %mask
%mm = call i32 @__movmsk(<WIDTH x MASK> %nottest_and_mask)
%all_ok = icmp eq i32 %mm, 0
%mm = call i64 @__movmsk(<WIDTH x MASK> %nottest_and_mask)
%all_ok = icmp eq i64 %mm, 0
br i1 %all_ok, label %ok, label %fail
fail:
@@ -2505,12 +2517,16 @@ define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwa
define(`masked_load', `
define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline {
entry:
%mm = call i32 @__movmsk(<$1 x MASK> %mask)
%mm = call i64 @__movmsk(<$1 x MASK> %mask)
; if the first lane and the last lane are on, then it is safe to do a vector load
; of the whole thing--what the lanes in the middle want turns out to not matter...
%mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
%mm_and_low = and i64 %mm, 1
%mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON
%mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1)
%mm_and_low_i1 = trunc i64 %mm_and_low to i1
%mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1
%can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1
%fast32 = call i32 @__fast_masked_vload()
%fast_i1 = trunc i32 %fast32 to i1
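The rewritten safety test corresponds to something like this sketch: the old code compared against eval(1 | (1<<(WIDTH-1))), which m4 cannot produce once WIDTH is 64, so the low and high bits are now isolated separately.

    #include <cstdint>

    // A full vector load is safe when the first and last lanes are on;
    // what the lanes in the middle want doesn't matter.
    static bool canVectorLoad(uint64_t mask, int width) {
        bool lowOn  = (mask & 1ull) != 0;
        bool highOn = ((mask >> (width - 1)) & 1ull) != 0;
        return lowOn && highOn;
    }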
@@ -2529,9 +2545,10 @@ load:
loop:
; loop over the lanes and see if each one is on...
%lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ]
%lanemask = shl i32 1, %lane
%mask_and = and i32 %mm, %lanemask
%do_lane = icmp ne i32 %mask_and, 0
%lane64 = zext i32 %lane to i64
%lanemask = shl i64 1, %lane64
%mask_and = and i64 %mm, %lanemask
%do_lane = icmp ne i64 %mask_and, 0
br i1 %do_lane, label %load_lane, label %lane_done
load_lane:
@@ -2743,12 +2760,12 @@ define(`packed_load_and_store', `
define i32 @__packed_load_active(i32 * %startptr, <WIDTH x i32> * %val_ptr,
<WIDTH x i32> %full_mask) nounwind alwaysinline {
entry:
%mask = call i32 @__movmsk(<WIDTH x i32> %full_mask)
%mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask
known_mask:
%allon = icmp eq i32 %mask, eval((1 << WIDTH) -1)
%allon = icmp eq i64 %mask, ALL_ON_MASK
br i1 %allon, label %all_on, label %unknown_mask
all_on:
@@ -2764,12 +2781,12 @@ unknown_mask:
loop:
%lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
%lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
%lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
%offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
; is the current lane on?
%and = and i32 %mask, %lanemask
%do_load = icmp eq i32 %and, %lanemask
%and = and i64 %mask, %lanemask
%do_load = icmp eq i64 %and, %lanemask
br i1 %do_load, label %load, label %loopend
load:
@@ -2784,7 +2801,7 @@ load:
loopend:
%nextoffset = phi i32 [ %offset1, %load ], [ %offset, %loop ]
%nextlane = add i32 %lane, 1
%nextlanemask = mul i32 %lanemask, 2
%nextlanemask = mul i64 %lanemask, 2
; are we done yet?
%test = icmp ne i32 %nextlane, WIDTH
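As a plain loop, the pattern above looks roughly like this (illustrative sketch):

    #include <cstdint>

    // Walk lanes 0..width-1, doubling a 64-bit lanemask each iteration,
    // and consume one packed element for each lane whose bit is set.
    static int packedLoadActive(const int *src, int *dst, int width,
                                uint64_t mask) {
        int offset = 0;
        uint64_t lanemask = 1;              // must be 64-bit: width may be 64
        for (int lane = 0; lane < width; ++lane, lanemask *= 2) {
            if ((mask & lanemask) == lanemask)
                dst[lane] = src[offset++];
        }
        return offset;                      // number of active lanes loaded
    }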
@@ -2795,14 +2812,14 @@ done:
}
define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
<WIDTH x i32> %full_mask) nounwind alwaysinline {
<WIDTH x i32> %full_mask) nounwind alwaysinline {
entry:
%mask = call i32 @__movmsk(<WIDTH x i32> %full_mask)
%mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask
known_mask:
%allon = icmp eq i32 %mask, eval((1 << WIDTH) -1)
%allon = icmp eq i64 %mask, ALL_ON_MASK
br i1 %allon, label %all_on, label %unknown_mask
all_on:
@@ -2815,12 +2832,12 @@ unknown_mask:
loop:
%lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
%lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
%lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
%offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
; is the current lane on?
%and = and i32 %mask, %lanemask
%do_store = icmp eq i32 %and, %lanemask
%and = and i64 %mask, %lanemask
%do_store = icmp eq i64 %and, %lanemask
br i1 %do_store, label %store, label %loopend
store:
@@ -2833,7 +2850,7 @@ store:
loopend:
%nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ]
%nextlane = add i32 %lane, 1
%nextlanemask = mul i32 %lanemask, 2
%nextlanemask = mul i64 %lanemask, 2
; are we done yet?
%test = icmp ne i32 %nextlane, WIDTH
@@ -2857,14 +2874,15 @@ define(`reduce_equal_aux', `
define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue,
<$1 x MASK> %mask) nounwind alwaysinline {
entry:
%mm = call i32 @__movmsk(<$1 x MASK> %mask)
%allon = icmp eq i32 %mm, eval((1<<$1)-1)
%mm = call i64 @__movmsk(<$1 x MASK> %mask)
%allon = icmp eq i64 %mm, ALL_ON_MASK
br i1 %allon, label %check_neighbors, label %domixed
domixed:
; First, figure out which lane is the first active one
%first = call i32 @llvm.cttz.i32(i32 %mm)
%baseval = extractelement <$1 x $2> %v, i32 %first
%first = call i64 @llvm.cttz.i64(i64 %mm)
%first32 = trunc i64 %first to i32
%baseval = extractelement <$1 x $2> %v, i32 %first32
%basev1 = bitcast $2 %baseval to <1 x $2>
; get a vector that is that value smeared across all elements
%basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef,
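The first-active-lane step, sketched with a GCC/Clang builtin:

    #include <cstdint>

    // Index of the first active lane, as in the cttz.i64 call above; the
    // same cttz-then-truncate pattern appears later in ctx.cpp when
    // dispatching through a varying function pointer. __builtin_ctzll is
    // undefined for a zero mask, so callers only reach this with at
    // least one lane on.
    static int firstActiveLane(uint64_t mask) {
        return (int)__builtin_ctzll(mask);
    }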
@@ -2895,9 +2913,9 @@ check_neighbors:
%eq = $5 eq <$1 x $2> %vec, %vr
ifelse(MASK,i32, `
%eq32 = sext <$1 x i1> %eq to <$1 x i32>
%eqmm = call i32 @__movmsk(<$1 x i32> %eq32)', `
%eqmm = call i32 @__movmsk(<$1 x MASK> %eq)')
%alleq = icmp eq i32 %eqmm, eval((1<<$1)-1)
%eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', `
%eqmm = call i64 @__movmsk(<$1 x MASK> %eq)')
%alleq = icmp eq i64 %eqmm, ALL_ON_MASK
br i1 %alleq, label %all_equal, label %not_all_equal
', `
; But for 64-bit elements, it turns out to be more efficient to just
@@ -3010,14 +3028,14 @@ define(`per_lane', `
br label %pl_entry
pl_entry:
%pl_mask = call i32 @__movmsk($2)
%pl_mask = call i64 @__movmsk($2)
%pl_mask_known = call i1 @__is_compile_time_constant_mask($2)
br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask
pl_known_mask:
;; the mask is known at compile time; see if it is something we can
;; handle more efficiently
%pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
%pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK
br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask
pl_all_on:
@@ -3039,11 +3057,11 @@ pl_unknown_mask:
pl_loop:
;; Loop over each lane and see if we want to do the work for this lane
%pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ]
%pl_lanemask = phi i32 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]
%pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]
; is the current lane on? if so, goto do work, otherwise to end of loop
%pl_and = and i32 %pl_mask, %pl_lanemask
%pl_doit = icmp eq i32 %pl_and, %pl_lanemask
%pl_and = and i64 %pl_mask, %pl_lanemask
%pl_doit = icmp eq i64 %pl_and, %pl_lanemask
br i1 %pl_doit, label %pl_dolane, label %pl_loopend
pl_dolane:
@@ -3054,7 +3072,7 @@ pl_dolane:
pl_loopend:
%pl_nextlane = add i32 %pl_lane, 1
%pl_nextlanemask = mul i32 %pl_lanemask, 2
%pl_nextlanemask = mul i64 %pl_lanemask, 2
; are we done yet?
%pl_test = icmp ne i32 %pl_nextlane, $1

ctx.cpp

@@ -1254,16 +1254,19 @@ llvm::Value *
FunctionEmitContext::Any(llvm::Value *mask) {
llvm::Value *mmval = LaneMask(mask);
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval,
LLVMInt32(0), LLVMGetName(mask, "_any"));
LLVMInt64(0), LLVMGetName(mask, "_any"));
}
llvm::Value *
FunctionEmitContext::All(llvm::Value *mask) {
llvm::Value *mmval = LaneMask(mask);
llvm::Value *allOnMaskValue = (g->target.vectorWidth == 64) ?
LLVMInt64(~0ull) :
LLVMInt64((1ull << g->target.vectorWidth) - 1);
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
LLVMInt32((1<<g->target.vectorWidth)-1),
LLVMGetName(mask, "_all"));
allOnMaskValue, LLVMGetName(mask, "_all"));
}
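Concretely, the values these predicates now compare, for a hypothetical 8-wide mask:

    #include <cassert>
    #include <cstdint>

    int main() {
        // Lanes 0 and 7 on, as __movmsk returns it after the zext to i64.
        uint64_t mm = (1ull << 0) | (1ull << 7);
        assert(mm != 0);                    // Any()  -> true
        assert(mm != (1ull << 8) - 1);      // All()  -> false
        assert(!(mm == 0));                 // None() -> false
        return 0;
    }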
@@ -1271,14 +1274,14 @@ llvm::Value *
FunctionEmitContext::None(llvm::Value *mask) {
llvm::Value *mmval = LaneMask(mask);
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
LLVMInt32(0), LLVMGetName(mask, "_none"));
LLVMInt64(0), LLVMGetName(mask, "_none"));
}
llvm::Value *
FunctionEmitContext::LaneMask(llvm::Value *v) {
// Call the target-dependent movmsk function to turn the vector mask
// into an i32 value
// into an i64 value
std::vector<Symbol *> mm;
m->symbolTable->LookupFunction("__movmsk", &mm);
if (g->target.maskBitCount == 1)
@@ -1396,7 +1399,7 @@ FunctionEmitContext::AddInstrumentationPoint(const char *note) {
args.push_back(lGetStringAsValue(bblock, note));
// arg 3: line number
args.push_back(LLVMInt32(currentPos.first_line));
// arg 4: current mask, movmsk'ed down to an int32
// arg 4: current mask, movmsk'ed down to an int64
args.push_back(LaneMask(GetFullMask()));
llvm::Function *finst = m->module->getFunction("ISPCInstrument");
@@ -3196,10 +3199,12 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
// pointer to be called.
llvm::Value *currentMask = LoadInst(maskPtr);
llvm::Function *cttz =
m->module->getFunction("__count_trailing_zeros_i32");
m->module->getFunction("__count_trailing_zeros_i64");
AssertPos(currentPos, cttz != NULL);
llvm::Value *firstLane = CallInst(cttz, NULL, LaneMask(currentMask),
"first_lane");
llvm::Value *firstLane64 = CallInst(cttz, NULL, LaneMask(currentMask),
"first_lane64");
llvm::Value *firstLane =
TruncInst(firstLane64, LLVMTypes::Int32Type, "first_lane32");
// Get the pointer to the function we're going to call this
time through: fptr = func[firstLane]

ctx.h

@@ -276,7 +276,7 @@ public:
llvm::Value *None(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i32 value wherein the i'th bit is on if and only if the i'th lane
i64 value wherein the i'th bit is on if and only if the i'th lane
of the mask is on. */
llvm::Value *LaneMask(llvm::Value *mask);


@@ -624,7 +624,7 @@ gathers happen.)
extern "C" {
void ISPCInstrument(const char *fn, const char *note,
int line, int mask);
int line, uint64_t mask);
}
This function is passed the file name of the ``ispc`` file running, a short
@@ -637,7 +637,7 @@ as follows:
::
ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);
This call indicates that the currently executing program has just
entered the function defined at line 55 of the file ``foo.ispc``, with a
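A minimal host-side implementation matching the new signature could look like this (a sketch using GCC/Clang's __builtin_popcountll; not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // The mask parameter now carries all 64 possible lane bits, so a
    // 64-bit popcount gives the active-lane count.
    extern "C" void ISPCInstrument(const char *fn, const char *note,
                                   int line, uint64_t mask) {
        int active = __builtin_popcountll(mask);
        fprintf(stderr, "%s(%d) %s: %d lane(s) active, mask 0x%llx\n",
                fn, line, note, active, (unsigned long long)mask);
    }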


@@ -311,8 +311,8 @@ INSERT_EXTRACT(__vec1_d, double)
///////////////////////////////////////////////////////////////////////////
// mask ops
static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
return mask.v;
static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
return (uint64_t)mask.v;
}
static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {


@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -224,8 +224,8 @@ CAST_BITS_SCALAR(double, int64_t)
///////////////////////////////////////////////////////////////////////////
// mask ops
static FORCEINLINE uint32_t __movmsk(__vec4_i1 mask) {
return _mm_movemask_ps(mask.v);
static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) {
return (uint64_t)_mm_movemask_ps(mask.v);
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) {

ispc.h

@@ -61,7 +61,7 @@
/** @def ISPC_MAX_NVEC maximum vector size of any of the compilation
targets.
*/
#define ISPC_MAX_NVEC 32
#define ISPC_MAX_NVEC 64
// Forward declarations of a number of widely-used LLVM types
namespace llvm {


@@ -1228,7 +1228,7 @@ Module::writeHeader(const char *fn) {
if (g->emitInstrumentation) {
fprintf(f, "#define ISPC_INSTRUMENTATION 1\n");
fprintf(f, "extern \"C\" {\n");
fprintf(f, " void ISPCInstrument(const char *fn, const char *note, int line, int mask);\n");
fprintf(f, " void ISPCInstrument(const char *fn, const char *note, int line, uint64_t mask);\n");
fprintf(f, "}\n");
}

opt.cpp

@@ -269,12 +269,12 @@ lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
execution mask, convert it to a bitvector where the 0th bit corresponds
to the first vector value and so forth.
*/
static uint32_t
static uint64_t
lConstElementsToMask(const llvm::SmallVector<llvm::Constant *,
ISPC_MAX_NVEC> &elements) {
Assert(elements.size() <= 32);
Assert(elements.size() <= 64);
uint32_t mask = 0;
uint64_t mask = 0;
for (unsigned int i = 0; i < elements.size(); ++i) {
llvm::APInt intMaskValue;
// SSE has the "interesting" approach of encoding blending
@@ -293,7 +293,7 @@ lConstElementsToMask(const llvm::SmallVector<llvm::Constant *,
// Is the high-bit set? If so, OR in the appropriate bit in
// the result mask
if (intMaskValue.countLeadingOnes() > 0)
mask |= (1 << i);
mask |= (1ull << i);
}
return mask;
}
@@ -306,7 +306,7 @@ lConstElementsToMask(const llvm::SmallVector<llvm::Constant *,
4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >, we have 0b1001 = 9.
*/
static bool
lGetMask(llvm::Value *factor, uint32_t *mask) {
lGetMask(llvm::Value *factor, uint64_t *mask) {
#ifndef LLVM_3_0
llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
if (cdv != NULL) {
@@ -364,7 +364,7 @@ enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN };
*/
static MaskStatus
lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
uint32_t bits;
uint64_t bits;
if (lGetMask(mask, &bits) == false)
return UNKNOWN;
@@ -373,7 +373,7 @@ lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
if (vecWidth == -1)
vecWidth = g->target.vectorWidth;
Assert(vecWidth <= 32);
Assert(vecWidth <= 64);
for (int i = 0; i < vecWidth; ++i) {
if ((bits & (1ull << i)) == 0)
@@ -601,12 +601,12 @@ private:
instruction for this optimization pass.
*/
struct BlendInstruction {
BlendInstruction(llvm::Function *f, uint32_t ao, int o0, int o1, int of)
BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
: function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) { }
/** Function pointer for the blend instruction */
llvm::Function *function;
/** Mask value for an "all on" mask for this instruction */
uint32_t allOnMask;
uint64_t allOnMask;
/** The operand number in the llvm CallInst corresponds to the
first operand to blend with. */
int op0;
@@ -728,7 +728,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
goto restart;
}
uint32_t mask;
uint64_t mask;
if (lGetMask(factor, &mask) == true) {
llvm::Value *value = NULL;
if (mask == 0)
@@ -748,12 +748,13 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
}
else if (matchesMaskInstruction(callInst->getCalledFunction())) {
llvm::Value *factor = callInst->getArgOperand(0);
uint32_t mask;
uint64_t mask;
if (lGetMask(factor, &mask) == true) {
// If the vector-valued mask has a known value, replace it
// with the corresponding integer mask from its elements
// high bits.
llvm::Value *value = LLVMInt32(mask);
llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ?
LLVMInt32(mask) : LLVMInt64(mask);
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
iter, value);
modifiedAny = true;
@@ -763,7 +764,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
callInst->getCalledFunction() == avxMaskedLoad64) {
llvm::Value *factor = callInst->getArgOperand(1);
uint32_t mask;
uint64_t mask;
if (lGetMask(factor, &mask) == true) {
if (mask == 0) {
// nothing being loaded, replace with undef value
@@ -802,7 +803,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
callInst->getCalledFunction() == avxMaskedStore64) {
// NOTE: mask is the 2nd parameter, not the 3rd one!!
llvm::Value *factor = callInst->getArgOperand(1);
uint32_t mask;
uint64_t mask;
if (lGetMask(factor, &mask) == true) {
if (mask == 0) {
// nothing actually being stored, just remove the inst
@@ -931,7 +932,7 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
continue;
uint32_t mask;
uint64_t mask;
if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
#if 0
fprintf(stderr, "mask %d\n", mask);
@@ -939,7 +940,7 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
fprintf(stderr, "-----------\n");
#endif
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
iter, LLVMInt32(mask));
iter, LLVMInt64(mask));
modifiedAny = true;
goto restart;
}


@@ -355,7 +355,8 @@ static inline uniform bool all(bool v) {
#else
int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask);
#endif
return __movmsk(match) == (1 << programCount) - 1;
return __movmsk(match) == ((programCount == 64) ? ~0ull :
((1ull << programCount) - 1));
}
__declspec(safe)
@@ -388,14 +389,14 @@ __declspec(safe)
static inline uniform int popcnt(bool v) {
// As with any() and all(), only count across the active lanes
#ifdef ISPC_TARGET_GENERIC
return __popcnt_int32(__movmsk(v & __mask));
return __popcnt_int64(__movmsk(v & __mask));
#else
return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask));
return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask));
#endif
}
__declspec(safe)
static inline uniform int lanemask() {
static inline uniform unsigned int64 lanemask() {
return __movmsk(__mask);
}
@@ -1615,12 +1616,12 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
TA ret[programCount]; \
TA memVal; \
uniform int lastSwap; \
uniform int mask = lanemask(); \
uniform unsigned int64 mask = lanemask(); \
/* First, have the first running program instance (if any) perform \
the swap with memory with its value of "value"; record the \
value returned. */ \
for (; i < programCount; ++i) { \
if ((mask & (1 << i)) == 0) \
if ((mask & (1ull << i)) == 0) \
continue; \
memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
lastSwap = i; \
@@ -1632,7 +1633,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
current instance had executed a hardware atomic swap right before \
the last one that did a swap. */ \
for (; i < programCount; ++i) { \
if ((mask & (1 << i)) == 0) \
if ((mask & (1ull << i)) == 0) \
continue; \
ret[lastSwap] = extract(value, i); \
lastSwap = i; \


@@ -2843,7 +2843,7 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
pos);
// Compute the per lane mask to test the mask bits against: (1 << iter)
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, 1,
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt64, 1ll,
iterSym->pos);
Expr *shiftLaneExpr = new BinaryExpr(BinaryExpr::Shl, oneExpr, symExpr,
pos);
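The statement built here lowers to roughly this loop (illustrative C++ rendering):

    #include <cstdint>

    // Run the body for each lane whose bit is set; the "one" constant
    // becomes a 64-bit value so the shift stays defined for program
    // counts up to 64.
    static void foreachActive(uint64_t mask, int programCount,
                              void (*body)(int lane)) {
        for (int iter = 0; iter < programCount; ++iter)
            if ((mask & (1ull << iter)) != 0)
                body(iter);
    }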
@@ -2863,4 +2863,3 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
// And return a for loop that wires it all together.
return new ForStmt(initStmt, testExpr, stepStmt, laneCheckIf, false, pos);
}