Fix various small things that were broken with single-bit-per-lane masks.
Also small cleanups to declarations, "nocapture" attributes added, etc.
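
At the IR level, the key change is how values in inactive lanes get zeroed or blended. With a full-width mask each lane is all-zeros or all-ones, so a bitwise AND is enough; with a single-bit-per-lane mask that AND no longer works, and a per-lane select has to be used instead (this is what the stdlib.ispc changes from "x & __mask" to "__mask ? x : 0" below do). A rough sketch in the WIDTH/MASK notation of the builtins file, with illustrative value names, not the exact code the compiler emits:

    ; full-width mask: inactive lanes are 0 and active lanes are -1,
    ; so a bitwise AND zeroes out the inactive lanes
    %zeroed = and <WIDTH x i32> %x, %fullmask

    ; single-bit-per-lane mask: ANDing with 0/1 would clobber the value,
    ; so select against zero per lane instead
    %zeroed1 = select <WIDTH x i1> %mask, <WIDTH x i32> %x, <WIDTH x i32> zeroinitializer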
@@ -33,7 +33,6 @@ define(`MASK',`i1')
 include(`util.m4')

 stdlib_core()

 scans()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -96,7 +95,7 @@ declare float @__rsqrt_uniform_float(float) nounwind readnone
 declare float @__rcp_uniform_float(float) nounwind readnone
 declare float @__sqrt_uniform_float(float) nounwind readnone
 declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
-declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %v) nounwind readnone
+declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
 declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone

 declare double @__sqrt_uniform_double(double) nounwind readnone
@@ -142,7 +141,7 @@ declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
 declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
 declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone

-declare i32 @__reduce_add_uint32(<WIDTH x i32> %v) nounwind readnone
+declare i32 @__reduce_add_uint32(<WIDTH x i32>) nounwind readnone
 declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
 declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone

@@ -154,7 +153,7 @@ declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
 declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
 declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone

-declare i64 @__reduce_add_uint64(<WIDTH x i64> %v) nounwind readnone
+declare i64 @__reduce_add_uint64(<WIDTH x i64>) nounwind readnone
 declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
 declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone

@@ -189,7 +188,6 @@ declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
 declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
                                 <WIDTH x i1> %mask) nounwind

-ifelse(LLVM_VERSION,LLVM_3_1svn,`
 define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
                                     <WIDTH x i1>) nounwind {
   %v = load <WIDTH x i8> * %0
@@ -221,39 +219,28 @@ define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
   store <WIDTH x i64> %v1, <WIDTH x i64> * %0
   ret void
 }
-',`
-declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
-                                     <WIDTH x i1>) nounwind
-declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
-                                      <WIDTH x i1>) nounwind
-declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
-                                      <WIDTH x i1>) nounwind
-declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture %ptr,
-                                      <WIDTH x i64> %new,
-                                      <WIDTH x i1> %mask) nounwind
-')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

 define(`gather_scatter', `
-declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture %ptr, <WIDTH x i32> %offsets,
-                                                 i32 %offset_scale, <WIDTH x i1> %vecmask) nounwind readonly
-declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture %ptr, <WIDTH x i64> %offsets,
-                                                 i32 %offset_scale, <WIDTH x i1> %vecmask) nounwind readonly
-declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
-                                    <WIDTH x i1> %vecmask) nounwind readonly
-declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
-                                    <WIDTH x i1> %vecmask) nounwind readonly
+declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
+                                                 i32, <WIDTH x i1>) nounwind readonly
+declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
+                                                 i32, <WIDTH x i1>) nounwind readonly
+declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
+                                    <WIDTH x i1>) nounwind readonly
+declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
+                                    <WIDTH x i1>) nounwind readonly

-declare void @__scatter_base_offsets32_$1(i8* nocapture %base, <WIDTH x i32> %offsets,
-                                           i32 %offset_scale, <WIDTH x $1> %values, <WIDTH x i1> %mask) nounwind
-declare void @__scatter_base_offsets64_$1(i8* nocapture %base, <WIDTH x i64> %offsets,
-                                           i32 %offset_scale, <WIDTH x $1> %values, <WIDTH x i1> %mask) nounwind
-declare void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,
-                             <WIDTH x i1> %mask) nounwind
-declare void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,
-                             <WIDTH x i1> %mask) nounwind
+declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
+                                           i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
+declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
+                                           i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
+declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
+                             <WIDTH x i1>) nounwind
+declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
+                             <WIDTH x i1>) nounwind
 ')

 gather_scatter(i8)
@@ -261,17 +248,17 @@ gather_scatter(i16)
 gather_scatter(i32)
 gather_scatter(i64)

-declare i32 @__packed_load_active(i32 * nocapture %startptr, <WIDTH x i32> * nocapture %val_ptr,
-                                  <WIDTH x i1> %full_mask) nounwind
-declare i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
-                                   <WIDTH x i1> %full_mask) nounwind
+declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
+                                  <WIDTH x i1>) nounwind
+declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
+                                   <WIDTH x i1>) nounwind


 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; prefetch

-declare void @__prefetch_read_uniform_1(i8 *) nounwind readnone
-declare void @__prefetch_read_uniform_2(i8 *) nounwind readnone
-declare void @__prefetch_read_uniform_3(i8 *) nounwind readnone
-declare void @__prefetch_read_uniform_nt(i8 *) nounwind readnone
+declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind
+declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
+declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
+declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind

@@ -2192,9 +2192,8 @@ i64minmax(WIDTH,max,uint64,ugt)
 ;; $2: element type for which to emit the function (i32, i64, ...)
 ;; $3: suffix for function name (32, 64, ...)


 define(`load_and_broadcast', `
-define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
+define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline {
   %ptr = bitcast i8 * %0 to $2 *
   %val = load $2 * %ptr

@@ -2536,9 +2535,9 @@ declare i64 @llvm.cttz.i64(i64)

 define(`reduce_equal_aux', `
 define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue,
-                             <$1 x i32> %mask) nounwind alwaysinline {
+                             <$1 x MASK> %mask) nounwind alwaysinline {
 entry:
-  %mm = call i32 @__movmsk(<$1 x i32> %mask)
+  %mm = call i32 @__movmsk(<$1 x MASK> %mask)
   %allon = icmp eq i32 %mm, eval((1<<$1)-1)
   br i1 %allon, label %check_neighbors, label %domixed

@@ -2560,7 +2559,7 @@ domixed:
   store <$1 x $2> %basesmear, <$1 x $2> * %ptr
   %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> *
   %castv = bitcast <$1 x $2> %v to <$1 x $4>
-  call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x i32> %mask)
+  call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask)
   %blendvec = load <$1 x $2> * %ptr
   br label %check_neighbors

@@ -2574,8 +2573,10 @@ check_neighbors:
   %castvr = call <$1 x $4> @__rotate_int$6(<$1 x $4> %castvec, i32 1)
   %vr = bitcast <$1 x $4> %castvr to <$1 x $2>
   %eq = $5 eq <$1 x $2> %vec, %vr
-  %eq32 = sext <$1 x i1> %eq to <$1 x i32>
-  %eqmm = call i32 @__movmsk(<$1 x i32> %eq32)
+ifelse(MASK,i32, `
+  %eq32 = sext <$1 x i1> %eq to <$1 x i32>
+  %eqmm = call i32 @__movmsk(<$1 x i32> %eq32)', `
+  %eqmm = call i32 @__movmsk(<$1 x MASK> %eq)')
   %alleq = icmp eq i32 %eqmm, eval((1<<$1)-1)
   br i1 %alleq, label %all_equal, label %not_all_equal
 ', `
ctx.cpp (14 changed lines)
@@ -1945,6 +1945,20 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
         else
             maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64");
     }
+    else if (valueType == AtomicType::VaryingBool &&
+             g->target.maskBitCount == 1) {
+        llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask,
+                                              LLVMMaskAllOn, "~mask");
+        llvm::Value *old = LoadInst(ptr);
+        llvm::Value *maskedOld = BinaryOperator(llvm::Instruction::And, old,
+                                                notMask, "old&~mask");
+        llvm::Value *maskedNew = BinaryOperator(llvm::Instruction::And, value,
+                                                mask, "new&mask");
+        llvm::Value *final = BinaryOperator(llvm::Instruction::Or, maskedOld,
+                                            maskedNew, "old_new_result");
+        StoreInst(final, ptr);
+        return;
+    }
     else if (valueType == AtomicType::VaryingDouble ||
              valueType == AtomicType::VaryingInt64 ||
              valueType == AtomicType::VaryingUInt64) {
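
The new VaryingBool branch above handles masked stores of varying bools when the target keeps one mask bit per lane: instead of calling a masked-store builtin, it blends the old and new values bitwise, keeping the old contents wherever the mask is off. Roughly, the IR it builds looks like the sketch below, which assumes the stored value and the mask are both <WIDTH x i1> and uses %allon as a stand-in for the all-on mask constant; the real code constructs these instructions through the FunctionEmitContext helpers shown in the hunk.

    %notmask = xor <WIDTH x i1> %mask, %allon          ; ~mask
    %old = load <WIDTH x i1> * %ptr                    ; current memory contents
    %oldpart = and <WIDTH x i1> %old, %notmask         ; old & ~mask
    %newpart = and <WIDTH x i1> %value, %mask          ; new & mask
    %blend = or <WIDTH x i1> %oldpart, %newpart        ; new where mask is on, old elsewhere
    store <WIDTH x i1> %blend, <WIDTH x i1> * %ptr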
stdlib.ispc (12 changed lines)
@@ -312,14 +312,14 @@ static inline int popcnt(int v) {
     int r;
     for (uniform int i = 0; i < programCount; ++i)
         r = insert(r, i, popcnt(extract(v, i)));
-    return (r & __mask);
+    return __mask ? r : 0;
 }

 static inline int popcnt(int64 v) {
     int r;
     for (uniform int i = 0; i < programCount; ++i)
         r = insert(r, i, popcnt(extract(v, i)));
-    return (r & __mask);
+    return __mask ? r : 0;
 }

 static inline uniform int popcnt(bool v) {
@@ -589,7 +589,7 @@ static inline uniform float reduce_max(float v) {

 static inline uniform int reduce_add(int x) {
     // Zero out the values for lanes that aren't running
-    return __reduce_add_int32(x & __mask);
+    return __reduce_add_int32(__mask ? x : 0);
 }

 static inline uniform int reduce_min(int v) {
@@ -609,7 +609,7 @@ static inline uniform int reduce_max(int v) {
 static inline uniform unsigned int reduce_add(unsigned int x) {
     // Set values for non-running lanes to zero so they don't affect the
     // result.
-    return __reduce_add_uint32(x & __mask);
+    return __reduce_add_uint32(__mask ? x : 0);
 }

 static inline uniform unsigned int reduce_min(unsigned int v) {
@@ -647,7 +647,7 @@ static inline uniform double reduce_max(double v) {

 static inline uniform int64 reduce_add(int64 x) {
     // Zero out the values for lanes that aren't running
-    return __reduce_add_int64(x & (int64)(__mask));
+    return __reduce_add_int64(__mask ? x : 0);
 }

 static inline uniform int64 reduce_min(int64 v) {
@@ -667,7 +667,7 @@ static inline uniform int64 reduce_max(int64 v) {
 static inline uniform unsigned int64 reduce_add(unsigned int64 x) {
     // Set values for non-running lanes to zero so they don't affect the
     // result.
-    return __reduce_add_int64(x & (int64)(__mask));
+    return __reduce_add_int64(__mask ? x : 0);
 }

 static inline uniform unsigned int64 reduce_min(unsigned int64 v) {