From 848a4326406a69b629b819152bfd91be1f66d309 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 4 Jan 2012 12:26:22 -0800 Subject: [PATCH] Fix various small things that were broken with single-bit-per-lane masks. Also small cleanups to declarations, "no captures" added, etc. --- builtins/target-generic-common.ll | 67 +++++++++++++------------------ builtins/util.m4 | 15 +++---- ctx.cpp | 14 +++++++ stdlib.ispc | 12 +++--- 4 files changed, 55 insertions(+), 53 deletions(-) diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 4c815de1..3d123fcf 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -33,7 +33,6 @@ define(`MASK',`i1') include(`util.m4') stdlib_core() - scans() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -96,7 +95,7 @@ declare float @__rsqrt_uniform_float(float) nounwind readnone declare float @__rcp_uniform_float(float) nounwind readnone declare float @__sqrt_uniform_float(float) nounwind readnone declare @__rcp_varying_float() nounwind readnone -declare @__rsqrt_varying_float( %v) nounwind readnone +declare @__rsqrt_varying_float() nounwind readnone declare @__sqrt_varying_float() nounwind readnone declare double @__sqrt_uniform_double(double) nounwind readnone @@ -142,7 +141,7 @@ declare i32 @__reduce_add_int32() nounwind readnone declare i32 @__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone -declare i32 @__reduce_add_uint32( %v) nounwind readnone +declare i32 @__reduce_add_uint32() nounwind readnone declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -154,7 +153,7 @@ declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone -declare i64 @__reduce_add_uint64( %v) nounwind readnone +declare i64 @__reduce_add_uint64() nounwind readnone declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone @@ -189,7 +188,6 @@ declare void @__masked_store_32(* nocapture, , declare void @__masked_store_64(* nocapture, , %mask) nounwind -ifelse(LLVM_VERSION,LLVM_3_1svn,` define void @__masked_store_blend_8(* nocapture, , ) nounwind { %v = load * %0 @@ -221,39 +219,28 @@ define void @__masked_store_blend_64(* nocapture, store %v1, * %0 ret void } -',` -declare void @__masked_store_blend_8(* nocapture, , - ) nounwind -declare void @__masked_store_blend_16(* nocapture, , - ) nounwind -declare void @__masked_store_blend_32(* nocapture, , - ) nounwind -declare void @__masked_store_blend_64(* nocapture %ptr, - %new, - %mask) nounwind -') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter define(`gather_scatter', ` -declare @__gather_base_offsets32_$1(i8 * nocapture %ptr, %offsets, - i32 %offset_scale, %vecmask) nounwind readonly -declare @__gather_base_offsets64_$1(i8 * nocapture %ptr, %offsets, - i32 %offset_scale, %vecmask) nounwind readonly -declare @__gather32_$1( %ptrs, - %vecmask) nounwind readonly -declare @__gather64_$1( %ptrs, - %vecmask) nounwind readonly +declare @__gather_base_offsets32_$1(i8 * nocapture, , + i32, ) nounwind readonly +declare @__gather_base_offsets64_$1(i8 * nocapture, , + i32, ) nounwind readonly +declare @__gather32_$1(, + ) nounwind readonly +declare @__gather64_$1(, + ) nounwind readonly -declare void @__scatter_base_offsets32_$1(i8* nocapture %base, %offsets, - i32 %offset_scale, %values, %mask) nounwind -declare void @__scatter_base_offsets64_$1(i8* nocapture %base, %offsets, - i32 %offset_scale, %values, %mask) nounwind -declare void @__scatter32_$1( %ptrs, %values, - %mask) nounwind -declare void @__scatter64_$1( %ptrs, %values, - %mask) nounwind +declare void @__scatter_base_offsets32_$1(i8* nocapture, , + i32, , ) nounwind +declare void @__scatter_base_offsets64_$1(i8* nocapture, , + i32, , ) nounwind +declare void @__scatter32_$1(, , + ) nounwind +declare void @__scatter64_$1(, , + ) nounwind ') gather_scatter(i8) @@ -261,17 +248,17 @@ gather_scatter(i16) gather_scatter(i32) gather_scatter(i64) -declare i32 @__packed_load_active(i32 * nocapture %startptr, * nocapture %val_ptr, - %full_mask) nounwind -declare i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind +declare i32 @__packed_load_active(i32 * nocapture, * nocapture, + ) nounwind +declare i32 @__packed_store_active(i32 * nocapture, %vals, + ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; prefetch -declare void @__prefetch_read_uniform_1(i8 *) nounwind readnone -declare void @__prefetch_read_uniform_2(i8 *) nounwind readnone -declare void @__prefetch_read_uniform_3(i8 *) nounwind readnone -declare void @__prefetch_read_uniform_nt(i8 *) nounwind readnone +declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind +declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind +declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind +declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind diff --git a/builtins/util.m4 b/builtins/util.m4 index 5eaac67e..15f3df11 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -2192,9 +2192,8 @@ i64minmax(WIDTH,max,uint64,ugt) ;; $2: element type for which to emit the function (i32, i64, ...) ;; $3: suffix for function name (32, 64, ...) - define(`load_and_broadcast', ` -define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline { +define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline { %ptr = bitcast i8 * %0 to $2 * %val = load $2 * %ptr @@ -2536,9 +2535,9 @@ declare i64 @llvm.cttz.i64(i64) define(`reduce_equal_aux', ` define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { entry: - %mm = call i32 @__movmsk(<$1 x i32> %mask) + %mm = call i32 @__movmsk(<$1 x MASK> %mask) %allon = icmp eq i32 %mm, eval((1<<$1)-1) br i1 %allon, label %check_neighbors, label %domixed @@ -2560,7 +2559,7 @@ domixed: store <$1 x $2> %basesmear, <$1 x $2> * %ptr %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> * %castv = bitcast <$1 x $2> %v to <$1 x $4> - call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x i32> %mask) + call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask) %blendvec = load <$1 x $2> * %ptr br label %check_neighbors @@ -2574,8 +2573,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_int$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 eq <$1 x $2> %vec, %vr - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i32 @__movmsk(<$1 x i32> %eq32) + ifelse(MASK,i32, ` + %eq32 = sext <$1 x i1> %eq to <$1 x i32> + %eqmm = call i32 @__movmsk(<$1 x i32> %eq32)', ` + %eqmm = call i32 @__movmsk(<$1 x MASK> %eq)') %alleq = icmp eq i32 %eqmm, eval((1<<$1)-1) br i1 %alleq, label %all_equal, label %not_all_equal ', ` diff --git a/ctx.cpp b/ctx.cpp index 694a3b1d..fb7ec7f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1945,6 +1945,20 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64"); } + else if (valueType == AtomicType::VaryingBool && + g->target.maskBitCount == 1) { + llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, + LLVMMaskAllOn, "~mask"); + llvm::Value *old = LoadInst(ptr); + llvm::Value *maskedOld = BinaryOperator(llvm::Instruction::And, old, + notMask, "old&~mask"); + llvm::Value *maskedNew = BinaryOperator(llvm::Instruction::And, value, + mask, "new&mask"); + llvm::Value *final = BinaryOperator(llvm::Instruction::Or, maskedOld, + maskedNew, "old_new_result"); + StoreInst(final, ptr); + return; + } else if (valueType == AtomicType::VaryingDouble || valueType == AtomicType::VaryingInt64 || valueType == AtomicType::VaryingUInt64) { diff --git a/stdlib.ispc b/stdlib.ispc index c3b02fa7..667c2e0e 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -312,14 +312,14 @@ static inline int popcnt(int v) { int r; for (uniform int i = 0; i < programCount; ++i) r = insert(r, i, popcnt(extract(v, i))); - return (r & __mask); + return __mask ? r : 0; } static inline int popcnt(int64 v) { int r; for (uniform int i = 0; i < programCount; ++i) r = insert(r, i, popcnt(extract(v, i))); - return (r & __mask); + return __mask ? r : 0; } static inline uniform int popcnt(bool v) { @@ -589,7 +589,7 @@ static inline uniform float reduce_max(float v) { static inline uniform int reduce_add(int x) { // Zero out the values for lanes that aren't running - return __reduce_add_int32(x & __mask); + return __reduce_add_int32(__mask ? x : 0); } static inline uniform int reduce_min(int v) { @@ -609,7 +609,7 @@ static inline uniform int reduce_max(int v) { static inline uniform unsigned int reduce_add(unsigned int x) { // Set values for non-running lanes to zero so they don't affect the // result. - return __reduce_add_uint32(x & __mask); + return __reduce_add_uint32(__mask ? x : 0); } static inline uniform unsigned int reduce_min(unsigned int v) { @@ -647,7 +647,7 @@ static inline uniform double reduce_max(double v) { static inline uniform int64 reduce_add(int64 x) { // Zero out the values for lanes that aren't running - return __reduce_add_int64(x & (int64)(__mask)); + return __reduce_add_int64(__mask ? x : 0); } static inline uniform int64 reduce_min(int64 v) { @@ -667,7 +667,7 @@ static inline uniform int64 reduce_max(int64 v) { static inline uniform unsigned int64 reduce_add(unsigned int64 x) { // Set values for non-running lanes to zero so they don't affect the // result. - return __reduce_add_int64(x & (int64)(__mask)); + return __reduce_add_int64(__mask ? x : 0); } static inline uniform unsigned int64 reduce_min(unsigned int64 v) {