From b86d40091a63a95660d1a210e057bbb1fa0b9df4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 7 Jun 2012 13:51:08 -0700 Subject: [PATCH] Improve naming of masked load/store instructions in builtins. Now, use _i32 suffixes, rather than _32, etc. Also cleaned up the m4 macro to generate these functions, using WIDTH to get the target width, etc. --- builtins/target-avx-x2.ll | 28 +++--- builtins/target-avx.ll | 29 +++--- builtins/target-generic-1.ll | 30 +++--- builtins/target-generic-common.ll | 48 ++++----- builtins/target-sse2-x2.ll | 24 ++--- builtins/target-sse2.ll | 24 ++--- builtins/target-sse4-x2.ll | 25 +++-- builtins/target-sse4.ll | 24 ++--- builtins/util.m4 | 160 ++++++++++++++---------------- ctx.cpp | 16 ++- examples/intrinsics/generic-16.h | 56 ++++++----- examples/intrinsics/sse4.h | 51 +++++----- opt.cpp | 92 ++++++++--------- 13 files changed, 299 insertions(+), 308 deletions(-) diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index ddb9f095..9bb73d88 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -359,13 +359,13 @@ load_and_broadcast(i32) load_and_broadcast(i64) ; no masked load instruction for i8 and i16 types?? -masked_load(16, i8, 8, 1) -masked_load(16, i16, 16, 2) +masked_load(i8, 1) +masked_load(i16, 2) declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask) declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask) -define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline { +define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline { %floatmask = bitcast <16 x i32> %mask to <16 x float> %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef, <8 x i32> @@ -383,7 +383,7 @@ define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin } -define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline { +define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline { ; double up masks, bitcast to doubles %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef, <8 x i32> @@ -424,15 +424,15 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin ; FIXME: there is no AVX instruction for these, but we could be clever ; by packing the bits down and setting the last 3/4 or half, respectively, ; of the mask to zero... Not sure if this would be a win in the end -gen_masked_store(16, i8, 8) -gen_masked_store(16, i16, 16) +gen_masked_store(i8) +gen_masked_store(i16) ; note that mask is the 2nd parameter, not the 3rd one!! 
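As context for the AVX lowering above: AVX only provides masked moves for float and double vectors (the maskload/maskstore .ps.256 and .pd.256 forms), so the integer builtins bitcast the i32 mask and values to float vectors and, on the 16-wide x2 target, split the work into two 8-wide halves. A minimal C++ sketch of the same trick using the standard <immintrin.h> intrinsics follows; the helper names and calling convention are illustrative, not part of the patch.

#include <immintrin.h>
#include <stdint.h>

// Masked store of 8 int32 lanes: lanes whose mask element has its sign bit
// set are written; the rest of memory is left untouched.  Note that the mask
// is the 2nd argument of the maskstore form, as the comment above warns.
static inline void avx_masked_store_i32(int32_t *ptr, __m256i val, __m256i mask) {
    _mm256_maskstore_ps((float *)ptr, mask, _mm256_castsi256_ps(val));
}

// Masked load of 8 int32 lanes; lanes with the mask off come back as zero.
static inline __m256i avx_masked_load_i32(const int32_t *ptr, __m256i mask) {
    return _mm256_castps_si256(_mm256_maskload_ps((const float *)ptr, mask));
}
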
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>) declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>) -define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>, - <16 x i32>) nounwind alwaysinline { +define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x i32>) nounwind alwaysinline { %ptr = bitcast <16 x i32> * %0 to i8 * %val = bitcast <16 x i32> %1 to <16 x float> %mask = bitcast <16 x i32> %2 to <16 x float> @@ -454,8 +454,8 @@ define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>, ret void } -define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>, - <16 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i32> %mask) nounwind alwaysinline { %ptr = bitcast <16 x i64> * %0 to i8 * %val = bitcast <16 x i64> %1 to <16 x double> @@ -499,8 +499,8 @@ masked_store_blend_8_16_by_16() declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone -define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>, - <16 x i32>) nounwind alwaysinline { +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x i32>) nounwind alwaysinline { %maskAsFloat = bitcast <16 x i32> %2 to <16 x float> %oldValue = load <16 x i32>* %0, align 4 %oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float> @@ -537,8 +537,8 @@ define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>, declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone -define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64, - <16 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64, + <16 x i32> %mask) nounwind alwaysinline { %oldValue = load <16 x i64>* %ptr, align 8 %old = bitcast <16 x i64> %oldValue to <16 x double> %old0d = shufflevector <16 x double> %old, <16 x double> undef, diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 856e4453..e5ded22a 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -340,13 +340,13 @@ load_and_broadcast(i32) load_and_broadcast(i64) ; no masked load instruction for i8 and i16 types?? 
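The __masked_store_blend_* variants above take the read-modify-write path instead: load the old contents, select per lane with blendvps/blendvpd, and store the full vector back. A hedged sketch of that pattern for the 8-wide i32 case, again with invented helper names and assuming the whole vector's memory is safe to read and write:

#include <immintrin.h>
#include <stdint.h>

// Blend-based masked store: keep old lanes where the mask sign bit is clear,
// take new lanes where it is set, then write the whole vector back.
static inline void avx_masked_store_blend_i32(int32_t *ptr, __m256i val, __m256i mask) {
    __m256 oldv    = _mm256_loadu_ps((const float *)ptr);
    __m256 blended = _mm256_blendv_ps(oldv, _mm256_castsi256_ps(val),
                                      _mm256_castsi256_ps(mask));
    _mm256_storeu_ps((float *)ptr, blended);
}
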
-masked_load(8, i8, 8, 1) -masked_load(8, i16, 16, 2) +masked_load(i8, 1) +masked_load(i16, 2) declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask) declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask) -define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline { +define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline { %floatmask = bitcast <8 x i32> %mask to <8 x float> %floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask) %retval = bitcast <8 x float> %floatval to <8 x i32> @@ -354,7 +354,7 @@ define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline } -define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline { +define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline { ; double up masks, bitcast to doubles %mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef, <8 x i32> @@ -377,15 +377,15 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store -gen_masked_store(8, i8, 8) -gen_masked_store(8, i16, 16) +gen_masked_store(i8) +gen_masked_store(i16) ; note that mask is the 2nd parameter, not the 3rd one!! declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>) declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>) -define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>, - <8 x i32>) nounwind alwaysinline { +define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x i32>) nounwind alwaysinline { %ptr = bitcast <8 x i32> * %0 to i8 * %val = bitcast <8 x i32> %1 to <8 x float> %mask = bitcast <8 x i32> %2 to <8 x float> @@ -393,8 +393,8 @@ define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>, ret void } -define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>, - <8 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x i32> %mask) nounwind alwaysinline { %ptr = bitcast <8 x i64> * %0 to i8 * %val = bitcast <8 x i64> %1 to <8 x double> @@ -418,14 +418,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>, } - masked_store_blend_8_16_by_8() declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone -define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, - <8 x i32>) nounwind alwaysinline { +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x i32>) nounwind alwaysinline { %mask_as_float = bitcast <8 x i32> %2 to <8 x float> %oldValue = load <8 x i32>* %0, align 4 %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float> @@ -439,8 +438,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, } -define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, - <8 x i32> %i32mask) nounwind alwaysinline { +define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new, + <8 x i32> %i32mask) nounwind alwaysinline { %oldValue = load <8 x i64>* %ptr, align 8 %mask = bitcast <8 x i32> %i32mask to <8 x float> diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 13343495..e87c3c0a 100755 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -13,10 +13,10 @@ aossoa() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store 
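On targets such as generic-1 that have no masked memory instructions at all, the gen_masked_store() and masked_load() macros (defined later in util.m4) expand, in spirit, to a per-lane scalar loop guarded by the mask. A rough C++ equivalent of that expansion, with the array-based calling convention and the names made up for illustration:

#include <stdint.h>

// Store only the active lanes.
template <typename T, int WIDTH>
static inline void generic_masked_store(T *ptr, const T (&val)[WIDTH],
                                        const int32_t (&mask)[WIDTH]) {
    for (int i = 0; i < WIDTH; ++i)
        if (mask[i] != 0)
            ptr[i] = val[i];
}

// Load only the active lanes; inactive lanes of the result are left untouched.
template <typename T, int WIDTH>
static inline void generic_masked_load(const T *ptr, T (&result)[WIDTH],
                                       const int32_t (&mask)[WIDTH]) {
    for (int i = 0; i < WIDTH; ++i)
        if (mask[i] != 0)
            result[i] = ptr[i];
}

The masked_load macro in util.m4 additionally takes a fast path: when the first and the last lane are both on, a single full-width vector load is safe regardless of what the middle lanes want.
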
-gen_masked_store(1, i8, 8) -gen_masked_store(1, i16, 16) -gen_masked_store(1, i32, 32) -gen_masked_store(1, i64, 64) +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts @@ -26,10 +26,10 @@ load_and_broadcast(i16) load_and_broadcast(i32) load_and_broadcast(i64) -masked_load(1, i8, 8, 1) -masked_load(1, i16, 16, 2) -masked_load(1, i32, 32, 4) -masked_load(1, i64, 64, 8) +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(i64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter @@ -155,23 +155,23 @@ define <1 x float> @__vselect_float(<1 x float>, <1 x float>, ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store -define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>, +define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>, <1 x i32> %mask) nounwind alwaysinline { %val = load <1 x i8> * %0, align 4 %newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask) store <1 x i8> %newval, <1 x i8> * %0, align 4 ret void } -define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>, - <1 x i32> %mask) nounwind alwaysinline { + +define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>, + <1 x i32> %mask) nounwind alwaysinline { %val = load <1 x i16> * %0, align 4 %newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask) store <1 x i16> %newval, <1 x i16> * %0, align 4 ret void } - -define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>, +define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>, <1 x i32> %mask) nounwind alwaysinline { %val = load <1 x i32> * %0, align 4 %newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask) @@ -179,8 +179,8 @@ define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>, ret void } -define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>, - <1 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>, + <1 x i32> %mask) nounwind alwaysinline { %val = load <1 x i64> * %0, align 4 %newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask) store <1 x i64> %newval, <1 x i64> * %0, align 4 diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 4a7b5c2a..e61057f6 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -231,36 +231,36 @@ declare i64 @__reduce_max_uint64() nounwind readnone ;; unaligned loads/loads+broadcasts -declare @__masked_load_8(i8 * nocapture, %mask) nounwind readonly -declare @__masked_load_16(i8 * nocapture, %mask) nounwind readonly -declare @__masked_load_32(i8 * nocapture, %mask) nounwind readonly -declare @__masked_load_64(i8 * nocapture, %mask) nounwind readonly load_and_broadcast(i8) load_and_broadcast(i16) load_and_broadcast(i32) load_and_broadcast(i64) -declare void @__masked_store_8(* nocapture, , - ) nounwind -declare void @__masked_store_16(* nocapture, , +declare @__masked_load_i8(i8 * nocapture, %mask) nounwind readonly +declare @__masked_load_i16(i8 * nocapture, %mask) nounwind readonly +declare @__masked_load_i32(i8 * nocapture, %mask) nounwind readonly +declare @__masked_load_i64(i8 * nocapture, %mask) nounwind readonly +declare void @__masked_store_i8(* 
nocapture, , ) nounwind -declare void @__masked_store_32(* nocapture, , - ) nounwind -declare void @__masked_store_64(* nocapture, , - %mask) nounwind +declare void @__masked_store_i16(* nocapture, , + ) nounwind +declare void @__masked_store_i32(* nocapture, , + ) nounwind +declare void @__masked_store_i64(* nocapture, , + %mask) nounwind ifelse(LLVM_VERSION, `LLVM_3_0', ` -declare void @__masked_store_blend_8(* nocapture, , - ) nounwind -declare void @__masked_store_blend_16(* nocapture, , +declare void @__masked_store_blend_i8(* nocapture, , ) nounwind -declare void @__masked_store_blend_32(* nocapture, , - ) nounwind -declare void @__masked_store_blend_64(* nocapture, , - %mask) nounwind +declare void @__masked_store_blend_i16(* nocapture, , + ) nounwind +declare void @__masked_store_blend_i32(* nocapture, , + ) nounwind +declare void @__masked_store_blend_i64(* nocapture, , + %mask) nounwind ', ` -define void @__masked_store_blend_8(* nocapture, , +define void @__masked_store_blend_i8(* nocapture, , ) nounwind alwaysinline { %v = load * %0 %v1 = select %2, %1, %v @@ -268,23 +268,23 @@ define void @__masked_store_blend_8(* nocapture, , ret void } -define void @__masked_store_blend_16(* nocapture, , - ) nounwind alwaysinline { +define void @__masked_store_blend_i16(* nocapture, , + ) nounwind alwaysinline { %v = load * %0 %v1 = select %2, %1, %v store %v1, * %0 ret void } -define void @__masked_store_blend_32(* nocapture, , - ) nounwind alwaysinline { +define void @__masked_store_blend_i32(* nocapture, , + ) nounwind alwaysinline { %v = load * %0 %v1 = select %2, %1, %v store %v1, * %0 ret void } -define void @__masked_store_blend_64(* nocapture, +define void @__masked_store_blend_i64(* nocapture, , ) nounwind alwaysinline { %v = load * %0 %v1 = select %2, %1, %v diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index fb8568fd..2574b81f 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -434,14 +434,14 @@ reduce_equal(8) ;; unaligned loads/loads+broadcasts -masked_load(8, i8, 8, 1) -masked_load(8, i16, 16, 2) -masked_load(8, i32, 32, 4) -masked_load(8, i64, 64, 8) load_and_broadcast(i8) load_and_broadcast(i16) load_and_broadcast(i32) load_and_broadcast(i64) +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(i64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter @@ -558,23 +558,23 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store -gen_masked_store(8, i8, 8) -gen_masked_store(8, i16, 16) -gen_masked_store(8, i32, 32) -gen_masked_store(8, i64, 64) +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) masked_store_blend_8_16_by_8() -define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, - <8 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x i32> %mask) nounwind alwaysinline { %val = load <8 x i32> * %0, align 4 %newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask) store <8 x i32> %newval, <8 x i32> * %0, align 4 ret void } -define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, - <8 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new, + <8 x i32> %mask) nounwind alwaysinline { %oldValue = load <8 x i64>* %ptr, 
align 8 ; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index c1182767..2275cf1b 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -350,16 +350,16 @@ reduce_equal(4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store -define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, - <4 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>, + <4 x i32> %mask) nounwind alwaysinline { %val = load <4 x i32> * %0, align 4 %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask) store <4 x i32> %newval, <4 x i32> * %0, align 4 ret void } -define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, - <4 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new, + <4 x i32> %mask) nounwind alwaysinline { %oldValue = load <4 x i64>* %ptr, align 8 ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values @@ -552,10 +552,10 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r masked_store_blend_8_16_by_4() -gen_masked_store(4, i8, 8) -gen_masked_store(4, i16, 16) -gen_masked_store(4, i32, 32) -gen_masked_store(4, i64, 64) +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts @@ -565,10 +565,10 @@ load_and_broadcast(i16) load_and_broadcast(i32) load_and_broadcast(i64) -masked_load(4, i8, 8, 1) -masked_load(4, i16, 16, 2) -masked_load(4, i32, 32, 4) -masked_load(4, i64, 64, 8) +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(i64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 48a14b70..a4416409 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -360,15 +360,14 @@ reduce_equal(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts - -masked_load(8, i8, 8, 1) -masked_load(8, i16, 16, 2) -masked_load(8, i32, 32, 4) -masked_load(8, i64, 64, 8) load_and_broadcast(i8) load_and_broadcast(i16) load_and_broadcast(i32) load_and_broadcast(i64) +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(i64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter @@ -444,18 +443,18 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store -gen_masked_store(8, i8, 8) -gen_masked_store(8, i16, 16) -gen_masked_store(8, i32, 32) -gen_masked_store(8, i64, 64) +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) masked_store_blend_8_16_by_8() declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone -define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, - <8 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x i32> %mask) nounwind alwaysinline { ; do two 4-wide blends with blendvps %mask_as_float = bitcast <8 x i32> %mask to <8 x float> 
%mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef, @@ -484,8 +483,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, ret void } -define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, - <8 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new, + <8 x i32> %mask) nounwind alwaysinline { ; implement this as 4 blends of <4 x i32>s, which are actually bitcast ; <2 x i64>s... diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 2bf3104d..c31a23b2 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -384,8 +384,8 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone -define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, - <4 x i32> %mask) nounwind alwaysinline { +define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>, + <4 x i32> %mask) nounwind alwaysinline { %mask_as_float = bitcast <4 x i32> %mask to <4 x float> %oldValue = load <4 x i32>* %0, align 4 %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> @@ -399,8 +399,8 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, } -define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, - <4 x i32> %i32mask) nounwind alwaysinline { +define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new, + <4 x i32> %i32mask) nounwind alwaysinline { %oldValue = load <4 x i64>* %ptr, align 8 %mask = bitcast <4 x i32> %i32mask to <4 x float> @@ -451,10 +451,10 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, masked_store_blend_8_16_by_4() -gen_masked_store(4, i8, 8) -gen_masked_store(4, i16, 16) -gen_masked_store(4, i32, 32) -gen_masked_store(4, i64, 64) +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts @@ -464,10 +464,10 @@ load_and_broadcast(i16) load_and_broadcast(i32) load_and_broadcast(i64) -masked_load(4, i8, 8, 1) -masked_load(4, i16, 16, 2) -masked_load(4, i32, 32, 4) -masked_load(4, i64, 64, 8) +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(i64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter diff --git a/builtins/util.m4 b/builtins/util.m4 index deb2fac8..212f6076 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1549,19 +1549,19 @@ declare i1 @__is_compile_time_constant_varying_int32() ; This function declares placeholder masked store functions for the ; front-end to use. ; -; void __pseudo_masked_store_8 (uniform int8 *ptr, varying int8 values, mask) -; void __pseudo_masked_store_16(uniform int16 *ptr, varying int16 values, mask) -; void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask) -; void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask) +; void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask) +; void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask) +; void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask) +; void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask) ; ; These in turn are converted to native masked stores or to regular ; stores (if the mask is all on) by the MaskedStoreOptPass optimization ; pass. 
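A hedged sketch of the decision the comment above describes, written as runtime C++ rather than as the compiler pass itself: depending on what is known about the mask, a pseudo masked store becomes a plain store (all lanes on), a native masked store (mixed mask), or nothing at all (no lanes on). All names here are illustrative.

#include <immintrin.h>
#include <stdint.h>

static inline void resolve_pseudo_masked_store_i32(int32_t *ptr, __m256i val,
                                                   __m256i mask) {
    // __movmsk equivalent: collapse the per-lane sign bits into one integer.
    int bits = _mm256_movemask_ps(_mm256_castsi256_ps(mask));
    if (bits == 0xff) {
        // All lanes on: an ordinary unmasked store is legal and cheaper.
        _mm256_storeu_si256((__m256i *)ptr, val);
    } else if (bits != 0) {
        // Mixed mask: fall back to a real masked store.
        _mm256_maskstore_ps((float *)ptr, mask, _mm256_castsi256_ps(val));
    }
    // bits == 0: every lane is off, so there is nothing to store.
}
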
-declare void @__pseudo_masked_store_8( * nocapture, , ) -declare void @__pseudo_masked_store_16( * nocapture, , ) -declare void @__pseudo_masked_store_32( * nocapture, , ) -declare void @__pseudo_masked_store_64( * nocapture, , ) +declare void @__pseudo_masked_store_i8( * nocapture, , ) +declare void @__pseudo_masked_store_i16( * nocapture, , ) +declare void @__pseudo_masked_store_i32( * nocapture, , ) +declare void @__pseudo_masked_store_i64( * nocapture, , ) ; Declare the pseudo-gather functions. When the ispc front-end needs ; to perform a gather, it generates a call to one of these functions, @@ -1692,13 +1692,13 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %mask) { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; loads - %ml8 = call @__masked_load_8(i8 * %ptr, %mask) + %ml8 = call @__masked_load_i8(i8 * %ptr, %mask) call void @__use8( %ml8) - %ml16 = call @__masked_load_16(i8 * %ptr, %mask) + %ml16 = call @__masked_load_i16(i8 * %ptr, %mask) call void @__use16( %ml16) - %ml32 = call @__masked_load_32(i8 * %ptr, %mask) + %ml32 = call @__masked_load_i32(i8 * %ptr, %mask) call void @__use32( %ml32) - %ml64 = call @__masked_load_64(i8 * %ptr, %mask) + %ml64 = call @__masked_load_i64(i8 * %ptr, %mask) call void @__use64( %ml64) %lb8 = call @__load_and_broadcast_i8(i8 * %ptr, %mask) @@ -1713,31 +1713,29 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stores %pv8 = bitcast i8 * %ptr to * - call void @__pseudo_masked_store_8( * %pv8, %v8, - %mask) + call void @__pseudo_masked_store_i8( * %pv8, %v8, + %mask) %pv16 = bitcast i8 * %ptr to * - call void @__pseudo_masked_store_16( * %pv16, %v16, - %mask) + call void @__pseudo_masked_store_i16( * %pv16, %v16, + %mask) %pv32 = bitcast i8 * %ptr to * - call void @__pseudo_masked_store_32( * %pv32, %v32, - %mask) + call void @__pseudo_masked_store_i32( * %pv32, %v32, + %mask) %pv64 = bitcast i8 * %ptr to * - call void @__pseudo_masked_store_64( * %pv64, %v64, + call void @__pseudo_masked_store_i64( * %pv64, %v64, + %mask) + call void @__masked_store_i8( * %pv8, %v8, %mask) + call void @__masked_store_i16( * %pv16, %v16, %mask) + call void @__masked_store_i32( * %pv32, %v32, %mask) + call void @__masked_store_i64( * %pv64, %v64, %mask) + call void @__masked_store_blend_i8( * %pv8, %v8, + %mask) + call void @__masked_store_blend_i16( * %pv16, %v16, + %mask) + call void @__masked_store_blend_i32( * %pv32, %v32, + %mask) + call void @__masked_store_blend_i64( * %pv64, %v64, %mask) - - call void @__masked_store_8( * %pv8, %v8, %mask) - call void @__masked_store_16( * %pv16, %v16, %mask) - call void @__masked_store_32( * %pv32, %v32, %mask) - call void @__masked_store_64( * %pv64, %v64, %mask) - - call void @__masked_store_blend_8( * %pv8, %v8, - %mask) - call void @__masked_store_blend_16( * %pv16, %v16, - %mask) - call void @__masked_store_blend_32( * %pv32, %v32, - %mask) - call void @__masked_store_blend_64( * %pv64, %v64, - %mask) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gathers @@ -2507,15 +2505,13 @@ define @__load_and_broadcast_$1(i8 *, %mask) nounwin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Emit general-purpose code to do a masked load for targets that dont have ;; an instruction to do that. Parameters: -;; $1: target vector width -;; $2: element type for which to emit the function (i32, i64, ...) -;; $3: suffix for function name (32, 64, ...) 
-;; $4: alignment for elements of type $2 (4, 8, ...) +;; $1: element type for which to emit the function (i32, i64, ...) (and suffix for function name) +;; $2: alignment for elements of type $1 (4, 8, ...) define(`masked_load', ` -define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline { +define @__masked_load_$1(i8 *, %mask) nounwind alwaysinline { entry: - %mm = call i64 @__movmsk(<$1 x MASK> %mask) + %mm = call i64 @__movmsk( %mask) ; if the first lane and the last lane are on, then it is safe to do a vector load ; of the whole thing--what the lanes in the middle want turns out to not matter... @@ -2531,14 +2527,14 @@ entry: %can_vload_maybe_fast = or i1 %fast_i1, %can_vload ; if we are not able to do a singe vload, we will accumulate lanes in this memory.. - %retptr = alloca <$1 x $2> - %retptr32 = bitcast <$1 x $2> * %retptr to $2 * + %retptr = alloca + %retptr32 = bitcast * %retptr to $1 * br i1 %can_vload_maybe_fast, label %load, label %loop load: - %ptr = bitcast i8 * %0 to <$1 x $2> * - %valall = load <$1 x $2> * %ptr, align $4 - ret <$1 x $2> %valall + %ptr = bitcast i8 * %0 to * + %valall = load * %ptr, align $2 + ret %valall loop: ; loop over the lanes and see if each one is on... @@ -2552,21 +2548,21 @@ loop: load_lane: ; yes! do the load and store the result into the appropriate place in the ; allocaed memory above - %ptr32 = bitcast i8 * %0 to $2 * - %lane_ptr = getelementptr $2 * %ptr32, i32 %lane - %val = load $2 * %lane_ptr - %store_ptr = getelementptr $2 * %retptr32, i32 %lane - store $2 %val, $2 * %store_ptr + %ptr32 = bitcast i8 * %0 to $1 * + %lane_ptr = getelementptr $1 * %ptr32, i32 %lane + %val = load $1 * %lane_ptr + %store_ptr = getelementptr $1 * %retptr32, i32 %lane + store $1 %val, $1 * %store_ptr br label %lane_done lane_done: %next_lane = add i32 %lane, 1 - %done = icmp eq i32 %lane, eval($1-1) + %done = icmp eq i32 %lane, eval(WIDTH-1) br i1 %done, label %return, label %loop return: - %r = load <$1 x $2> * %retptr - ret <$1 x $2> %r + %r = load * %retptr + ret %r } ') @@ -2574,23 +2570,21 @@ return: ;; masked store ;; emit code to do masked store as a set of per-lane scalar stores ;; parameters: -;; $1: target vector width -;; $2: llvm type of elements -;; $3: suffix for function name +;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$3(<$1 x $2>* nocapture, <$1 x $2>, <$1 x i32>) nounwind alwaysinline { - per_lane($1, <$1 x i32> %2, ` - %ptr_LANE_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE - %storeval_LANE_ID = extractelement <$1 x $2> %1, i32 LANE - store $2 %storeval_LANE_ID, $2 * %ptr_LANE_ID') +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` + %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE + %storeval_LANE_ID = extractelement %1, i32 LANE + store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') ret void } ') define(`masked_store_blend_8_16_by_4', ` -define void @__masked_store_blend_8(<4 x i8>* nocapture, <4 x i8>, - <4 x i32>) nounwind alwaysinline { +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i32>) nounwind alwaysinline { %old = load <4 x i8> * %0, align 1 ifelse(LLVM_VERSION,LLVM_3_1svn,` %m = trunc <4 x i32> %2 to <4 x i1> @@ -2613,8 +2607,8 @@ define void @__masked_store_blend_8(<4 x i8>* nocapture, <4 x i8>, ret void } -define void @__masked_store_blend_16(<4 x i16>* nocapture, <4 x i16>, - <4 x i32>) nounwind alwaysinline { +define void @__masked_store_blend_i16(<4 x 
i16>* nocapture, <4 x i16>, + <4 x i32>) nounwind alwaysinline { %old = load <4 x i16> * %0, align 2 ifelse(LLVM_VERSION,LLVM_3_1svn,` %m = trunc <4 x i32> %2 to <4 x i1> @@ -2639,8 +2633,8 @@ define void @__masked_store_blend_16(<4 x i16>* nocapture, <4 x i16>, ') define(`masked_store_blend_8_16_by_8', ` -define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>, - <8 x i32>) nounwind alwaysinline { +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x i32>) nounwind alwaysinline { %old = load <8 x i8> * %0, align 1 ifelse(LLVM_VERSION,LLVM_3_1svn,` %m = trunc <8 x i32> %2 to <8 x i1> @@ -2663,8 +2657,8 @@ define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>, ret void } -define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>, - <8 x i32>) nounwind alwaysinline { +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x i32>) nounwind alwaysinline { %old = load <8 x i16> * %0, align 2 ifelse(LLVM_VERSION,LLVM_3_1svn,` %m = trunc <8 x i32> %2 to <8 x i1> @@ -2690,8 +2684,8 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>, define(`masked_store_blend_8_16_by_16', ` -define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>, - <16 x i32>) nounwind alwaysinline { +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x i32>) nounwind alwaysinline { %old = load <16 x i8> * %0, align 1 ifelse(LLVM_VERSION,LLVM_3_1svn,` %m = trunc <16 x i32> %2 to <16 x i1> @@ -2714,8 +2708,8 @@ define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>, ret void } -define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>, - <16 x i32>) nounwind alwaysinline { +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x i32>) nounwind alwaysinline { %old = load <16 x i16> * %0, align 2 ifelse(LLVM_VERSION,LLVM_3_1svn,` %m = trunc <16 x i32> %2 to <16 x i1> @@ -2895,7 +2889,7 @@ domixed: store <$1 x $2> %basesmear, <$1 x $2> * %ptr %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> * %castv = bitcast <$1 x $2> %v to <$1 x $4> - call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask) + call void @__masked_store_blend_i$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask) %blendvec = load <$1 x $2> * %ptr br label %check_neighbors @@ -2970,8 +2964,8 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, store <$1 x $2> %idvec, <$1 x $2> * %ptr %ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> * %vi = bitcast <$1 x $2> %v to <$1 x i`'$3> - call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi, - <$1 x MASK> %mask) + call void @__masked_store_blend_i$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi, + <$1 x MASK> %mask) %v_id = load <$1 x $2> * %ptr ; extract elements of the vector to use in computing the scan @@ -3144,14 +3138,14 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 ; Set the offset to zero for lanes that are off %offsetsPtr = alloca <$1 x i32> store <$1 x i32> zeroinitializer, <$1 x i32> * %offsetsPtr - call void @__masked_store_blend_32(<$1 x i32> * %offsetsPtr, <$1 x i32> %offsets, - <$1 x i32> %vecmask) + call void @__masked_store_blend_i32(<$1 x i32> * %offsetsPtr, <$1 x i32> %offsets, + <$1 x i32> %vecmask) %newOffsets = load <$1 x i32> * %offsetsPtr %deltaPtr = alloca <$1 x i32> store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr - call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta, - <$1 x 
i32> %vecmask) + call void @__masked_store_blend_i32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta, + <$1 x i32> %vecmask) %newDelta = load <$1 x i32> * %deltaPtr %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets, @@ -3175,14 +3169,14 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 ; Set the offset to zero for lanes that are off %offsetsPtr = alloca <$1 x i64> store <$1 x i64> zeroinitializer, <$1 x i64> * %offsetsPtr - call void @__masked_store_blend_64(<$1 x i64> * %offsetsPtr, <$1 x i64> %offsets, - <$1 x i32> %vecmask) + call void @__masked_store_blend_i64(<$1 x i64> * %offsetsPtr, <$1 x i64> %offsets, + <$1 x i32> %vecmask) %newOffsets = load <$1 x i64> * %offsetsPtr %deltaPtr = alloca <$1 x i64> store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr - call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta, - <$1 x i32> %vecmask) + call void @__masked_store_blend_i64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta, + <$1 x i32> %vecmask) %newDelta = load <$1 x i64> * %deltaPtr %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets, diff --git a/ctx.cpp b/ctx.cpp index 9ad62b3b..9468a10d 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -2691,9 +2691,9 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, } if (g->target.is32Bit) - maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32"); + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); else - maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64"); + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } else if (Type::Equal(valueType, AtomicType::VaryingBool) && g->target.maskBitCount == 1) { @@ -2712,35 +2712,31 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else if (Type::Equal(valueType, AtomicType::VaryingDouble) || Type::Equal(valueType, AtomicType::VaryingInt64) || Type::Equal(valueType, AtomicType::VaryingUInt64)) { - maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64"); ptr = BitCastInst(ptr, LLVMTypes::Int64VectorPointerType, LLVMGetName(ptr, "_to_int64vecptr")); value = BitCastInst(value, LLVMTypes::Int64VectorType, LLVMGetName(value, "_to_int64")); + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } else if (Type::Equal(valueType, AtomicType::VaryingFloat) || Type::Equal(valueType, AtomicType::VaryingBool) || Type::Equal(valueType, AtomicType::VaryingInt32) || Type::Equal(valueType, AtomicType::VaryingUInt32) || CastType(valueType) != NULL) { - maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32"); ptr = BitCastInst(ptr, LLVMTypes::Int32VectorPointerType, LLVMGetName(ptr, "_to_int32vecptr")); if (Type::Equal(valueType, AtomicType::VaryingFloat)) value = BitCastInst(value, LLVMTypes::Int32VectorType, LLVMGetName(value, "_to_int32")); + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } else if (Type::Equal(valueType, AtomicType::VaryingInt16) || Type::Equal(valueType, AtomicType::VaryingUInt16)) { - maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16"); - ptr = BitCastInst(ptr, LLVMTypes::Int16VectorPointerType, - LLVMGetName(ptr, "_to_int16vecptr")); + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16"); } else if (Type::Equal(valueType, AtomicType::VaryingInt8) || Type::Equal(valueType, AtomicType::VaryingUInt8)) { - maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8"); - ptr = 
BitCastInst(ptr, LLVMTypes::Int8VectorPointerType, - LLVMGetName(ptr, "_to_int8vecptr")); + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 80c2635c..a043fb33 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1101,8 +1101,8 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) /////////////////////////////////////////////////////////////////////////// // masked load/store -static FORCEINLINE __vec16_i8 __masked_load_8(void *p, - __vec16_i1 mask) { +static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, + __vec16_i1 mask) { __vec16_i8 ret; int8_t *ptr = (int8_t *)p; for (int i = 0; i < 16; ++i) @@ -1111,8 +1111,8 @@ static FORCEINLINE __vec16_i8 __masked_load_8(void *p, return ret; } -static FORCEINLINE __vec16_i16 __masked_load_16(void *p, - __vec16_i1 mask) { +static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, + __vec16_i1 mask) { __vec16_i16 ret; int16_t *ptr = (int16_t *)p; for (int i = 0; i < 16; ++i) @@ -1121,8 +1121,8 @@ static FORCEINLINE __vec16_i16 __masked_load_16(void *p, return ret; } -static FORCEINLINE __vec16_i32 __masked_load_32(void *p, - __vec16_i1 mask) { +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, + __vec16_i1 mask) { __vec16_i32 ret; int32_t *ptr = (int32_t *)p; for (int i = 0; i < 16; ++i) @@ -1131,8 +1131,8 @@ static FORCEINLINE __vec16_i32 __masked_load_32(void *p, return ret; } -static FORCEINLINE __vec16_i64 __masked_load_64(void *p, - __vec16_i1 mask) { +static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, + __vec16_i1 mask) { __vec16_i64 ret; int64_t *ptr = (int64_t *)p; for (int i = 0; i < 16; ++i) @@ -1141,31 +1141,31 @@ static FORCEINLINE __vec16_i64 __masked_load_64(void *p, return ret; } -static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val, - __vec16_i1 mask) { +static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { int8_t *ptr = (int8_t *)p; for (int i = 0; i < 16; ++i) if ((mask.v & (1 << i)) != 0) ptr[i] = val.v[i]; } -static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val, - __vec16_i1 mask) { +static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { int16_t *ptr = (int16_t *)p; for (int i = 0; i < 16; ++i) if ((mask.v & (1 << i)) != 0) ptr[i] = val.v[i]; } -static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val, - __vec16_i1 mask) { +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { int32_t *ptr = (int32_t *)p; for (int i = 0; i < 16; ++i) if ((mask.v & (1 << i)) != 0) ptr[i] = val.v[i]; } -static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val, +static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, __vec16_i1 mask) { int64_t *ptr = (int64_t *)p; for (int i = 0; i < 16; ++i) @@ -1173,24 +1173,28 @@ static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val, ptr[i] = val.v[i]; } -static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val, - __vec16_i1 mask) { - __masked_store_8(p, val, mask); } -static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val, +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, __vec16_i1 mask) { - __masked_store_16(p, val, mask); + __masked_store_i8(p, val, mask); } -static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val, - __vec16_i1 mask) { - 
__masked_store_32(p, val, mask); +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + __masked_store_i64(p, val, mask); } -static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val, - __vec16_i1 mask) { - __masked_store_64(p, val, mask); } /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 9f301bb7..d1a93351 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2415,8 +2415,7 @@ static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) { /////////////////////////////////////////////////////////////////////////// // masked load/store -static FORCEINLINE __vec4_i8 __masked_load_8(void *p, - __vec4_i1 mask) { +static FORCEINLINE __vec4_i8 __masked_load_i8(void *p, __vec4_i1 mask) { int8_t r[4]; int8_t *ptr = (int8_t *)p; uint32_t m = _mm_extract_ps(mask.v, 0); @@ -2435,8 +2434,7 @@ static FORCEINLINE __vec4_i8 __masked_load_8(void *p, return __vec4_i8(r[0], r[1], r[2], r[3]); } -static FORCEINLINE __vec4_i16 __masked_load_16(void *p, - __vec4_i1 mask) { +static FORCEINLINE __vec4_i16 __masked_load_i16(void *p, __vec4_i1 mask) { int16_t r[4]; int16_t *ptr = (int16_t *)p; @@ -2459,8 +2457,7 @@ static FORCEINLINE __vec4_i16 __masked_load_16(void *p, return __vec4_i16(r[0], r[1], r[2], r[3]); } -static FORCEINLINE __vec4_i32 __masked_load_32(void *p, - __vec4_i1 mask) { +static FORCEINLINE __vec4_i32 __masked_load_i32(void *p, __vec4_i1 mask) { __m128i r = _mm_set_epi32(0, 0, 0, 0); int32_t *ptr = (int32_t *)p; uint32_t m = _mm_extract_ps(mask.v, 0); @@ -2482,8 +2479,7 @@ static FORCEINLINE __vec4_i32 __masked_load_32(void *p, return r; } -static FORCEINLINE __vec4_i64 __masked_load_64(void *p, - __vec4_i1 mask) { +static FORCEINLINE __vec4_i64 __masked_load_i64(void *p, __vec4_i1 mask) { uint64_t r[4]; uint64_t *ptr = (uint64_t *)p; uint32_t m = _mm_extract_ps(mask.v, 0); @@ -2505,8 +2501,8 @@ static FORCEINLINE __vec4_i64 __masked_load_64(void *p, return __vec4_i64(r[0], r[1], r[2], r[3]); } -static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val, - __vec4_i1 mask) { +static FORCEINLINE void __masked_store_i8(void *p, __vec4_i8 val, + __vec4_i1 mask) { int8_t *ptr = (int8_t *)p; uint32_t m = _mm_extract_ps(mask.v, 0); @@ -2526,8 +2522,8 @@ static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val, ptr[3] = _mm_extract_epi8(val.v, 3); } -static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val, - __vec4_i1 mask) { +static FORCEINLINE void __masked_store_i16(void *p, __vec4_i16 val, + __vec4_i1 mask) { int16_t *ptr = (int16_t *)p; uint32_t m = _mm_extract_ps(mask.v, 0); @@ -2547,8 +2543,8 @@ static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val, ptr[3] = _mm_extract_epi16(val.v, 3); } -static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val, - __vec4_i1 mask) { +static FORCEINLINE void __masked_store_i32(void *p, __vec4_i32 val, + __vec4_i1 mask) { int32_t *ptr = (int32_t *)p; uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) @@ -2567,8 +2563,8 @@ static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val, ptr[3] = _mm_extract_epi32(val.v, 3); } -static FORCEINLINE void 
__masked_store_64(void *p, __vec4_i64 val, - __vec4_i1 mask) { +static FORCEINLINE void __masked_store_i64(void *p, __vec4_i64 val, + __vec4_i1 mask) { int64_t *ptr = (int64_t *)p; uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) @@ -2587,26 +2583,29 @@ static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val, ptr[3] = _mm_extract_epi64(val.v[1], 1); } -static FORCEINLINE void __masked_store_blend_8(void *p, __vec4_i8 val, - __vec4_i1 mask) { - __masked_store_8(p, val, mask); } -static FORCEINLINE void __masked_store_blend_16(void *p, __vec4_i16 val, +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec4_i8 val, __vec4_i1 mask) { - __masked_store_16(p, val, mask); + __masked_store_i8(p, val, mask); } -static FORCEINLINE void __masked_store_blend_32(void *p, __vec4_i32 val, - __vec4_i1 mask) { +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec4_i16 val, + __vec4_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec4_i32 val, + __vec4_i1 mask) { // FIXME: do a load, blendvps, store here... - __masked_store_32(p, val, mask); + __masked_store_i32(p, val, mask); +} } -static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val, +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec4_i64 val, __vec4_i1 mask) { // FIXME: do a 2x (load, blendvps, store) here... - __masked_store_64(p, val, mask); + __masked_store_i64(p, val, mask); } /////////////////////////////////////////////////////////////////////////// diff --git a/opt.cpp b/opt.cpp index b8e1d3f9..df7e082e 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1877,18 +1877,18 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("MaskedStoreOpt"); MSInfo msInfo[] = { - MSInfo("__pseudo_masked_store_8", 1), - MSInfo("__pseudo_masked_store_16", 2), - MSInfo("__pseudo_masked_store_32", 4), - MSInfo("__pseudo_masked_store_64", 8), - MSInfo("__masked_store_blend_8", 1), - MSInfo("__masked_store_blend_16", 2), - MSInfo("__masked_store_blend_32", 4), - MSInfo("__masked_store_blend_64", 8), - MSInfo("__masked_store_8", 1), - MSInfo("__masked_store_16", 2), - MSInfo("__masked_store_32", 4), - MSInfo("__masked_store_64", 8) + MSInfo("__pseudo_masked_store_i8", 1), + MSInfo("__pseudo_masked_store_i16", 2), + MSInfo("__pseudo_masked_store_i32", 4), + MSInfo("__pseudo_masked_store_i64", 8), + MSInfo("__masked_store_blend_i8", 1), + MSInfo("__masked_store_blend_i16", 2), + MSInfo("__masked_store_blend_i32", 4), + MSInfo("__masked_store_blend_i64", 8), + MSInfo("__masked_store_i8", 1), + MSInfo("__masked_store_i16", 2), + MSInfo("__masked_store_i32", 4), + MSInfo("__masked_store_i64", 8), }; bool modifiedAny = false; @@ -1992,10 +1992,10 @@ MaskedLoadOptPass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("MaskedLoadOpt"); MLInfo mlInfo[] = { - MLInfo("__masked_load_8", 1), - MLInfo("__masked_load_16", 2), - MLInfo("__masked_load_32", 4), - MLInfo("__masked_load_64", 8) + MLInfo("__masked_load_i8", 1), + MLInfo("__masked_load_i16", 2), + MLInfo("__masked_load_i32", 4), + MLInfo("__masked_load_i64", 8), }; bool modifiedAny = false; @@ -2141,14 +2141,14 @@ PseudoMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("PseudoMaskedStorePass"); LMSInfo msInfo[] = { - LMSInfo("__pseudo_masked_store_8", "__masked_store_blend_8", - "__masked_store_8"), - LMSInfo("__pseudo_masked_store_16", "__masked_store_blend_16", - "__masked_store_16"), - LMSInfo("__pseudo_masked_store_32", "__masked_store_blend_32", - 
"__masked_store_32"), - LMSInfo("__pseudo_masked_store_64", "__masked_store_blend_64", - "__masked_store_64") + LMSInfo("__pseudo_masked_store_i8", "__masked_store_blend_i8", + "__masked_store_i8"), + LMSInfo("__pseudo_masked_store_i16", "__masked_store_blend_i16", + "__masked_store_i16"), + LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32", + "__masked_store_i32"), + LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64", + "__masked_store_i64"), }; bool modifiedAny = false; @@ -2282,38 +2282,38 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { GatherImpInfo gInfo[] = { GatherImpInfo("__pseudo_gather_base_offsets32_8", "__load_and_broadcast_i8", - "__masked_load_8", 1), + "__masked_load_i8", 1), GatherImpInfo("__pseudo_gather_base_offsets32_16", "__load_and_broadcast_i16", - "__masked_load_16", 2), + "__masked_load_i16", 2), GatherImpInfo("__pseudo_gather_base_offsets32_32", "__load_and_broadcast_i32", - "__masked_load_32", 4), + "__masked_load_i32", 4), GatherImpInfo("__pseudo_gather_base_offsets32_64", "__load_and_broadcast_i64", - "__masked_load_64", 8), + "__masked_load_i64", 8), GatherImpInfo("__pseudo_gather_base_offsets64_8", "__load_and_broadcast_i8", - "__masked_load_8", 1), + "__masked_load_i8", 1), GatherImpInfo("__pseudo_gather_base_offsets64_16", "__load_and_broadcast_i16", - "__masked_load_16", 2), + "__masked_load_i16", 2), GatherImpInfo("__pseudo_gather_base_offsets64_32", "__load_and_broadcast_i32", - "__masked_load_32", 4), + "__masked_load_i32", 4), GatherImpInfo("__pseudo_gather_base_offsets64_64", "__load_and_broadcast_i64", - "__masked_load_64", 8) + "__masked_load_i64", 8) }; ScatterImpInfo sInfo[] = { - ScatterImpInfo("__pseudo_scatter_base_offsets32_8", "__pseudo_masked_store_8", + ScatterImpInfo("__pseudo_scatter_base_offsets32_8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets32_16", "__pseudo_masked_store_16", + ScatterImpInfo("__pseudo_scatter_base_offsets32_16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets32_32", "__pseudo_masked_store_32", + ScatterImpInfo("__pseudo_scatter_base_offsets32_32", "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_64", "__pseudo_masked_store_64", + ScatterImpInfo("__pseudo_scatter_base_offsets32_64", "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets64_8", "__pseudo_masked_store_8", + ScatterImpInfo("__pseudo_scatter_base_offsets64_8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets64_16", "__pseudo_masked_store_16", + ScatterImpInfo("__pseudo_scatter_base_offsets64_16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets64_32", "__pseudo_masked_store_32", + ScatterImpInfo("__pseudo_scatter_base_offsets64_32", "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets64_64", "__pseudo_masked_store_64", + ScatterImpInfo("__pseudo_scatter_base_offsets64_64", "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8) }; @@ -3815,14 +3815,14 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__gather_elt32_i32", "__gather_elt32_i64", "__gather_elt64_i8", "__gather_elt64_i16", "__gather_elt64_i32", "__gather_elt64_i64", - 
"__masked_load_8", "__masked_load_16", - "__masked_load_32", "__masked_load_64", - "__masked_store_8", "__masked_store_16", - "__masked_store_32", "__masked_store_64", - "__masked_store_blend_8", "__masked_store_blend_16", - "__masked_store_blend_32", "__masked_store_blend_64", "__load_and_broadcast_i8", "__load_and_broadcast_i16", "__load_and_broadcast_i32", "__load_and_broadcast_i64", + "__masked_load_i8", "__masked_load_i16", + "__masked_load_i32", "__masked_load_i64", + "__masked_store_i8", "__masked_store_i16", + "__masked_store_i32", "__masked_store_i64", + "__masked_store_blend_i8", "__masked_store_blend_i16", + "__masked_store_blend_i32", "__masked_store_blend_i64", "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16", "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64", "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16",