Added masked load optimization pass.

This pass handles masked loads whose mask is "all on" or "all off" appropriately. Also renamed the load_masked m4 macro and the __load_masked_* built-in functions to masked_load and __masked_load_*, for consistency with masked_store.
2012-01-04 11:51:26 -08:00
parent 75f18c7c66
commit 562d61caff
9 changed files with 152 additions and 46 deletions
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -385,13 +385,13 @@ load_and_broadcast(16, i32, 32)
 load_and_broadcast(16, i64, 64)

 ; no masked load instruction for i8 and i16 types??
-load_masked(16, i8,  8,  1)
-load_masked(16, i16, 16, 2)
+masked_load(16, i8,  8,  1)
+masked_load(16, i16, 16, 2)

 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
 
-define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
  %floatmask = bitcast <16 x i32> %mask to <16 x float>
  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -409,7 +409,7 @@ define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
 }


-define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
  ; double up masks, bitcast to doubles
  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -366,13 +366,13 @@ load_and_broadcast(8, i32, 32)
 load_and_broadcast(8, i64, 64)

 ; no masked load instruction for i8 and i16 types??
-load_masked(8, i8,  8,  1)
-load_masked(8, i16, 16, 2)
+masked_load(8, i8,  8,  1)
+masked_load(8, i16, 16, 2)

 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
 
-define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
  %floatmask = bitcast <8 x i32> %mask to <8 x float>
  %floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
  %retval = bitcast <8 x float> %floatval to <8 x i32>
@@ -380,7 +380,7 @@ define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
 }


-define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
  ; double up masks, bitcast to doubles
  %mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -175,10 +175,10 @@ load_and_broadcast(WIDTH, i16, 16)
 load_and_broadcast(WIDTH, i32, 32)
 load_and_broadcast(WIDTH, i64, 64)

-declare <WIDTH x i8> @__load_masked_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
-declare <WIDTH x i16> @__load_masked_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
-declare <WIDTH x i32> @__load_masked_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
-declare <WIDTH x i64> @__load_masked_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly

 declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
                                <WIDTH x i1>) nounwind 
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -429,10 +429,10 @@ load_and_broadcast(8, i16, 16)
 load_and_broadcast(8, i32, 32)
 load_and_broadcast(8, i64, 64)

-load_masked(8, i8,  8,  1)
-load_masked(8, i16, 16, 2)
-load_masked(8, i32, 32, 4)
-load_masked(8, i64, 64, 8)
+masked_load(8, i8,  8,  1)
+masked_load(8, i16, 16, 2)
+masked_load(8, i32, 32, 4)
+masked_load(8, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -556,10 +556,10 @@ load_and_broadcast(4, i16, 16)
 load_and_broadcast(4, i32, 32)
 load_and_broadcast(4, i64, 64)

-load_masked(4, i8,  8,  1)
-load_masked(4, i16, 16, 2)
-load_masked(4, i32, 32, 4)
-load_masked(4, i64, 64, 8)
+masked_load(4, i8,  8,  1)
+masked_load(4, i16, 16, 2)
+masked_load(4, i32, 32, 4)
+masked_load(4, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -356,10 +356,10 @@ load_and_broadcast(8, i16, 16)
 load_and_broadcast(8, i32, 32)
 load_and_broadcast(8, i64, 64)

-load_masked(8, i8,  8,  1)
-load_masked(8, i16, 16, 2)
-load_masked(8, i32, 32, 4)
-load_masked(8, i64, 64, 8)
+masked_load(8, i8,  8,  1)
+masked_load(8, i16, 16, 2)
+masked_load(8, i32, 32, 4)
+masked_load(8, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -455,10 +455,10 @@ load_and_broadcast(4, i16, 16)
 load_and_broadcast(4, i32, 32)
 load_and_broadcast(4, i64, 64)

-load_masked(4, i8,  8,  1)
-load_masked(4, i16, 16, 2)
-load_masked(4, i32, 32, 4)
-load_masked(4, i64, 64, 8)
+masked_load(4, i8,  8,  1)
+masked_load(4, i16, 16, 2)
+masked_load(4, i32, 32, 4)
+masked_load(4, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -2213,8 +2213,8 @@ define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alway
 ;; $3: suffix for function name (32, 64, ...)
 ;; $4: alignment for elements of type $2 (4, 8, ...)

-define(`load_masked', `
-define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
+define(`masked_load', `
+define <$1 x $2> @__masked_load_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
 entry:
  %mm = call i32 @__movmsk(<$1 x i32> %mask)