Improve naming of masked load/store instructions in builtins.

Now use _i32 suffixes rather than _32, etc.  Also cleaned up the m4
macros that generate these functions, using WIDTH to get the target
vector width instead of passing it as an explicit parameter.
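
For example, in the 8-wide targets the calls that were previously written as

    masked_load(8, i32, 32, 4)
    gen_masked_store(8, i32, 32)

are now written as

    masked_load(i32, 4)
    gen_masked_store(i32)

with the vector width coming from WIDTH and the function-name suffix derived
from the element type.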
Matt Pharr
2012-06-07 13:51:08 -07:00
parent 91d22d150f
commit b86d40091a
13 changed files with 299 additions and 308 deletions


@@ -359,13 +359,13 @@ load_and_broadcast(i32)
load_and_broadcast(i64)
; AVX has no masked load instruction for i8 and i16 types, so use the generic implementation
masked_load(16, i8, 8, 1)
masked_load(16, i16, 16, 2)
masked_load(i8, 1)
masked_load(i16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <16 x i32> %mask to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -383,7 +383,7 @@ define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
}
define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -424,15 +424,15 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(16, i8, 8)
gen_masked_store(16, i16, 16)
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
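; For illustration, a call (with hypothetical %floatmask and %val values)
; passes the pointer first, then the mask, then the value to store:
;   call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %floatmask, <8 x float> %val)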
define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
%ptr = bitcast <16 x i32> * %0 to i8 *
%val = bitcast <16 x i32> %1 to <16 x float>
%mask = bitcast <16 x i32> %2 to <16 x float>
@@ -454,8 +454,8 @@ define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
ret void
}
define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <16 x i64> * %0 to i8 *
%val = bitcast <16 x i64> %1 to <16 x double>
@@ -499,8 +499,8 @@ masked_store_blend_8_16_by_16()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
%oldValue = load <16 x i32>* %0, align 4
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
@@ -537,8 +537,8 @@ define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
<4 x double>) nounwind readnone
define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
%oldValue = load <16 x i64>* %ptr, align 8
%old = bitcast <16 x i64> %oldValue to <16 x double>
%old0d = shufflevector <16 x double> %old, <16 x double> undef,


@@ -340,13 +340,13 @@ load_and_broadcast(i32)
load_and_broadcast(i64)
; AVX has no masked load instruction for i8 and i16 types, so use the generic implementation
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(i8, 1)
masked_load(i16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <8 x i32> %mask to <8 x float>
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
%retval = bitcast <8 x float> %floatval to <8 x i32>
@@ -354,7 +354,7 @@ define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
}
define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -377,15 +377,15 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%ptr = bitcast <8 x i32> * %0 to i8 *
%val = bitcast <8 x i32> %1 to <8 x float>
%mask = bitcast <8 x i32> %2 to <8 x float>
@@ -393,8 +393,8 @@ define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
ret void
}
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <8 x i64> * %0 to i8 *
%val = bitcast <8 x i64> %1 to <8 x double>
@@ -418,14 +418,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
}
masked_store_blend_8_16_by_8()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
%oldValue = load <8 x i32>* %0, align 4
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
@@ -439,8 +438,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
%mask = bitcast <8 x i32> %i32mask to <8 x float>


@@ -13,10 +13,10 @@ aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(1, i8, 8)
gen_masked_store(1, i16, 16)
gen_masked_store(1, i32, 32)
gen_masked_store(1, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
@@ -26,10 +26,10 @@ load_and_broadcast(i16)
load_and_broadcast(i32)
load_and_broadcast(i64)
masked_load(1, i8, 8, 1)
masked_load(1, i16, 16, 2)
masked_load(1, i32, 32, 4)
masked_load(1, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(i64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
@@ -155,23 +155,23 @@ define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>,
define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i8> * %0, align 4
%newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
store <1 x i8> %newval, <1 x i8> * %0, align 4
ret void
}
define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>,
<1 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i16> * %0, align 4
%newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
store <1 x i16> %newval, <1 x i16> * %0, align 4
ret void
}
define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i32> * %0, align 4
%newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
@@ -179,8 +179,8 @@ define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
ret void
}
define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
<1 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i64> * %0, align 4
%newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
store <1 x i64> %newval, <1 x i64> * %0, align 4


@@ -231,36 +231,36 @@ declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
;; unaligned loads/loads+broadcasts
declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
load_and_broadcast(i8)
load_and_broadcast(i16)
load_and_broadcast(i32)
load_and_broadcast(i64)
declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
ifelse(LLVM_VERSION, `LLVM_3_0', `
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
', `
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i8> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
@@ -268,23 +268,23 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
ret void
}
define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind alwaysinline {
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i16> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
ret void
}
define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i32> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
ret void
}
define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v


@@ -434,14 +434,14 @@ reduce_equal(8)
;; unaligned loads/loads+broadcasts
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
load_and_broadcast(i8)
load_and_broadcast(i16)
load_and_broadcast(i32)
load_and_broadcast(i64)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(i64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
@@ -558,23 +558,23 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_blend_8_16_by_8()
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
%val = load <8 x i32> * %0, align 4
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
store <8 x i32> %newval, <8 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values


@@ -350,16 +350,16 @@ reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%val = load <4 x i32> * %0, align 4
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
store <4 x i32> %newval, <4 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
@@ -552,10 +552,10 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
@@ -565,10 +565,10 @@ load_and_broadcast(i16)
load_and_broadcast(i32)
load_and_broadcast(i64)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(i64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter


@@ -360,15 +360,14 @@ reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
load_and_broadcast(i8)
load_and_broadcast(i16)
load_and_broadcast(i32)
load_and_broadcast(i64)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(i64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
@@ -444,18 +443,18 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_blend_8_16_by_8()
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
; do two 4-wide blends with blendvps
%mask_as_float = bitcast <8 x i32> %mask to <8 x float>
%mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
@@ -484,8 +483,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
; implement this as 4 blends of <4 x i32>s, which are actually bitcast
; <2 x i64>s...


@@ -384,8 +384,8 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
%oldValue = load <4 x i32>* %0, align 4
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
@@ -399,8 +399,8 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
%mask = bitcast <4 x i32> %i32mask to <4 x float>
@@ -451,10 +451,10 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
@@ -464,10 +464,10 @@ load_and_broadcast(i16)
load_and_broadcast(i32)
load_and_broadcast(i64)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(i64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter


@@ -1549,19 +1549,19 @@ declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
; This function declares placeholder masked store functions for the
; front-end to use.
;
; void __pseudo_masked_store_8 (uniform int8 *ptr, varying int8 values, mask)
; void __pseudo_masked_store_16(uniform int16 *ptr, varying int16 values, mask)
; void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask)
; void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask)
; void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask)
; void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask)
; void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask)
; void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask)
;
; These in turn are converted to native masked stores or to regular
; stores (if the mask is all on) by the MaskedStoreOptPass optimization
; pass.
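; As a sketch (hypothetical %p, %v, and %mask values on an 8-wide target),
; the pass rewrites
;   call void @__pseudo_masked_store_i32(<8 x i32> * %p, <8 x i32> %v, <8 x i32> %mask)
; into a regular store
;   store <8 x i32> %v, <8 x i32> * %p
; when %mask is known to be all on, and into a call to the target's
; __masked_store_i32() otherwise.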
declare void @__pseudo_masked_store_8(<WIDTH x i8> * nocapture, <WIDTH x i8>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_16(<WIDTH x i16> * nocapture, <WIDTH x i16>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_32(<WIDTH x i32> * nocapture, <WIDTH x i32>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_i8(<WIDTH x i8> * nocapture, <WIDTH x i8>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_i16(<WIDTH x i16> * nocapture, <WIDTH x i16>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_i32(<WIDTH x i32> * nocapture, <WIDTH x i32>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>, <WIDTH x MASK>)
; Declare the pseudo-gather functions. When the ispc front-end needs
; to perform a gather, it generates a call to one of these functions,
@@ -1692,13 +1692,13 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
<WIDTH x MASK> %mask) {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; loads
%ml8 = call <WIDTH x i8> @__masked_load_8(i8 * %ptr, <WIDTH x MASK> %mask)
%ml8 = call <WIDTH x i8> @__masked_load_i8(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %ml8)
%ml16 = call <WIDTH x i16> @__masked_load_16(i8 * %ptr, <WIDTH x MASK> %mask)
%ml16 = call <WIDTH x i16> @__masked_load_i16(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %ml16)
%ml32 = call <WIDTH x i32> @__masked_load_32(i8 * %ptr, <WIDTH x MASK> %mask)
%ml32 = call <WIDTH x i32> @__masked_load_i32(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %ml32)
%ml64 = call <WIDTH x i64> @__masked_load_64(i8 * %ptr, <WIDTH x MASK> %mask)
%ml64 = call <WIDTH x i64> @__masked_load_i64(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %ml64)
%lb8 = call <WIDTH x i8> @__load_and_broadcast_i8(i8 * %ptr, <WIDTH x MASK> %mask)
@@ -1713,31 +1713,29 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; stores
%pv8 = bitcast i8 * %ptr to <WIDTH x i8> *
call void @__pseudo_masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
<WIDTH x MASK> %mask)
call void @__pseudo_masked_store_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
<WIDTH x MASK> %mask)
%pv16 = bitcast i8 * %ptr to <WIDTH x i16> *
call void @__pseudo_masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
<WIDTH x MASK> %mask)
call void @__pseudo_masked_store_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
<WIDTH x MASK> %mask)
%pv32 = bitcast i8 * %ptr to <WIDTH x i32> *
call void @__pseudo_masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__pseudo_masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
%pv64 = bitcast i8 * %ptr to <WIDTH x i64> *
call void @__pseudo_masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
call void @__pseudo_masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__masked_store_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__masked_store_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__masked_store_blend_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
<WIDTH x MASK> %mask)
call void @__masked_store_blend_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
<WIDTH x MASK> %mask)
call void @__masked_store_blend_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__masked_store_blend_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__masked_store_blend_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
<WIDTH x MASK> %mask)
call void @__masked_store_blend_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
<WIDTH x MASK> %mask)
call void @__masked_store_blend_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__masked_store_blend_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gathers
@@ -2507,15 +2505,13 @@ define <WIDTH x $1> @__load_and_broadcast_$1(i8 *, <WIDTH x MASK> %mask) nounwin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Emit general-purpose code to do a masked load for targets that don't have
;; an instruction to do that. Parameters:
;; $1: target vector width
;; $2: element type for which to emit the function (i32, i64, ...)
;; $3: suffix for function name (32, 64, ...)
;; $4: alignment for elements of type $2 (4, 8, ...)
;; $1: element type for which to emit the function (i32, i64, ...) (and suffix for function name)
;; $2: alignment for elements of type $1 (4, 8, ...)
define(`masked_load', `
define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline {
define <WIDTH x $1> @__masked_load_$1(i8 *, <WIDTH x MASK> %mask) nounwind alwaysinline {
entry:
%mm = call i64 @__movmsk(<$1 x MASK> %mask)
%mm = call i64 @__movmsk(<WIDTH x MASK> %mask)
; if the first lane and the last lane are on, then it is safe to do a vector load
; of the whole thing; what the lanes in the middle want doesn't matter, since the
; memory between two accessible addresses must itself be accessible...
@@ -2531,14 +2527,14 @@ entry:
%can_vload_maybe_fast = or i1 %fast_i1, %can_vload
; if we are not able to do a single vload, we will accumulate lanes in this memory..
%retptr = alloca <$1 x $2>
%retptr32 = bitcast <$1 x $2> * %retptr to $2 *
%retptr = alloca <WIDTH x $1>
%retptr32 = bitcast <WIDTH x $1> * %retptr to $1 *
br i1 %can_vload_maybe_fast, label %load, label %loop
load:
%ptr = bitcast i8 * %0 to <$1 x $2> *
%valall = load <$1 x $2> * %ptr, align $4
ret <$1 x $2> %valall
%ptr = bitcast i8 * %0 to <WIDTH x $1> *
%valall = load <WIDTH x $1> * %ptr, align $2
ret <WIDTH x $1> %valall
loop:
; loop over the lanes and see if each one is on...
@@ -2552,21 +2548,21 @@ loop:
load_lane:
; yes! do the load and store the result into the appropriate place in the
; allocated memory above
%ptr32 = bitcast i8 * %0 to $2 *
%lane_ptr = getelementptr $2 * %ptr32, i32 %lane
%val = load $2 * %lane_ptr
%store_ptr = getelementptr $2 * %retptr32, i32 %lane
store $2 %val, $2 * %store_ptr
%ptr32 = bitcast i8 * %0 to $1 *
%lane_ptr = getelementptr $1 * %ptr32, i32 %lane
%val = load $1 * %lane_ptr
%store_ptr = getelementptr $1 * %retptr32, i32 %lane
store $1 %val, $1 * %store_ptr
br label %lane_done
lane_done:
%next_lane = add i32 %lane, 1
%done = icmp eq i32 %lane, eval($1-1)
%done = icmp eq i32 %lane, eval(WIDTH-1)
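; (eval(WIDTH-1) is evaluated by m4 when the target file is generated,
; e.g. to 7 for an 8-wide target.)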
br i1 %done, label %return, label %loop
return:
%r = load <$1 x $2> * %retptr
ret <$1 x $2> %r
%r = load <WIDTH x $1> * %retptr
ret <WIDTH x $1> %r
}
')
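; For example, masked_load(i32, 4) emits @__masked_load_i32() at the target's
; WIDTH, with the fast-path vector load done at the 4-byte element alignment.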
@@ -2574,23 +2570,21 @@ return:
;; masked store
;; emit code to do masked store as a set of per-lane scalar stores
;; parameters:
;; $1: target vector width
;; $2: llvm type of elements
;; $3: suffix for function name
;; $1: llvm type of elements (and suffix for function name)
define(`gen_masked_store', `
define void @__masked_store_$3(<$1 x $2>* nocapture, <$1 x $2>, <$1 x i32>) nounwind alwaysinline {
per_lane($1, <$1 x i32> %2, `
%ptr_LANE_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
%storeval_LANE_ID = extractelement <$1 x $2> %1, i32 LANE
store $2 %storeval_LANE_ID, $2 * %ptr_LANE_ID')
define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x i32>) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x i32> %2, `
%ptr_LANE_ID = getelementptr <WIDTH x $1> * %0, i32 0, i32 LANE
%storeval_LANE_ID = extractelement <WIDTH x $1> %1, i32 LANE
store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID')
ret void
}
')
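; For example, gen_masked_store(i32) defines @__masked_store_i32(); for each
; lane that per_lane() finds on in the mask, the body above is emitted with
; LANE and LANE_ID substituted, roughly as follows for lane 0:
;   %ptr_0 = getelementptr <WIDTH x i32> * %0, i32 0, i32 0
;   %storeval_0 = extractelement <WIDTH x i32> %1, i32 0
;   store i32 %storeval_0, i32 * %ptr_0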
define(`masked_store_blend_8_16_by_4', `
define void @__masked_store_blend_8(<4 x i8>* nocapture, <4 x i8>,
<4 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
<4 x i32>) nounwind alwaysinline {
%old = load <4 x i8> * %0, align 1
ifelse(LLVM_VERSION,LLVM_3_1svn,`
%m = trunc <4 x i32> %2 to <4 x i1>
@@ -2613,8 +2607,8 @@ define void @__masked_store_blend_8(<4 x i8>* nocapture, <4 x i8>,
ret void
}
define void @__masked_store_blend_16(<4 x i16>* nocapture, <4 x i16>,
<4 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
<4 x i32>) nounwind alwaysinline {
%old = load <4 x i16> * %0, align 2
ifelse(LLVM_VERSION,LLVM_3_1svn,`
%m = trunc <4 x i32> %2 to <4 x i1>
@@ -2639,8 +2633,8 @@ define void @__masked_store_blend_16(<4 x i16>* nocapture, <4 x i16>,
')
define(`masked_store_blend_8_16_by_8', `
define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
<8 x i32>) nounwind alwaysinline {
%old = load <8 x i8> * %0, align 1
ifelse(LLVM_VERSION,LLVM_3_1svn,`
%m = trunc <8 x i32> %2 to <8 x i1>
@@ -2663,8 +2657,8 @@ define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
ret void
}
define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>,
<8 x i32>) nounwind alwaysinline {
%old = load <8 x i16> * %0, align 2
ifelse(LLVM_VERSION,LLVM_3_1svn,`
%m = trunc <8 x i32> %2 to <8 x i1>
@@ -2690,8 +2684,8 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
define(`masked_store_blend_8_16_by_16', `
define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
<16 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
<16 x i32>) nounwind alwaysinline {
%old = load <16 x i8> * %0, align 1
ifelse(LLVM_VERSION,LLVM_3_1svn,`
%m = trunc <16 x i32> %2 to <16 x i1>
@@ -2714,8 +2708,8 @@ define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
ret void
}
define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
<16 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
<16 x i32>) nounwind alwaysinline {
%old = load <16 x i16> * %0, align 2
ifelse(LLVM_VERSION,LLVM_3_1svn,`
%m = trunc <16 x i32> %2 to <16 x i1>
@@ -2895,7 +2889,7 @@ domixed:
store <$1 x $2> %basesmear, <$1 x $2> * %ptr
%castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> *
%castv = bitcast <$1 x $2> %v to <$1 x $4>
call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask)
call void @__masked_store_blend_i$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask)
%blendvec = load <$1 x $2> * %ptr
br label %check_neighbors
@@ -2970,8 +2964,8 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v,
store <$1 x $2> %idvec, <$1 x $2> * %ptr
%ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> *
%vi = bitcast <$1 x $2> %v to <$1 x i`'$3>
call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi,
<$1 x MASK> %mask)
call void @__masked_store_blend_i$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi,
<$1 x MASK> %mask)
%v_id = load <$1 x $2> * %ptr
; extract elements of the vector to use in computing the scan
@@ -3144,14 +3138,14 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
; Set the offset to zero for lanes that are off
%offsetsPtr = alloca <$1 x i32>
store <$1 x i32> zeroinitializer, <$1 x i32> * %offsetsPtr
call void @__masked_store_blend_32(<$1 x i32> * %offsetsPtr, <$1 x i32> %offsets,
<$1 x i32> %vecmask)
call void @__masked_store_blend_i32(<$1 x i32> * %offsetsPtr, <$1 x i32> %offsets,
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i32> * %offsetsPtr
%deltaPtr = alloca <$1 x i32>
store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr
call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
<$1 x i32> %vecmask)
call void @__masked_store_blend_i32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i32> * %deltaPtr
%ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
@@ -3175,14 +3169,14 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32
; Set the offset to zero for lanes that are off
%offsetsPtr = alloca <$1 x i64>
store <$1 x i64> zeroinitializer, <$1 x i64> * %offsetsPtr
call void @__masked_store_blend_64(<$1 x i64> * %offsetsPtr, <$1 x i64> %offsets,
<$1 x i32> %vecmask)
call void @__masked_store_blend_i64(<$1 x i64> * %offsetsPtr, <$1 x i64> %offsets,
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i64> * %offsetsPtr
%deltaPtr = alloca <$1 x i64>
store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr
call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
<$1 x i32> %vecmask)
call void @__masked_store_blend_i64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i64> * %deltaPtr
%ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,