Add separate variants of memory built-ins for floats and doubles.

Previously, we'd bitcast e.g. a vector of floats to a vector of i32s and then use the i32 variant of masked_load/masked_store/gather/scatter. Now, we have separate float/double variants of each of those.
2012-06-07 14:29:17 -07:00
parent 1ac3e03171
commit 89a2566e01
17 changed files with 593 additions and 41 deletions
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -356,7 +356,9 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
+load_and_broadcast(float)
 load_and_broadcast(i64)
+load_and_broadcast(double)

 ; no masked load instruction for i8 and i16 types??
 masked_load(i8,  1)
@@ -417,6 +419,7 @@ define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinli
  ret <16 x i64> %val
 }

+masked_load_float_double()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
@@ -493,6 +496,7 @@ define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
  ret void
 }

+masked_store_float_double()

 masked_store_blend_8_16_by_16()

@@ -601,7 +605,9 @@ define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %ne
 gen_scatter(i8)
 gen_scatter(i16)
 gen_scatter(i32)
+gen_scatter(float)
 gen_scatter(i64)
+gen_scatter(double)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -337,7 +337,9 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
+load_and_broadcast(float)
 load_and_broadcast(i64)
+load_and_broadcast(double)

 ; no masked load instruction for i8 and i16 types??
 masked_load(i8,  1)
@@ -373,6 +375,7 @@ define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline
  ret <8 x i64> %val
 }

+masked_load_float_double()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
@@ -488,6 +491,7 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
  ret void
 }

+masked_store_float_double()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; scatter
@@ -495,7 +499,9 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 gen_scatter(i8)
 gen_scatter(i16)
 gen_scatter(i32)
+gen_scatter(float)
 gen_scatter(i64)
+gen_scatter(double)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt
--- a/builtins/target-avx1-x2.ll
+++ b/builtins/target-avx1-x2.ll
@@ -72,4 +72,6 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
 gen_gather(i8)
 gen_gather(i16)
 gen_gather(i32)
+gen_gather(float)
 gen_gather(i64)
+gen_gather(double)
--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -72,4 +72,6 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
 gen_gather(i8)
 gen_gather(i16)
 gen_gather(i32)
+gen_gather(float)
 gen_gather(i64)
+gen_gather(double)
--- a/builtins/target-avx2-x2.ll
+++ b/builtins/target-avx2-x2.ll
@@ -124,4 +124,6 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
 gen_gather(i8)
 gen_gather(i16)
 gen_gather(i32)
+gen_gather(float)
 gen_gather(i64)
+gen_gather(double)
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -107,4 +107,6 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
 gen_gather(i8)
 gen_gather(i16)
 gen_gather(i32)
+gen_gather(float)
 gen_gather(i64)
+gen_gather(double)
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -24,12 +24,16 @@ gen_masked_store(i64)
 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
+load_and_broadcast(float)
 load_and_broadcast(i64)
+load_and_broadcast(double)

 masked_load(i8,  1)
 masked_load(i16, 2)
 masked_load(i32, 4)
+masked_load(float, 4)
 masked_load(i64, 8)
+masked_load(double, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -39,12 +43,16 @@ masked_load(i64, 8)
 gen_gather(i8)
 gen_gather(i16)
 gen_gather(i32)
+gen_gather(float)
 gen_gather(i64)
+gen_gather(double)

 gen_scatter(i8)
 gen_scatter(i16)
 gen_scatter(i32)
+gen_scatter(float)
 gen_scatter(i64)
+gen_scatter(double)


 define  <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
@@ -187,6 +195,8 @@ define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
  ret void
 }

+masked_store_float_double()
+
 define  i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
  %item = extractelement <1 x i32> %0, i32 0
  %v = lshr i32 %item, 31
@@ -933,4 +943,3 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
 declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
 declare i16 @__float_to_half_uniform(float %v) nounwind readnone
 declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
-
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -230,25 +230,32 @@ declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-
-
 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
+load_and_broadcast(float)
 load_and_broadcast(i64)
+load_and_broadcast(double)

 declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
 declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
 declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
 declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+
 declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
                                <WIDTH x i1>) nounwind 
 declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
                                 <WIDTH x i1>) nounwind 
 declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
                                 <WIDTH x i1>) nounwind 
+declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>, 
+                                   <WIDTH x i1>) nounwind 
 declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
                                 <WIDTH x i1> %mask) nounwind 
+declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
+                                    <WIDTH x i1> %mask) nounwind 

 ifelse(LLVM_VERSION, `LLVM_3_0', `
 declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
@@ -257,8 +264,12 @@ declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
                                       <WIDTH x i1>) nounwind 
 declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
                                       <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>, 
+                                       <WIDTH x i1>) nounwind 
 declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
                                       <WIDTH x i1> %mask) nounwind 
+declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
+                                       <WIDTH x i1> %mask) nounwind 
 ', `
 define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
                                     <WIDTH x i1>) nounwind alwaysinline {
@@ -284,6 +295,14 @@ define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
  ret void
 }

+define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>, 
+                                        <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x float> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
+  store <WIDTH x float> %v1, <WIDTH x float> * %0
+  ret void
+}
+
 define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
                            <WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
  %v = load <WIDTH x i64> * %0
@@ -291,6 +310,14 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
  ret void
 }
+
+define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
+                            <WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x double> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
+  store <WIDTH x double> %v1, <WIDTH x double> * %0
+  ret void
+}
 ')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -319,7 +346,9 @@ declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
 gather_scatter(i8)
 gather_scatter(i16)
 gather_scatter(i32)
+gather_scatter(float)
 gather_scatter(i64)
+gather_scatter(double)

 declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
                                  <WIDTH x i1>) nounwind
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -433,15 +433,19 @@ reduce_equal(8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-
 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
+load_and_broadcast(float)
 load_and_broadcast(i64)
+load_and_broadcast(double)
+
 masked_load(i8,  1)
 masked_load(i16, 2)
 masked_load(i32, 4)
+masked_load(float, 4)
 masked_load(i64, 8)
+masked_load(double, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -449,12 +453,16 @@ masked_load(i64, 8)
 gen_gather(i8)
 gen_gather(i16)
 gen_gather(i32)
+gen_gather(float)
 gen_gather(i64)
+gen_gather(double)

 gen_scatter(i8)
 gen_scatter(i16)
 gen_scatter(i32)
+gen_scatter(float)
 gen_scatter(i64)
+gen_scatter(double)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float rounding
@@ -617,6 +625,8 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
  ret void
 }

+masked_store_float_double()
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt

--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -401,6 +401,8 @@ define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
 }


+masked_store_float_double()
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

@@ -563,12 +565,16 @@ gen_masked_store(i64)
 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
+load_and_broadcast(float)
 load_and_broadcast(i64)
+load_and_broadcast(double)

 masked_load(i8,  1)
 masked_load(i16, 2)
 masked_load(i32, 4)
+masked_load(float, 4)
 masked_load(i64, 8)
+masked_load(double, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -578,9 +584,13 @@ masked_load(i64, 8)
 gen_gather(i8)
 gen_gather(i16)
 gen_gather(i32)
+gen_gather(float)
 gen_gather(i64)
+gen_gather(double)

 gen_scatter(i8)
 gen_scatter(i16)
 gen_scatter(i32)
+gen_scatter(float)
 gen_scatter(i64)
+gen_scatter(double)
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -363,11 +363,16 @@ reduce_equal(8)
 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
+load_and_broadcast(float)
 load_and_broadcast(i64)
+load_and_broadcast(double)
+
 masked_load(i8,  1)
 masked_load(i16, 2)
 masked_load(i32, 4)
+masked_load(float, 4)
 masked_load(i64, 8)
+masked_load(double, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -375,12 +380,16 @@ masked_load(i64, 8)
 gen_gather(i8)
 gen_gather(i16)
 gen_gather(i32)
+gen_gather(float)
 gen_gather(i64)
+gen_gather(double)

 gen_scatter(i8)
 gen_scatter(i16)
 gen_scatter(i32)
+gen_scatter(float)
 gen_scatter(i64)
+gen_scatter(double)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float rounding
@@ -550,6 +559,7 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
  ret void
 }

+masked_store_float_double()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -456,18 +456,24 @@ gen_masked_store(i16)
 gen_masked_store(i32)
 gen_masked_store(i64)

+masked_store_float_double()
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
+load_and_broadcast(float)
 load_and_broadcast(i64)
+load_and_broadcast(double)

 masked_load(i8,  1)
 masked_load(i16, 2)
 masked_load(i32, 4)
+masked_load(float, 4)
 masked_load(i64, 8)
+masked_load(double, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -477,9 +483,13 @@ masked_load(i64, 8)
 gen_gather(i8)
 gen_gather(i16)
 gen_gather(i32)
+gen_gather(float)
 gen_gather(i64)
+gen_gather(double)

 gen_scatter(i8)
 gen_scatter(i16)
 gen_scatter(i32)
+gen_scatter(float)
 gen_scatter(i64)
+gen_scatter(double)
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1533,6 +1533,63 @@ define void

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+define(`masked_load_float_double', `
+define <WIDTH x float> @__masked_load_float(i8 * %ptr,
+                                             <WIDTH x MASK> %mask) readonly alwaysinline {
+  %v32 = call <WIDTH x i32> @__masked_load_i32(i8 * %ptr, <WIDTH x MASK> %mask)
+  %vf = bitcast <WIDTH x i32> %v32 to <WIDTH x float>
+  ret <WIDTH x float> %vf
+}
+
+define <WIDTH x double> @__masked_load_double(i8 * %ptr,
+                                             <WIDTH x MASK> %mask) readonly alwaysinline {
+  %v64 = call <WIDTH x i64> @__masked_load_i64(i8 * %ptr, <WIDTH x MASK> %mask)
+  %vd = bitcast <WIDTH x i64> %v64 to <WIDTH x double>
+  ret <WIDTH x double> %vd
+}
+
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define(`masked_store_float_double', `
+define void @__masked_store_float(<WIDTH x float> * nocapture, <WIDTH x float>,
+                                  <WIDTH x MASK>) nounwind alwaysinline {
+  %ptr = bitcast <WIDTH x float> * %0 to <WIDTH x i32> *
+  %val = bitcast <WIDTH x float> %1 to <WIDTH x i32>
+  call void @__masked_store_i32(<WIDTH x i32> * %ptr, <WIDTH x i32> %val, <WIDTH x MASK> %2)
+  ret void
+}
+
+
+define void @__masked_store_double(<WIDTH x double> * nocapture, <WIDTH x double>,
+                                   <WIDTH x MASK>) nounwind alwaysinline {
+  %ptr = bitcast <WIDTH x double> * %0 to <WIDTH x i64> *
+  %val = bitcast <WIDTH x double> %1 to <WIDTH x i64>
+  call void @__masked_store_i64(<WIDTH x i64> * %ptr, <WIDTH x i64> %val, <WIDTH x MASK> %2)
+  ret void
+}
+
+define void @__masked_store_blend_float(<WIDTH x float> * nocapture, <WIDTH x float>,
+                                        <WIDTH x MASK>) nounwind alwaysinline {
+  %ptr = bitcast <WIDTH x float> * %0 to <WIDTH x i32> *
+  %val = bitcast <WIDTH x float> %1 to <WIDTH x i32>
+  call void @__masked_store_blend_i32(<WIDTH x i32> * %ptr, <WIDTH x i32> %val, <WIDTH x MASK> %2)
+  ret void
+}
+
+
+define void @__masked_store_blend_double(<WIDTH x double> * nocapture, <WIDTH x double>,
+                                         <WIDTH x MASK>) nounwind alwaysinline {
+  %ptr = bitcast <WIDTH x double> * %0 to <WIDTH x i64> *
+  %val = bitcast <WIDTH x double> %1 to <WIDTH x i64>
+  call void @__masked_store_blend_i64(<WIDTH x i64> * %ptr, <WIDTH x i64> %val, <WIDTH x MASK> %2)
+  ret void
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+

 define(`stdlib_core', `

@@ -1552,7 +1609,9 @@ declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
 ;  void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask)
 ;  void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask)
 ;  void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask)
+;  void __pseudo_masked_store_float(uniform float *ptr, varying float values, mask)
 ;  void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask)
+;  void __pseudo_masked_store_double(uniform double *ptr, varying double values, mask)
 ;
 ;  These in turn are converted to native masked stores or to regular
 ;  stores (if the mask is all on) by the MaskedStoreOptPass optimization
@@ -1561,7 +1620,9 @@ declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
 declare void @__pseudo_masked_store_i8(<WIDTH x i8> * nocapture, <WIDTH x i8>, <WIDTH x MASK>)
 declare void @__pseudo_masked_store_i16(<WIDTH x i16> * nocapture, <WIDTH x i16>, <WIDTH x MASK>)
 declare void @__pseudo_masked_store_i32(<WIDTH x i32> * nocapture, <WIDTH x i32>, <WIDTH x MASK>)
+declare void @__pseudo_masked_store_float(<WIDTH x float> * nocapture, <WIDTH x float>, <WIDTH x MASK>)
 declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>, <WIDTH x MASK>)
+declare void @__pseudo_masked_store_double(<WIDTH x double> * nocapture, <WIDTH x double>, <WIDTH x MASK>)

 ; Declare the pseudo-gather functions.  When the ispc front-end needs
 ; to perform a gather, it generates a call to one of these functions,
@@ -1570,7 +1631,9 @@ declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>
 ; varying int8  __pseudo_gather_i8(varying int8 *, mask)
 ; varying int16 __pseudo_gather_i16(varying int16 *, mask)
 ; varying int32 __pseudo_gather_i32(varying int32 *, mask)
+; varying float __pseudo_gather_float(varying float *, mask)
 ; varying int64 __pseudo_gather_i64(varying int64 *, mask)
+; varying double __pseudo_gather_double(varying double *, mask)
 ;
 ; The GatherScatterFlattenOpt optimization pass finds these calls and then 
 ; converts them to make calls to the following functions (when appropriate); 
@@ -1582,8 +1645,8 @@ declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>
 ; that use the free 2/4/8 scaling available in x86 addressing calculations, and
 ; offset_delta feeds into the free offset calculation. 
 ;
-; varying int{8,16,32,64}
-; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base,
+; varying int{8,16,32,float,64,double}
+; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
 ;                                    int{32,64} offsets, uniform int32 offset_scale, 
 ;                                    int{32,64} offset_delta, mask)
 ;
@@ -1594,12 +1657,16 @@ declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>
 declare <WIDTH x i8>  @__pseudo_gather32_i8(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i16> @__pseudo_gather32_i16(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i32> @__pseudo_gather32_i32(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
+declare <WIDTH x float> @__pseudo_gather32_float(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i64> @__pseudo_gather32_i64(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
+declare <WIDTH x double> @__pseudo_gather32_double(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly

 declare <WIDTH x i8>  @__pseudo_gather64_i8(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i16> @__pseudo_gather64_i16(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i32> @__pseudo_gather64_i32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
+declare <WIDTH x float> @__pseudo_gather64_float(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i64> @__pseudo_gather64_i64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
+declare <WIDTH x double> @__pseudo_gather64_double(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly

 declare <WIDTH x i8>  @__pseudo_gather_base_offsets32_i8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                        <WIDTH x MASK>) nounwind readonly
@@ -1607,8 +1674,12 @@ declare <WIDTH x i16> @__pseudo_gather_base_offsets32_i16(i8 *, <WIDTH x i32>, i
                                                         <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i32> @__pseudo_gather_base_offsets32_i32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                          <WIDTH x MASK>) nounwind readonly
+declare <WIDTH x float> @__pseudo_gather_base_offsets32_float(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
+                                                          <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i64> @__pseudo_gather_base_offsets32_i64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                          <WIDTH x MASK>) nounwind readonly
+declare <WIDTH x double> @__pseudo_gather_base_offsets32_double(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
+                                                          <WIDTH x MASK>) nounwind readonly

 declare <WIDTH x i8>  @__pseudo_gather_base_offsets64_i8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                         <WIDTH x MASK>) nounwind readonly
@@ -1616,8 +1687,12 @@ declare <WIDTH x i16> @__pseudo_gather_base_offsets64_i16(i8 *, <WIDTH x i64>, i
                                                          <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i32> @__pseudo_gather_base_offsets64_i32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                          <WIDTH x MASK>) nounwind readonly
+declare <WIDTH x float> @__pseudo_gather_base_offsets64_float(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
+                                                          <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                          <WIDTH x MASK>) nounwind readonly
+declare <WIDTH x double> @__pseudo_gather_base_offsets64_double(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
+                                                                <WIDTH x MASK>) nounwind readonly

 ; Similarly to the pseudo-gathers defined above, we also declare undefined
 ; pseudo-scatter instructions with signatures:
@@ -1625,7 +1700,9 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i
 ; void __pseudo_scatter_i8 (varying int8 *, varying int8 values, mask)
 ; void __pseudo_scatter_i16(varying int16 *, varying int16 values, mask)
 ; void __pseudo_scatter_i32(varying int32 *, varying int32 values, mask)
+; void __pseudo_scatter_float(varying float *, varying float values, mask)
 ; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask)
+; void __pseudo_scatter_double(varying double *, varying double values, mask)
 ;
 ; The GatherScatterFlattenOpt optimization pass also finds these and
 ; transforms them to scatters like:
@@ -1641,12 +1718,16 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i
 declare void @__pseudo_scatter32_i8(<WIDTH x i32>, <WIDTH x i8>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter32_i16(<WIDTH x i32>, <WIDTH x i16>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter32_i32(<WIDTH x i32>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
+declare void @__pseudo_scatter32_float(<WIDTH x i32>, <WIDTH x float>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter32_i64(<WIDTH x i32>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
+declare void @__pseudo_scatter32_double(<WIDTH x i32>, <WIDTH x double>, <WIDTH x MASK>) nounwind

 declare void @__pseudo_scatter64_i8(<WIDTH x i64>, <WIDTH x i8>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter64_i16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter64_i32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
+declare void @__pseudo_scatter64_float(<WIDTH x i64>, <WIDTH x float>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter64_i64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
+declare void @__pseudo_scatter64_double(<WIDTH x i64>, <WIDTH x double>, <WIDTH x MASK>) nounwind

 declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                 <WIDTH x i8>, <WIDTH x MASK>) nounwind
@@ -1654,8 +1735,12 @@ declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, <WIDTH x i32>,
                                                  <WIDTH x i16>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                  <WIDTH x i32>, <WIDTH x MASK>) nounwind
+declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
+                                                  <WIDTH x float>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                  <WIDTH x i64>, <WIDTH x MASK>) nounwind
+declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
+                                                   <WIDTH x double>, <WIDTH x MASK>) nounwind

 declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                 <WIDTH x i8>, <WIDTH x MASK>) nounwind
@@ -1663,8 +1748,12 @@ declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, <WIDTH x i64>,
                                                  <WIDTH x i16>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                  <WIDTH x i32>, <WIDTH x MASK>) nounwind
+declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
+                                                  <WIDTH x float>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                  <WIDTH x i64>, <WIDTH x MASK>) nounwind
+declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
+                                                     <WIDTH x double>, <WIDTH x MASK>) nounwind

 declare float @__log_uniform_float(float) nounwind readnone
 declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone
@@ -1678,7 +1767,9 @@ declare <WIDTH x float> @__pow_varying_float(<WIDTH x float>, <WIDTH x float>) n
 declare void @__use8(<WIDTH x i8>)
 declare void @__use16(<WIDTH x i16>)
 declare void @__use32(<WIDTH x i32>)
+declare void @__usefloat(<WIDTH x float>)
 declare void @__use64(<WIDTH x i64>)
+declare void @__usedouble(<WIDTH x double>)

 ;; This is a temporary function that will be removed at the end of
 ;; compilation--the idea is that it calls out to all of the various
@@ -1698,8 +1789,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
  call void @__use16(<WIDTH x i16> %ml16)
  %ml32 = call <WIDTH x i32> @__masked_load_i32(i8 * %ptr, <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %ml32)
+  %mlf = call <WIDTH x float> @__masked_load_float(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %mlf)
  %ml64 = call <WIDTH x i64> @__masked_load_i64(i8 * %ptr, <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %ml64)
+  %mld = call <WIDTH x double> @__masked_load_double(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %mld)

  %lb8   = call <WIDTH x i8>  @__load_and_broadcast_i8(i8 * %ptr, <WIDTH x MASK> %mask)
  call void @__use8(<WIDTH x i8> %lb8)
@@ -1707,8 +1802,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
  call void @__use16(<WIDTH x i16> %lb16)
  %lb32  = call <WIDTH x i32> @__load_and_broadcast_i32(i8 * %ptr, <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %lb32)
+  %lbf  = call <WIDTH x float> @__load_and_broadcast_float(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %lbf)
  %lb64  = call <WIDTH x i64> @__load_and_broadcast_i64(i8 * %ptr, <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %lb64)
+  %lbd  = call <WIDTH x double> @__load_and_broadcast_double(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %lbd)

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; stores
@@ -1721,21 +1820,37 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
  %pv32 = bitcast i8 * %ptr to <WIDTH x i32> *
  call void @__pseudo_masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
                                       <WIDTH x MASK> %mask)
+  %vf = bitcast <WIDTH x i32> %v32 to <WIDTH x float>
+  %pvf = bitcast i8 * %ptr to <WIDTH x float> *
+  call void @__pseudo_masked_store_float(<WIDTH x float> * %pvf, <WIDTH x float> %vf,
+                                         <WIDTH x MASK> %mask)
  %pv64 = bitcast i8 * %ptr to <WIDTH x i64> *
  call void @__pseudo_masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
                                       <WIDTH x MASK> %mask)
+  %vd = bitcast <WIDTH x i64> %v64 to <WIDTH x double>
+  %pvd = bitcast i8 * %ptr to <WIDTH x double> *
+  call void @__pseudo_masked_store_double(<WIDTH x double> * %pvd, <WIDTH x double> %vd,
+                                         <WIDTH x MASK> %mask)
+
  call void @__masked_store_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
  call void @__masked_store_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
  call void @__masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__masked_store_float(<WIDTH x float> * %pvf, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
  call void @__masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__masked_store_double(<WIDTH x double> * %pvd, <WIDTH x double> %vd, <WIDTH x MASK> %mask)
+
  call void @__masked_store_blend_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
                                     <WIDTH x MASK> %mask)
  call void @__masked_store_blend_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
                                      <WIDTH x MASK> %mask)
  call void @__masked_store_blend_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
                                      <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_float(<WIDTH x float> * %pvf, <WIDTH x float> %vf,
+                                        <WIDTH x MASK> %mask)
  call void @__masked_store_blend_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
                                      <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_double(<WIDTH x double> * %pvd, <WIDTH x double> %vd,
+                                         <WIDTH x MASK> %mask)

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; gathers
@@ -1749,9 +1864,15 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
  %pg32_32 = call <WIDTH x i32>  @__pseudo_gather32_i32(<WIDTH x i32> %v32,
                                                        <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %pg32_32)
+  %pg32_f = call <WIDTH x float>  @__pseudo_gather32_float(<WIDTH x i32> %v32,
+                                                        <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %pg32_f)
  %pg32_64 = call <WIDTH x i64>  @__pseudo_gather32_i64(<WIDTH x i32> %v32,
                                                        <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %pg32_64)
+  %pg32_d = call <WIDTH x double>  @__pseudo_gather32_double(<WIDTH x i32> %v32,
+                                                        <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %pg32_d)

  %pg64_8 = call <WIDTH x i8>  @__pseudo_gather64_i8(<WIDTH x i64> %v64,
                                                     <WIDTH x MASK> %mask)
@@ -1762,9 +1883,15 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
  %pg64_32 = call <WIDTH x i32>  @__pseudo_gather64_i32(<WIDTH x i64> %v64,
                                                        <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %pg64_32)
+  %pg64_f = call <WIDTH x float>  @__pseudo_gather64_float(<WIDTH x i64> %v64,
+                                                        <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %pg64_f)
  %pg64_64 = call <WIDTH x i64>  @__pseudo_gather64_i64(<WIDTH x i64> %v64,
                                                        <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %pg64_64)
+  %pg64_d = call <WIDTH x double>  @__pseudo_gather64_double(<WIDTH x i64> %v64,
+                                                        <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %pg64_d)

  %g32_8 = call <WIDTH x i8>  @__gather32_i8(<WIDTH x i32> %v32,
                                             <WIDTH x MASK> %mask)
@@ -1775,9 +1902,15 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
  %g32_32 = call <WIDTH x i32>  @__gather32_i32(<WIDTH x i32> %v32,
                                                <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %g32_32)
+  %g32_f = call <WIDTH x float>  @__gather32_float(<WIDTH x i32> %v32,
+                                                <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %g32_f)
  %g32_64 = call <WIDTH x i64>  @__gather32_i64(<WIDTH x i32> %v32,
                                                <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %g32_64)
+  %g32_d = call <WIDTH x double>  @__gather32_double(<WIDTH x i32> %v32,
+                                                <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %g32_d)

  %g64_8 = call <WIDTH x i8>  @__gather64_i8(<WIDTH x i64> %v64,
                                             <WIDTH x MASK> %mask)
@@ -1788,9 +1921,15 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
  %g64_32 = call <WIDTH x i32>  @__gather64_i32(<WIDTH x i64> %v64,
                                                <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %g64_32)
+  %g64_f = call <WIDTH x float>  @__gather64_float(<WIDTH x i64> %v64,
+                                                <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %g64_f)
  %g64_64 = call <WIDTH x i64>  @__gather64_i64(<WIDTH x i64> %v64,
                                                <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %g64_64)
+  %g64_d = call <WIDTH x double>  @__gather64_double(<WIDTH x i64> %v64,
+                                                    <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %g64_d)

  %pgbo32_8 = call <WIDTH x i8>
       @__pseudo_gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@@ -1804,10 +1943,18 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
       @__pseudo_gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
                                           <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %pgbo32_32)
+  %pgbo32_f = call <WIDTH x float>
+       @__pseudo_gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                           <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %pgbo32_f)
  %pgbo32_64 = call <WIDTH x i64>
       @__pseudo_gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
                                           <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %pgbo32_64)
+  %pgbo32_d = call <WIDTH x double>
+       @__pseudo_gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                           <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %pgbo32_d)

  %gbo32_8 = call <WIDTH x i8>
       @__gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@@ -1821,10 +1968,18 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
       @__gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %gbo32_32)
+  %gbo32_f = call <WIDTH x float>
+       @__gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %gbo32_f)
  %gbo32_64 = call <WIDTH x i64>
       @__gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %gbo32_64)
+  %gbo32_d = call <WIDTH x double>
+       @__gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %gbo32_d)


  %pgbo64_8 = call <WIDTH x i8>
@@ -1839,10 +1994,18 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
       @__pseudo_gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
                                           <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %pgbo64_32)
+  %pgbo64_f = call <WIDTH x float>
+       @__pseudo_gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                           <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %pgbo64_f)
  %pgbo64_64 = call <WIDTH x i64>
       @__pseudo_gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
                                           <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %pgbo64_64)
+  %pgbo64_d = call <WIDTH x double>
+       @__pseudo_gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                           <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %pgbo64_d)

  %gbo64_8 = call <WIDTH x i8>
       @__gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@@ -1856,10 +2019,18 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
       @__gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
  call void @__use32(<WIDTH x i32> %gbo64_32)
+  %gbo64_f = call <WIDTH x float>
+       @__gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__usefloat(<WIDTH x float> %gbo64_f)
  %gbo64_64 = call <WIDTH x i64>
       @__gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
  call void @__use64(<WIDTH x i64> %gbo64_64)
+  %gbo64_d = call <WIDTH x double>
+       @__gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__usedouble(<WIDTH x double> %gbo64_d)

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; scatters
@@ -1867,22 +2038,30 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
  call void @__pseudo_scatter32_i8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter32_i16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter32_i32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_float(<WIDTH x i32> %v32, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter32_i64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_double(<WIDTH x i32> %v32, <WIDTH x double> %vd, <WIDTH x MASK> %mask)

  call void @__pseudo_scatter64_i8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter64_i16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter64_i32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_float(<WIDTH x i64> %v64, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_double(<WIDTH x i64> %v64, <WIDTH x double> %vd, <WIDTH x MASK> %mask)

  call void @__scatter32_i8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
  call void @__scatter32_i16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
  call void @__scatter32_i32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter32_float(<WIDTH x i32> %v32, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
  call void @__scatter32_i64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__scatter32_double(<WIDTH x i32> %v32, <WIDTH x double> %vd, <WIDTH x MASK> %mask)

  call void @__scatter64_i8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
  call void @__scatter64_i16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
  call void @__scatter64_i32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter64_float(<WIDTH x i64> %v64, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
  call void @__scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__scatter64_double(<WIDTH x i64> %v64, <WIDTH x double> %vd, <WIDTH x MASK> %mask)

  call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
@@ -1890,8 +2069,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x float> %vf, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                    <WIDTH x double> %vd, <WIDTH x MASK> %mask)

  call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
@@ -1899,8 +2082,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                   <WIDTH x float> %vf, <WIDTH x MASK> %mask)
  call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                    <WIDTH x double> %vd, <WIDTH x MASK> %mask)

  call void @__scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
                                         <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
@@ -1908,8 +2095,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
                                          <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
  call void @__scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                            <WIDTH x float> %vf, <WIDTH x MASK> %mask)
  call void @__scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                             <WIDTH x double> %vd, <WIDTH x MASK> %mask)

  call void @__scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
                                         <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
@@ -1917,8 +2108,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
                                          <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
  call void @__scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                            <WIDTH x float> %vf, <WIDTH x MASK> %mask)
  call void @__scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                             <WIDTH x double> %vd, <WIDTH x MASK> %mask)

  ret void
 }