Add separate variants of memory built-ins for floats and doubles.

Previously, we'd bitcast e.g. a vector of floats to a vector of i32s and then
use the i32 variant of masked_load/masked_store/gather/scatter.  Now, we have
separate float/double variants of each of those.
This commit is contained in:
Matt Pharr
2012-06-07 14:29:17 -07:00
parent 1ac3e03171
commit 89a2566e01
17 changed files with 593 additions and 41 deletions

View File

@@ -356,7 +356,9 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
load_and_broadcast(i8) load_and_broadcast(i8)
load_and_broadcast(i16) load_and_broadcast(i16)
load_and_broadcast(i32) load_and_broadcast(i32)
load_and_broadcast(float)
load_and_broadcast(i64) load_and_broadcast(i64)
load_and_broadcast(double)
; no masked load instruction for i8 and i16 types?? ; no masked load instruction for i8 and i16 types??
masked_load(i8, 1) masked_load(i8, 1)
@@ -417,6 +419,7 @@ define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinli
ret <16 x i64> %val ret <16 x i64> %val
} }
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store ;; masked store
@@ -493,6 +496,7 @@ define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
ret void ret void
} }
masked_store_float_double()
masked_store_blend_8_16_by_16() masked_store_blend_8_16_by_16()
@@ -601,7 +605,9 @@ define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %ne
gen_scatter(i8) gen_scatter(i8)
gen_scatter(i16) gen_scatter(i16)
gen_scatter(i32) gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64) gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt ;; double precision sqrt

View File

@@ -337,7 +337,9 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
load_and_broadcast(i8) load_and_broadcast(i8)
load_and_broadcast(i16) load_and_broadcast(i16)
load_and_broadcast(i32) load_and_broadcast(i32)
load_and_broadcast(float)
load_and_broadcast(i64) load_and_broadcast(i64)
load_and_broadcast(double)
; no masked load instruction for i8 and i16 types?? ; no masked load instruction for i8 and i16 types??
masked_load(i8, 1) masked_load(i8, 1)
@@ -373,6 +375,7 @@ define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline
ret <8 x i64> %val ret <8 x i64> %val
} }
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store ;; masked store
@@ -488,6 +491,7 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void ret void
} }
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter ;; scatter
@@ -495,7 +499,9 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
gen_scatter(i8) gen_scatter(i8)
gen_scatter(i16) gen_scatter(i16)
gen_scatter(i32) gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64) gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt ;; double precision sqrt

View File

@@ -72,4 +72,6 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
gen_gather(i8) gen_gather(i8)
gen_gather(i16) gen_gather(i16)
gen_gather(i32) gen_gather(i32)
gen_gather(float)
gen_gather(i64) gen_gather(i64)
gen_gather(double)

View File

@@ -72,4 +72,6 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
gen_gather(i8) gen_gather(i8)
gen_gather(i16) gen_gather(i16)
gen_gather(i32) gen_gather(i32)
gen_gather(float)
gen_gather(i64) gen_gather(i64)
gen_gather(double)

View File

@@ -124,4 +124,6 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
gen_gather(i8) gen_gather(i8)
gen_gather(i16) gen_gather(i16)
gen_gather(i32) gen_gather(i32)
gen_gather(float)
gen_gather(i64) gen_gather(i64)
gen_gather(double)

View File

@@ -107,4 +107,6 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
gen_gather(i8) gen_gather(i8)
gen_gather(i16) gen_gather(i16)
gen_gather(i32) gen_gather(i32)
gen_gather(float)
gen_gather(i64) gen_gather(i64)
gen_gather(double)

View File

@@ -24,12 +24,16 @@ gen_masked_store(i64)
load_and_broadcast(i8) load_and_broadcast(i8)
load_and_broadcast(i16) load_and_broadcast(i16)
load_and_broadcast(i32) load_and_broadcast(i32)
load_and_broadcast(float)
load_and_broadcast(i64) load_and_broadcast(i64)
load_and_broadcast(double)
masked_load(i8, 1) masked_load(i8, 1)
masked_load(i16, 2) masked_load(i16, 2)
masked_load(i32, 4) masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8) masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter ;; gather/scatter
@@ -39,12 +43,16 @@ masked_load(i64, 8)
gen_gather(i8) gen_gather(i8)
gen_gather(i16) gen_gather(i16)
gen_gather(i32) gen_gather(i32)
gen_gather(float)
gen_gather(i64) gen_gather(i64)
gen_gather(double)
gen_scatter(i8) gen_scatter(i8)
gen_scatter(i16) gen_scatter(i16)
gen_scatter(i32) gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64) gen_scatter(i64)
gen_scatter(double)
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> , define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
@@ -187,6 +195,8 @@ define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
ret void ret void
} }
masked_store_float_double()
define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline { define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0 %item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31 %v = lshr i32 %item, 31
@@ -933,4 +943,3 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

View File

@@ -230,25 +230,32 @@ declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts ;; unaligned loads/loads+broadcasts
load_and_broadcast(i8) load_and_broadcast(i8)
load_and_broadcast(i16) load_and_broadcast(i16)
load_and_broadcast(i32) load_and_broadcast(i32)
load_and_broadcast(float)
load_and_broadcast(i64) load_and_broadcast(i64)
load_and_broadcast(double)
declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind <WIDTH x i1>) nounwind
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>, declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind <WIDTH x i1>) nounwind
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>, declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind <WIDTH x i1>) nounwind
declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>, declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind <WIDTH x i1> %mask) nounwind
declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
<WIDTH x i1> %mask) nounwind
ifelse(LLVM_VERSION, `LLVM_3_0', ` ifelse(LLVM_VERSION, `LLVM_3_0', `
declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
@@ -257,8 +264,12 @@ declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind <WIDTH x i1>) nounwind
declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>, declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind <WIDTH x i1>) nounwind
declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>, declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind <WIDTH x i1> %mask) nounwind
declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
<WIDTH x i1> %mask) nounwind
', ` ', `
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind alwaysinline { <WIDTH x i1>) nounwind alwaysinline {
@@ -284,6 +295,14 @@ define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
ret void ret void
} }
; Masked "blend" store for float vectors: read the current memory contents,
; select the new value in lanes where the mask is set (old value elsewhere),
; and write the merged vector back.  NOTE(review): this performs a full-width
; load+store even for partially-on masks — assumes the whole vector is
; legally accessible, as with the i32/i64 blend variants in this file.
define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x float> * %0       ; current contents at the destination
%v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v  ; per-lane: mask ? new : old
store <WIDTH x float> %v1, <WIDTH x float> * %0  ; write merged result
ret void
}
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline { <WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0 %v = load <WIDTH x i64> * %0
@@ -291,6 +310,14 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
store <WIDTH x i64> %v1, <WIDTH x i64> * %0 store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void ret void
} }
; Masked "blend" store for double vectors: load the destination, select the
; new value in mask-on lanes, keep the old value in mask-off lanes, store the
; merged vector.  Mirrors __masked_store_blend_i64 with double element type.
define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
<WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x double> * %0      ; current contents at the destination
%v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v  ; per-lane: mask ? new : old
store <WIDTH x double> %v1, <WIDTH x double> * %0  ; write merged result
ret void
}
') ')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -319,7 +346,9 @@ declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
gather_scatter(i8) gather_scatter(i8)
gather_scatter(i16) gather_scatter(i16)
gather_scatter(i32) gather_scatter(i32)
gather_scatter(float)
gather_scatter(i64) gather_scatter(i64)
gather_scatter(double)
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture, declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
<WIDTH x i1>) nounwind <WIDTH x i1>) nounwind

View File

@@ -433,15 +433,19 @@ reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts ;; unaligned loads/loads+broadcasts
load_and_broadcast(i8) load_and_broadcast(i8)
load_and_broadcast(i16) load_and_broadcast(i16)
load_and_broadcast(i32) load_and_broadcast(i32)
load_and_broadcast(float)
load_and_broadcast(i64) load_and_broadcast(i64)
load_and_broadcast(double)
masked_load(i8, 1) masked_load(i8, 1)
masked_load(i16, 2) masked_load(i16, 2)
masked_load(i32, 4) masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8) masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter ;; gather/scatter
@@ -449,12 +453,16 @@ masked_load(i64, 8)
gen_gather(i8) gen_gather(i8)
gen_gather(i16) gen_gather(i16)
gen_gather(i32) gen_gather(i32)
gen_gather(float)
gen_gather(i64) gen_gather(i64)
gen_gather(double)
gen_scatter(i8) gen_scatter(i8)
gen_scatter(i16) gen_scatter(i16)
gen_scatter(i32) gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64) gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding ;; float rounding
@@ -617,6 +625,8 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void ret void
} }
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt ;; double precision sqrt

View File

@@ -401,6 +401,8 @@ define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
} }
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp
@@ -563,12 +565,16 @@ gen_masked_store(i64)
load_and_broadcast(i8) load_and_broadcast(i8)
load_and_broadcast(i16) load_and_broadcast(i16)
load_and_broadcast(i32) load_and_broadcast(i32)
load_and_broadcast(float)
load_and_broadcast(i64) load_and_broadcast(i64)
load_and_broadcast(double)
masked_load(i8, 1) masked_load(i8, 1)
masked_load(i16, 2) masked_load(i16, 2)
masked_load(i32, 4) masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8) masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter ;; gather/scatter
@@ -578,9 +584,13 @@ masked_load(i64, 8)
gen_gather(i8) gen_gather(i8)
gen_gather(i16) gen_gather(i16)
gen_gather(i32) gen_gather(i32)
gen_gather(float)
gen_gather(i64) gen_gather(i64)
gen_gather(double)
gen_scatter(i8) gen_scatter(i8)
gen_scatter(i16) gen_scatter(i16)
gen_scatter(i32) gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64) gen_scatter(i64)
gen_scatter(double)

View File

@@ -363,11 +363,16 @@ reduce_equal(8)
load_and_broadcast(i8) load_and_broadcast(i8)
load_and_broadcast(i16) load_and_broadcast(i16)
load_and_broadcast(i32) load_and_broadcast(i32)
load_and_broadcast(float)
load_and_broadcast(i64) load_and_broadcast(i64)
load_and_broadcast(double)
masked_load(i8, 1) masked_load(i8, 1)
masked_load(i16, 2) masked_load(i16, 2)
masked_load(i32, 4) masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8) masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter ;; gather/scatter
@@ -375,12 +380,16 @@ masked_load(i64, 8)
gen_gather(i8) gen_gather(i8)
gen_gather(i16) gen_gather(i16)
gen_gather(i32) gen_gather(i32)
gen_gather(float)
gen_gather(i64) gen_gather(i64)
gen_gather(double)
gen_scatter(i8) gen_scatter(i8)
gen_scatter(i16) gen_scatter(i16)
gen_scatter(i32) gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64) gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding ;; float rounding
@@ -550,6 +559,7 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void ret void
} }
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt ;; double precision sqrt

View File

@@ -456,18 +456,24 @@ gen_masked_store(i16)
gen_masked_store(i32) gen_masked_store(i32)
gen_masked_store(i64) gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts ;; unaligned loads/loads+broadcasts
load_and_broadcast(i8) load_and_broadcast(i8)
load_and_broadcast(i16) load_and_broadcast(i16)
load_and_broadcast(i32) load_and_broadcast(i32)
load_and_broadcast(float)
load_and_broadcast(i64) load_and_broadcast(i64)
load_and_broadcast(double)
masked_load(i8, 1) masked_load(i8, 1)
masked_load(i16, 2) masked_load(i16, 2)
masked_load(i32, 4) masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8) masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter ;; gather/scatter
@@ -477,9 +483,13 @@ masked_load(i64, 8)
gen_gather(i8) gen_gather(i8)
gen_gather(i16) gen_gather(i16)
gen_gather(i32) gen_gather(i32)
gen_gather(float)
gen_gather(i64) gen_gather(i64)
gen_gather(double)
gen_scatter(i8) gen_scatter(i8)
gen_scatter(i16) gen_scatter(i16)
gen_scatter(i32) gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64) gen_scatter(i64)
gen_scatter(double)

View File

@@ -1533,6 +1533,63 @@ define void
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; m4 macro: emit __masked_load_float / __masked_load_double in terms of the
;; existing integer masked loads.  A masked load moves bits, not values, so
;; calling the same-width integer variant and bitcasting the result is exact
;; (bitcast reinterprets the bit pattern; no conversion occurs).
define(`masked_load_float_double', `
;; float: <WIDTH x i32> masked load, reinterpreted as <WIDTH x float>
define <WIDTH x float> @__masked_load_float(i8 * %ptr,
<WIDTH x MASK> %mask) readonly alwaysinline {
%v32 = call <WIDTH x i32> @__masked_load_i32(i8 * %ptr, <WIDTH x MASK> %mask)
%vf = bitcast <WIDTH x i32> %v32 to <WIDTH x float>
ret <WIDTH x float> %vf
}
;; double: <WIDTH x i64> masked load, reinterpreted as <WIDTH x double>
define <WIDTH x double> @__masked_load_double(i8 * %ptr,
<WIDTH x MASK> %mask) readonly alwaysinline {
%v64 = call <WIDTH x i64> @__masked_load_i64(i8 * %ptr, <WIDTH x MASK> %mask)
%vd = bitcast <WIDTH x i64> %v64 to <WIDTH x double>
ret <WIDTH x double> %vd
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; m4 macro: emit the float/double masked-store and masked-store-blend
;; variants by bitcasting both the pointer and the value to the same-width
;; integer types and forwarding to the integer implementations.  Stores move
;; bits, so this is exact; it centralizes the bitcast that callers previously
;; had to do themselves.
define(`masked_store_float_double', `
;; float store -> forward to __masked_store_i32 (same 32-bit lanes)
define void @__masked_store_float(<WIDTH x float> * nocapture, <WIDTH x float>,
<WIDTH x MASK>) nounwind alwaysinline {
%ptr = bitcast <WIDTH x float> * %0 to <WIDTH x i32> *
%val = bitcast <WIDTH x float> %1 to <WIDTH x i32>
call void @__masked_store_i32(<WIDTH x i32> * %ptr, <WIDTH x i32> %val, <WIDTH x MASK> %2)
ret void
}
;; double store -> forward to __masked_store_i64 (same 64-bit lanes)
define void @__masked_store_double(<WIDTH x double> * nocapture, <WIDTH x double>,
<WIDTH x MASK>) nounwind alwaysinline {
%ptr = bitcast <WIDTH x double> * %0 to <WIDTH x i64> *
%val = bitcast <WIDTH x double> %1 to <WIDTH x i64>
call void @__masked_store_i64(<WIDTH x i64> * %ptr, <WIDTH x i64> %val, <WIDTH x MASK> %2)
ret void
}
;; float blend store -> forward to __masked_store_blend_i32
define void @__masked_store_blend_float(<WIDTH x float> * nocapture, <WIDTH x float>,
<WIDTH x MASK>) nounwind alwaysinline {
%ptr = bitcast <WIDTH x float> * %0 to <WIDTH x i32> *
%val = bitcast <WIDTH x float> %1 to <WIDTH x i32>
call void @__masked_store_blend_i32(<WIDTH x i32> * %ptr, <WIDTH x i32> %val, <WIDTH x MASK> %2)
ret void
}
;; double blend store -> forward to __masked_store_blend_i64
define void @__masked_store_blend_double(<WIDTH x double> * nocapture, <WIDTH x double>,
<WIDTH x MASK>) nounwind alwaysinline {
%ptr = bitcast <WIDTH x double> * %0 to <WIDTH x i64> *
%val = bitcast <WIDTH x double> %1 to <WIDTH x i64>
call void @__masked_store_blend_i64(<WIDTH x i64> * %ptr, <WIDTH x i64> %val, <WIDTH x MASK> %2)
ret void
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
define(`stdlib_core', ` define(`stdlib_core', `
@@ -1552,7 +1609,9 @@ declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
; void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask) ; void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask)
; void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask) ; void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask)
; void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask) ; void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask)
; void __pseudo_masked_store_float(uniform float *ptr, varying float values, mask)
; void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask) ; void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask)
; void __pseudo_masked_store_double(uniform double *ptr, varying double values, mask)
; ;
; These in turn are converted to native masked stores or to regular ; These in turn are converted to native masked stores or to regular
; stores (if the mask is all on) by the MaskedStoreOptPass optimization ; stores (if the mask is all on) by the MaskedStoreOptPass optimization
@@ -1561,7 +1620,9 @@ declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
declare void @__pseudo_masked_store_i8(<WIDTH x i8> * nocapture, <WIDTH x i8>, <WIDTH x MASK>) declare void @__pseudo_masked_store_i8(<WIDTH x i8> * nocapture, <WIDTH x i8>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_i16(<WIDTH x i16> * nocapture, <WIDTH x i16>, <WIDTH x MASK>) declare void @__pseudo_masked_store_i16(<WIDTH x i16> * nocapture, <WIDTH x i16>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_i32(<WIDTH x i32> * nocapture, <WIDTH x i32>, <WIDTH x MASK>) declare void @__pseudo_masked_store_i32(<WIDTH x i32> * nocapture, <WIDTH x i32>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_float(<WIDTH x float> * nocapture, <WIDTH x float>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>, <WIDTH x MASK>) declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>, <WIDTH x MASK>)
declare void @__pseudo_masked_store_double(<WIDTH x double> * nocapture, <WIDTH x double>, <WIDTH x MASK>)
; Declare the pseudo-gather functions. When the ispc front-end needs ; Declare the pseudo-gather functions. When the ispc front-end needs
; to perform a gather, it generates a call to one of these functions, ; to perform a gather, it generates a call to one of these functions,
@@ -1570,7 +1631,9 @@ declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>
; varying int8 __pseudo_gather_i8(varying int8 *, mask) ; varying int8 __pseudo_gather_i8(varying int8 *, mask)
; varying int16 __pseudo_gather_i16(varying int16 *, mask) ; varying int16 __pseudo_gather_i16(varying int16 *, mask)
; varying int32 __pseudo_gather_i32(varying int32 *, mask) ; varying int32 __pseudo_gather_i32(varying int32 *, mask)
; varying float __pseudo_gather_float(varying float *, mask)
; varying int64 __pseudo_gather_i64(varying int64 *, mask) ; varying int64 __pseudo_gather_i64(varying int64 *, mask)
; varying double __pseudo_gather_double(varying double *, mask)
; ;
; The GatherScatterFlattenOpt optimization pass finds these calls and then ; The GatherScatterFlattenOpt optimization pass finds these calls and then
; converts them to make calls to the following functions (when appropriate); ; converts them to make calls to the following functions (when appropriate);
@@ -1582,8 +1645,8 @@ declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>
; that use the free 2/4/8 scaling available in x86 addressing calculations, and ; that use the free 2/4/8 scaling available in x86 addressing calculations, and
; offset_delta feeds into the free offset calculation. ; offset_delta feeds into the free offset calculation.
; ;
; varying int{8,16,32,64} ; varying int{8,16,32,float,64,double}
; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base, ; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale, ; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask) ; int{32,64} offset_delta, mask)
; ;
@@ -1594,12 +1657,16 @@ declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>
declare <WIDTH x i8> @__pseudo_gather32_i8(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly declare <WIDTH x i8> @__pseudo_gather32_i8(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather32_i16(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly declare <WIDTH x i16> @__pseudo_gather32_i16(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather32_i32(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly declare <WIDTH x i32> @__pseudo_gather32_i32(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather32_float(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather32_i64(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly declare <WIDTH x i64> @__pseudo_gather32_i64(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather32_double(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather64_i8(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly declare <WIDTH x i8> @__pseudo_gather64_i8(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather64_i16(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly declare <WIDTH x i16> @__pseudo_gather64_i16(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather64_i32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly declare <WIDTH x i32> @__pseudo_gather64_i32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather64_float(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather64_i64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly declare <WIDTH x i64> @__pseudo_gather64_i64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather64_double(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_i8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>, declare <WIDTH x i8> @__pseudo_gather_base_offsets32_i8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly <WIDTH x MASK>) nounwind readonly
@@ -1607,8 +1674,12 @@ declare <WIDTH x i16> @__pseudo_gather_base_offsets32_i16(i8 *, <WIDTH x i32>, i
<WIDTH x MASK>) nounwind readonly <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_i32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>, declare <WIDTH x i32> @__pseudo_gather_base_offsets32_i32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly <WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather_base_offsets32_float(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_i64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>, declare <WIDTH x i64> @__pseudo_gather_base_offsets32_i64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly <WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather_base_offsets32_double(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_i8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>, declare <WIDTH x i8> @__pseudo_gather_base_offsets64_i8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly <WIDTH x MASK>) nounwind readonly
@@ -1616,8 +1687,12 @@ declare <WIDTH x i16> @__pseudo_gather_base_offsets64_i16(i8 *, <WIDTH x i64>, i
<WIDTH x MASK>) nounwind readonly <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_i32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>, declare <WIDTH x i32> @__pseudo_gather_base_offsets64_i32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly <WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather_base_offsets64_float(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>, declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly <WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather_base_offsets64_double(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
; Similarly to the pseudo-gathers defined above, we also declare undefined ; Similarly to the pseudo-gathers defined above, we also declare undefined
; pseudo-scatter instructions with signatures: ; pseudo-scatter instructions with signatures:
@@ -1625,7 +1700,9 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i
; void __pseudo_scatter_i8 (varying int8 *, varying int8 values, mask) ; void __pseudo_scatter_i8 (varying int8 *, varying int8 values, mask)
; void __pseudo_scatter_i16(varying int16 *, varying int16 values, mask) ; void __pseudo_scatter_i16(varying int16 *, varying int16 values, mask)
; void __pseudo_scatter_i32(varying int32 *, varying int32 values, mask) ; void __pseudo_scatter_i32(varying int32 *, varying int32 values, mask)
; void __pseudo_scatter_float(varying float *, varying float values, mask)
; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask) ; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask)
; void __pseudo_scatter_double(varying double *, varying double values, mask)
; ;
; The GatherScatterFlattenOpt optimization pass also finds these and ; The GatherScatterFlattenOpt optimization pass also finds these and
; transforms them to scatters like: ; transforms them to scatters like:
@@ -1641,12 +1718,16 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i
declare void @__pseudo_scatter32_i8(<WIDTH x i32>, <WIDTH x i8>, <WIDTH x MASK>) nounwind declare void @__pseudo_scatter32_i8(<WIDTH x i32>, <WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter32_i16(<WIDTH x i32>, <WIDTH x i16>, <WIDTH x MASK>) nounwind declare void @__pseudo_scatter32_i16(<WIDTH x i32>, <WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter32_i32(<WIDTH x i32>, <WIDTH x i32>, <WIDTH x MASK>) nounwind declare void @__pseudo_scatter32_i32(<WIDTH x i32>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter32_float(<WIDTH x i32>, <WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter32_i64(<WIDTH x i32>, <WIDTH x i64>, <WIDTH x MASK>) nounwind declare void @__pseudo_scatter32_i64(<WIDTH x i32>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter32_double(<WIDTH x i32>, <WIDTH x double>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_i8(<WIDTH x i64>, <WIDTH x i8>, <WIDTH x MASK>) nounwind declare void @__pseudo_scatter64_i8(<WIDTH x i64>, <WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_i16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>) nounwind declare void @__pseudo_scatter64_i16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_i32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind declare void @__pseudo_scatter64_i32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_float(<WIDTH x i64>, <WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_i64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind declare void @__pseudo_scatter64_i64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_double(<WIDTH x i64>, <WIDTH x double>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>, declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind <WIDTH x i8>, <WIDTH x MASK>) nounwind
@@ -1654,8 +1735,12 @@ declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind <WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>, declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind <WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>, declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>, declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind <WIDTH x i8>, <WIDTH x MASK>) nounwind
@@ -1663,8 +1748,12 @@ declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind <WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>, declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind <WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>, declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare float @__log_uniform_float(float) nounwind readnone declare float @__log_uniform_float(float) nounwind readnone
declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone
@@ -1678,7 +1767,9 @@ declare <WIDTH x float> @__pow_varying_float(<WIDTH x float>, <WIDTH x float>) n
declare void @__use8(<WIDTH x i8>) declare void @__use8(<WIDTH x i8>)
declare void @__use16(<WIDTH x i16>) declare void @__use16(<WIDTH x i16>)
declare void @__use32(<WIDTH x i32>) declare void @__use32(<WIDTH x i32>)
declare void @__usefloat(<WIDTH x float>)
declare void @__use64(<WIDTH x i64>) declare void @__use64(<WIDTH x i64>)
declare void @__usedouble(<WIDTH x double>)
;; This is a temporary function that will be removed at the end of ;; This is a temporary function that will be removed at the end of
;; compilation--the idea is that it calls out to all of the various ;; compilation--the idea is that it calls out to all of the various
@@ -1698,8 +1789,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
call void @__use16(<WIDTH x i16> %ml16) call void @__use16(<WIDTH x i16> %ml16)
%ml32 = call <WIDTH x i32> @__masked_load_i32(i8 * %ptr, <WIDTH x MASK> %mask) %ml32 = call <WIDTH x i32> @__masked_load_i32(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %ml32) call void @__use32(<WIDTH x i32> %ml32)
%mlf = call <WIDTH x float> @__masked_load_float(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %mlf)
%ml64 = call <WIDTH x i64> @__masked_load_i64(i8 * %ptr, <WIDTH x MASK> %mask) %ml64 = call <WIDTH x i64> @__masked_load_i64(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %ml64) call void @__use64(<WIDTH x i64> %ml64)
%mld = call <WIDTH x double> @__masked_load_double(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %mld)
%lb8 = call <WIDTH x i8> @__load_and_broadcast_i8(i8 * %ptr, <WIDTH x MASK> %mask) %lb8 = call <WIDTH x i8> @__load_and_broadcast_i8(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %lb8) call void @__use8(<WIDTH x i8> %lb8)
@@ -1707,8 +1802,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
call void @__use16(<WIDTH x i16> %lb16) call void @__use16(<WIDTH x i16> %lb16)
%lb32 = call <WIDTH x i32> @__load_and_broadcast_i32(i8 * %ptr, <WIDTH x MASK> %mask) %lb32 = call <WIDTH x i32> @__load_and_broadcast_i32(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %lb32) call void @__use32(<WIDTH x i32> %lb32)
%lbf = call <WIDTH x float> @__load_and_broadcast_float(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %lbf)
%lb64 = call <WIDTH x i64> @__load_and_broadcast_i64(i8 * %ptr, <WIDTH x MASK> %mask) %lb64 = call <WIDTH x i64> @__load_and_broadcast_i64(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %lb64) call void @__use64(<WIDTH x i64> %lb64)
%lbd = call <WIDTH x double> @__load_and_broadcast_double(i8 * %ptr, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %lbd)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; stores ;; stores
@@ -1721,21 +1820,37 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
%pv32 = bitcast i8 * %ptr to <WIDTH x i32> * %pv32 = bitcast i8 * %ptr to <WIDTH x i32> *
call void @__pseudo_masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, call void @__pseudo_masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
%vf = bitcast <WIDTH x i32> %v32 to <WIDTH x float>
%pvf = bitcast i8 * %ptr to <WIDTH x float> *
call void @__pseudo_masked_store_float(<WIDTH x float> * %pvf, <WIDTH x float> %vf,
<WIDTH x MASK> %mask)
%pv64 = bitcast i8 * %ptr to <WIDTH x i64> * %pv64 = bitcast i8 * %ptr to <WIDTH x i64> *
call void @__pseudo_masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, call void @__pseudo_masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
%vd = bitcast <WIDTH x i64> %v64 to <WIDTH x double>
%pvd = bitcast i8 * %ptr to <WIDTH x double> *
call void @__pseudo_masked_store_double(<WIDTH x double> * %pvd, <WIDTH x double> %vd,
<WIDTH x MASK> %mask)
call void @__masked_store_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask) call void @__masked_store_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__masked_store_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask) call void @__masked_store_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask) call void @__masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__masked_store_float(<WIDTH x float> * %pvf, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask) call void @__masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__masked_store_double(<WIDTH x double> * %pvd, <WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__masked_store_blend_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, call void @__masked_store_blend_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__masked_store_blend_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, call void @__masked_store_blend_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__masked_store_blend_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, call void @__masked_store_blend_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__masked_store_blend_float(<WIDTH x float> * %pvf, <WIDTH x float> %vf,
<WIDTH x MASK> %mask)
call void @__masked_store_blend_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, call void @__masked_store_blend_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__masked_store_blend_double(<WIDTH x double> * %pvd, <WIDTH x double> %vd,
<WIDTH x MASK> %mask)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gathers ;; gathers
@@ -1749,9 +1864,15 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
%pg32_32 = call <WIDTH x i32> @__pseudo_gather32_i32(<WIDTH x i32> %v32, %pg32_32 = call <WIDTH x i32> @__pseudo_gather32_i32(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pg32_32) call void @__use32(<WIDTH x i32> %pg32_32)
%pg32_f = call <WIDTH x float> @__pseudo_gather32_float(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pg32_f)
%pg32_64 = call <WIDTH x i64> @__pseudo_gather32_i64(<WIDTH x i32> %v32, %pg32_64 = call <WIDTH x i64> @__pseudo_gather32_i64(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pg32_64) call void @__use64(<WIDTH x i64> %pg32_64)
%pg32_d = call <WIDTH x double> @__pseudo_gather32_double(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pg32_d)
%pg64_8 = call <WIDTH x i8> @__pseudo_gather64_i8(<WIDTH x i64> %v64, %pg64_8 = call <WIDTH x i8> @__pseudo_gather64_i8(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
@@ -1762,9 +1883,15 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
%pg64_32 = call <WIDTH x i32> @__pseudo_gather64_i32(<WIDTH x i64> %v64, %pg64_32 = call <WIDTH x i32> @__pseudo_gather64_i32(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pg64_32) call void @__use32(<WIDTH x i32> %pg64_32)
%pg64_f = call <WIDTH x float> @__pseudo_gather64_float(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pg64_f)
%pg64_64 = call <WIDTH x i64> @__pseudo_gather64_i64(<WIDTH x i64> %v64, %pg64_64 = call <WIDTH x i64> @__pseudo_gather64_i64(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pg64_64) call void @__use64(<WIDTH x i64> %pg64_64)
%pg64_d = call <WIDTH x double> @__pseudo_gather64_double(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pg64_d)
%g32_8 = call <WIDTH x i8> @__gather32_i8(<WIDTH x i32> %v32, %g32_8 = call <WIDTH x i8> @__gather32_i8(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
@@ -1775,9 +1902,15 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
%g32_32 = call <WIDTH x i32> @__gather32_i32(<WIDTH x i32> %v32, %g32_32 = call <WIDTH x i32> @__gather32_i32(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %g32_32) call void @__use32(<WIDTH x i32> %g32_32)
%g32_f = call <WIDTH x float> @__gather32_float(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %g32_f)
%g32_64 = call <WIDTH x i64> @__gather32_i64(<WIDTH x i32> %v32, %g32_64 = call <WIDTH x i64> @__gather32_i64(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %g32_64) call void @__use64(<WIDTH x i64> %g32_64)
%g32_d = call <WIDTH x double> @__gather32_double(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %g32_d)
%g64_8 = call <WIDTH x i8> @__gather64_i8(<WIDTH x i64> %v64, %g64_8 = call <WIDTH x i8> @__gather64_i8(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
@@ -1788,9 +1921,15 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
%g64_32 = call <WIDTH x i32> @__gather64_i32(<WIDTH x i64> %v64, %g64_32 = call <WIDTH x i32> @__gather64_i32(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %g64_32) call void @__use32(<WIDTH x i32> %g64_32)
%g64_f = call <WIDTH x float> @__gather64_float(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %g64_f)
%g64_64 = call <WIDTH x i64> @__gather64_i64(<WIDTH x i64> %v64, %g64_64 = call <WIDTH x i64> @__gather64_i64(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask) <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %g64_64) call void @__use64(<WIDTH x i64> %g64_64)
%g64_d = call <WIDTH x double> @__gather64_double(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %g64_d)
%pgbo32_8 = call <WIDTH x i8> %pgbo32_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, @__pseudo_gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@@ -1804,10 +1943,18 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
@__pseudo_gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, @__pseudo_gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask) <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pgbo32_32) call void @__use32(<WIDTH x i32> %pgbo32_32)
%pgbo32_f = call <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pgbo32_f)
%pgbo32_64 = call <WIDTH x i64> %pgbo32_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, @__pseudo_gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask) <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pgbo32_64) call void @__use64(<WIDTH x i64> %pgbo32_64)
%pgbo32_d = call <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pgbo32_d)
%gbo32_8 = call <WIDTH x i8> %gbo32_8 = call <WIDTH x i8>
@__gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, @__gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@@ -1821,10 +1968,18 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
@__gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, @__gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask) <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo32_32) call void @__use32(<WIDTH x i32> %gbo32_32)
%gbo32_f = call <WIDTH x float>
@__gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo32_f)
%gbo32_64 = call <WIDTH x i64> %gbo32_64 = call <WIDTH x i64>
@__gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, @__gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask) <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo32_64) call void @__use64(<WIDTH x i64> %gbo32_64)
%gbo32_d = call <WIDTH x double>
@__gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo32_d)
%pgbo64_8 = call <WIDTH x i8> %pgbo64_8 = call <WIDTH x i8>
@@ -1839,10 +1994,18 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
@__pseudo_gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, @__pseudo_gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask) <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pgbo64_32) call void @__use32(<WIDTH x i32> %pgbo64_32)
%pgbo64_f = call <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pgbo64_f)
%pgbo64_64 = call <WIDTH x i64> %pgbo64_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, @__pseudo_gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask) <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pgbo64_64) call void @__use64(<WIDTH x i64> %pgbo64_64)
%pgbo64_d = call <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pgbo64_d)
%gbo64_8 = call <WIDTH x i8> %gbo64_8 = call <WIDTH x i8>
@__gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, @__gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@@ -1856,10 +2019,18 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
@__gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, @__gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask) <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo64_32) call void @__use32(<WIDTH x i32> %gbo64_32)
%gbo64_f = call <WIDTH x float>
@__gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo64_f)
%gbo64_64 = call <WIDTH x i64> %gbo64_64 = call <WIDTH x i64>
@__gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, @__gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask) <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo64_64) call void @__use64(<WIDTH x i64> %gbo64_64)
%gbo64_d = call <WIDTH x double>
@__gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo64_d)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatters ;; scatters
@@ -1867,22 +2038,30 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
call void @__pseudo_scatter32_i8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask) call void @__pseudo_scatter32_i8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter32_i16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask) call void @__pseudo_scatter32_i16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter32_i32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask) call void @__pseudo_scatter32_i32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter32_float(<WIDTH x i32> %v32, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter32_i64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask) call void @__pseudo_scatter32_i64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter32_double(<WIDTH x i32> %v32, <WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__pseudo_scatter64_i8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask) call void @__pseudo_scatter64_i8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter64_i16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask) call void @__pseudo_scatter64_i16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter64_i32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask) call void @__pseudo_scatter64_i32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter64_float(<WIDTH x i64> %v64, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask) call void @__pseudo_scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter64_double(<WIDTH x i64> %v64, <WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter32_i8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask) call void @__scatter32_i8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter32_i16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask) call void @__scatter32_i16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter32_i32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask) call void @__scatter32_i32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter32_float(<WIDTH x i32> %v32, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter32_i64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask) call void @__scatter32_i64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter32_double(<WIDTH x i32> %v32, <WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter64_i8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask) call void @__scatter64_i8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter64_i16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask) call void @__scatter64_i16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter64_i32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask) call void @__scatter64_i32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter64_float(<WIDTH x i64> %v64, <WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask) call void @__scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter64_double(<WIDTH x i64> %v64, <WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32, call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask) <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
@@ -1890,8 +2069,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask) <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32, call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask) <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32, call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask) <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64, call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask) <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
@@ -1899,8 +2082,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask) <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64, call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask) <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64, call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask) <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32, call void @__scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask) <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
@@ -1908,8 +2095,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask) <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32, call void @__scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask) <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32, call void @__scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask) <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64, call void @__scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask) <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
@@ -1917,8 +2108,12 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask) <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64, call void @__scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask) <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64, call void @__scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask) <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
ret void ret void
} }

64
ctx.cpp
View File

@@ -2518,12 +2518,16 @@ FunctionEmitContext::gather(llvm::Value *ptr, const PointerType *ptrType,
if (pt != NULL) if (pt != NULL)
funcName = g->target.is32Bit ? "__pseudo_gather32_i32" : funcName = g->target.is32Bit ? "__pseudo_gather32_i32" :
"__pseudo_gather64_i64"; "__pseudo_gather64_i64";
else if (llvmReturnType == LLVMTypes::DoubleVectorType || else if (llvmReturnType == LLVMTypes::DoubleVectorType)
llvmReturnType == LLVMTypes::Int64VectorType) funcName = g->target.is32Bit ? "__pseudo_gather32_double" :
"__pseudo_gather64_double";
else if (llvmReturnType == LLVMTypes::Int64VectorType)
funcName = g->target.is32Bit ? "__pseudo_gather32_i64" : funcName = g->target.is32Bit ? "__pseudo_gather32_i64" :
"__pseudo_gather64_i64"; "__pseudo_gather64_i64";
else if (llvmReturnType == LLVMTypes::FloatVectorType || else if (llvmReturnType == LLVMTypes::FloatVectorType)
llvmReturnType == LLVMTypes::Int32VectorType) funcName = g->target.is32Bit ? "__pseudo_gather32_float" :
"__pseudo_gather64_float";
else if (llvmReturnType == LLVMTypes::Int32VectorType)
funcName = g->target.is32Bit ? "__pseudo_gather32_i32" : funcName = g->target.is32Bit ? "__pseudo_gather32_i32" :
"__pseudo_gather64_i32"; "__pseudo_gather64_i32";
else if (llvmReturnType == LLVMTypes::Int16VectorType) else if (llvmReturnType == LLVMTypes::Int16VectorType)
@@ -2538,15 +2542,15 @@ FunctionEmitContext::gather(llvm::Value *ptr, const PointerType *ptrType,
llvm::Function *gatherFunc = m->module->getFunction(funcName); llvm::Function *gatherFunc = m->module->getFunction(funcName);
AssertPos(currentPos, gatherFunc != NULL); AssertPos(currentPos, gatherFunc != NULL);
llvm::Value *call = CallInst(gatherFunc, NULL, ptr, mask, name); llvm::Value *gatherCall = CallInst(gatherFunc, NULL, ptr, mask, name);
// Add metadata about the source file location so that the // Add metadata about the source file location so that the
// optimization passes can print useful performance warnings if we // optimization passes can print useful performance warnings if we
// can't optimize out this gather // can't optimize out this gather
if (disableGSWarningCount == 0) if (disableGSWarningCount == 0)
addGSMetadata(call, currentPos); addGSMetadata(gatherCall, currentPos);
return BitCastInst(call, llvmReturnType, LLVMGetName(call, "_gather_bitcast")); return gatherCall;
} }
@@ -2709,25 +2713,20 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
StoreInst(final, ptr); StoreInst(final, ptr);
return; return;
} }
else if (Type::Equal(valueType, AtomicType::VaryingDouble) || else if (Type::Equal(valueType, AtomicType::VaryingDouble)) {
Type::Equal(valueType, AtomicType::VaryingInt64) || maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double");
}
else if (Type::Equal(valueType, AtomicType::VaryingInt64) ||
Type::Equal(valueType, AtomicType::VaryingUInt64)) { Type::Equal(valueType, AtomicType::VaryingUInt64)) {
ptr = BitCastInst(ptr, LLVMTypes::Int64VectorPointerType,
LLVMGetName(ptr, "_to_int64vecptr"));
value = BitCastInst(value, LLVMTypes::Int64VectorType,
LLVMGetName(value, "_to_int64"));
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64");
} }
else if (Type::Equal(valueType, AtomicType::VaryingFloat) || else if (Type::Equal(valueType, AtomicType::VaryingFloat)) {
Type::Equal(valueType, AtomicType::VaryingBool) || maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float");
}
else if (Type::Equal(valueType, AtomicType::VaryingBool) ||
Type::Equal(valueType, AtomicType::VaryingInt32) || Type::Equal(valueType, AtomicType::VaryingInt32) ||
Type::Equal(valueType, AtomicType::VaryingUInt32) || Type::Equal(valueType, AtomicType::VaryingUInt32) ||
CastType<EnumType>(valueType) != NULL) { CastType<EnumType>(valueType) != NULL) {
ptr = BitCastInst(ptr, LLVMTypes::Int32VectorPointerType,
LLVMGetName(ptr, "_to_int32vecptr"));
if (Type::Equal(valueType, AtomicType::VaryingFloat))
value = BitCastInst(value, LLVMTypes::Int32VectorType,
LLVMGetName(value, "_to_int32"));
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32");
} }
else if (Type::Equal(valueType, AtomicType::VaryingInt16) || else if (Type::Equal(valueType, AtomicType::VaryingInt16) ||
@@ -2827,27 +2826,34 @@ FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr,
llvm::Type *type = value->getType(); llvm::Type *type = value->getType();
const char *funcName = NULL; const char *funcName = NULL;
if (pt != NULL) if (pt != NULL) {
funcName = g->target.is32Bit ? "__pseudo_scatter32_i32" : funcName = g->target.is32Bit ? "__pseudo_scatter32_i32" :
"__pseudo_scatter64_i64"; "__pseudo_scatter64_i64";
else if (type == LLVMTypes::DoubleVectorType || }
type == LLVMTypes::Int64VectorType) { else if (type == LLVMTypes::DoubleVectorType) {
funcName = g->target.is32Bit ? "__pseudo_scatter32_double" :
"__pseudo_scatter64_double";
}
else if (type == LLVMTypes::Int64VectorType) {
funcName = g->target.is32Bit ? "__pseudo_scatter32_i64" : funcName = g->target.is32Bit ? "__pseudo_scatter32_i64" :
"__pseudo_scatter64_i64"; "__pseudo_scatter64_i64";
value = BitCastInst(value, LLVMTypes::Int64VectorType, "value2int");
} }
else if (type == LLVMTypes::FloatVectorType || else if (type == LLVMTypes::FloatVectorType) {
type == LLVMTypes::Int32VectorType) { funcName = g->target.is32Bit ? "__pseudo_scatter32_float" :
"__pseudo_scatter64_float";
}
else if (type == LLVMTypes::Int32VectorType) {
funcName = g->target.is32Bit ? "__pseudo_scatter32_i32" : funcName = g->target.is32Bit ? "__pseudo_scatter32_i32" :
"__pseudo_scatter64_i32"; "__pseudo_scatter64_i32";
value = BitCastInst(value, LLVMTypes::Int32VectorType, "value2int");
} }
else if (type == LLVMTypes::Int16VectorType) else if (type == LLVMTypes::Int16VectorType) {
funcName = g->target.is32Bit ? "__pseudo_scatter32_i16" : funcName = g->target.is32Bit ? "__pseudo_scatter32_i16" :
"__pseudo_scatter64_i16"; "__pseudo_scatter64_i16";
else if (type == LLVMTypes::Int8VectorType) }
else if (type == LLVMTypes::Int8VectorType) {
funcName = g->target.is32Bit ? "__pseudo_scatter32_i8" : funcName = g->target.is32Bit ? "__pseudo_scatter32_i8" :
"__pseudo_scatter64_i8"; "__pseudo_scatter64_i8";
}
llvm::Function *scatterFunc = m->module->getFunction(funcName); llvm::Function *scatterFunc = m->module->getFunction(funcName);
AssertPos(currentPos, scatterFunc != NULL); AssertPos(currentPos, scatterFunc != NULL);

View File

@@ -1131,6 +1131,16 @@ static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
return ret; return ret;
} }
// Masked load of 16 floats: lanes whose mask bit is set are read from p;
// disabled lanes are left uninitialized (same contract as __masked_load_i32).
static FORCEINLINE __vec16_f __masked_load_float(void *p,
                                                 __vec16_i1 mask) {
    __vec16_f result;
    const float *src = (const float *)p;
    for (int lane = 0; lane < 16; ++lane) {
        if (mask.v & (1 << lane))
            result.v[lane] = src[lane];
    }
    return result;
}
static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
__vec16_i1 mask) { __vec16_i1 mask) {
__vec16_i64 ret; __vec16_i64 ret;
@@ -1141,6 +1151,16 @@ static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
return ret; return ret;
} }
// Masked load of 16 doubles: lanes whose mask bit is set are read from p;
// disabled lanes are left uninitialized (same contract as __masked_load_i64).
static FORCEINLINE __vec16_d __masked_load_double(void *p,
                                                  __vec16_i1 mask) {
    __vec16_d result;
    const double *src = (const double *)p;
    for (int lane = 0; lane < 16; ++lane) {
        if (mask.v & (1 << lane))
            result.v[lane] = src[lane];
    }
    return result;
}
static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val,
__vec16_i1 mask) { __vec16_i1 mask) {
int8_t *ptr = (int8_t *)p; int8_t *ptr = (int8_t *)p;
@@ -1165,6 +1185,14 @@ static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val,
ptr[i] = val.v[i]; ptr[i] = val.v[i];
} }
// Masked store of 16 floats: only lanes whose mask bit is set are written
// to p; other memory locations are untouched.
static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
                                             __vec16_i1 mask) {
    float *dst = (float *)p;
    for (int lane = 0; lane < 16; ++lane) {
        if (mask.v & (1 << lane))
            dst[lane] = val.v[lane];
    }
}
static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) { __vec16_i1 mask) {
int64_t *ptr = (int64_t *)p; int64_t *ptr = (int64_t *)p;
@@ -1173,6 +1201,12 @@ static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
ptr[i] = val.v[i]; ptr[i] = val.v[i];
} }
static FORCEINLINE void __masked_store_double(void *p, __vec16_d val,
__vec16_i1 mask) {
double *ptr = (double *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
} }
static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val,
@@ -1190,11 +1224,19 @@ static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val,
__masked_store_i32(p, val, mask); __masked_store_i32(p, val, mask);
} }
// Blend-style masked store of 16 floats.  On this target there is no
// separate blend implementation, so it simply forwards to the regular
// per-lane masked store.
static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
__vec16_i1 mask) {
__masked_store_float(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val, static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) { __vec16_i1 mask) {
__masked_store_i64(p, val, mask); __masked_store_i64(p, val, mask);
} }
static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
__vec16_i1 mask) {
__masked_store_double(p, val, mask);
} }
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
@@ -1224,8 +1266,12 @@ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
@@ -1244,8 +1290,12 @@ GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __gather32_i16)
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16) GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32) GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32) GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
GATHER_GENERAL(__vec16_f, float, __vec16_i32, __gather32_float)
GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64) GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64) GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
GATHER_GENERAL(__vec16_d, double, __vec16_i32, __gather32_double)
GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
// scatter // scatter
@@ -1269,8 +1319,12 @@ SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double)
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
@@ -1288,8 +1342,12 @@ SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16) SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32) SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32) SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32)
SCATTER_GENERAL(__vec16_f, float, __vec16_i32, __scatter32_float)
SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64) SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64) SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64)
SCATTER_GENERAL(__vec16_d, double, __vec16_i32, __scatter32_double)
SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double)
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// packed load/store // packed load/store

View File

@@ -2479,6 +2479,12 @@ static FORCEINLINE __vec4_i32 __masked_load_i32(void *p, __vec4_i1 mask) {
return r; return r;
} }
// Masked load of 4 floats: reuses the i32 masked load bitwise and
// reinterprets the loaded lanes as float via the __vec4_f(__vec4_i32)
// conversion constructor.
static FORCEINLINE __vec4_f __masked_load_float(void *p, __vec4_i1 mask) {
    return __vec4_f(__masked_load_i32(p, mask));
}
static FORCEINLINE __vec4_i64 __masked_load_i64(void *p, __vec4_i1 mask) { static FORCEINLINE __vec4_i64 __masked_load_i64(void *p, __vec4_i1 mask) {
uint64_t r[4]; uint64_t r[4];
uint64_t *ptr = (uint64_t *)p; uint64_t *ptr = (uint64_t *)p;
@@ -2501,6 +2507,11 @@ static FORCEINLINE __vec4_i64 __masked_load_i64(void *p, __vec4_i1 mask) {
return __vec4_i64(r[0], r[1], r[2], r[3]); return __vec4_i64(r[0], r[1], r[2], r[3]);
} }
// Masked load of 4 doubles: reuses the i64 masked load bitwise and
// reinterprets the loaded lanes as double via the __vec4_d(__vec4_i64)
// conversion constructor.
static FORCEINLINE __vec4_d __masked_load_double(void *p, __vec4_i1 mask) {
    return __vec4_d(__masked_load_i64(p, mask));
}
static FORCEINLINE void __masked_store_i8(void *p, __vec4_i8 val, static FORCEINLINE void __masked_store_i8(void *p, __vec4_i8 val,
__vec4_i1 mask) { __vec4_i1 mask) {
int8_t *ptr = (int8_t *)p; int8_t *ptr = (int8_t *)p;
@@ -2563,6 +2574,11 @@ static FORCEINLINE void __masked_store_i32(void *p, __vec4_i32 val,
ptr[3] = _mm_extract_epi32(val.v, 3); ptr[3] = _mm_extract_epi32(val.v, 3);
} }
// Masked store of 4 floats: reinterpret the float lanes as i32 bits and
// forward to the i32 masked store (a bitwise store, so values round-trip
// exactly).
static FORCEINLINE void __masked_store_float(void *p, __vec4_f val,
                                             __vec4_i1 mask) {
    __vec4_i32 bits = __vec4_i32(val);
    __masked_store_i32(p, bits, mask);
}
static FORCEINLINE void __masked_store_i64(void *p, __vec4_i64 val, static FORCEINLINE void __masked_store_i64(void *p, __vec4_i64 val,
__vec4_i1 mask) { __vec4_i1 mask) {
int64_t *ptr = (int64_t *)p; int64_t *ptr = (int64_t *)p;
@@ -2583,6 +2599,9 @@ static FORCEINLINE void __masked_store_i64(void *p, __vec4_i64 val,
ptr[3] = _mm_extract_epi64(val.v[1], 1); ptr[3] = _mm_extract_epi64(val.v[1], 1);
} }
static FORCEINLINE void __masked_store_double(void *p, __vec4_d val,
__vec4_i1 mask) {
__masked_store_i64(p, __vec4_i64(val), mask);
} }
static FORCEINLINE void __masked_store_blend_i8(void *p, __vec4_i8 val, static FORCEINLINE void __masked_store_blend_i8(void *p, __vec4_i8 val,
@@ -2600,6 +2619,10 @@ static FORCEINLINE void __masked_store_blend_i32(void *p, __vec4_i32 val,
// FIXME: do a load, blendvps, store here... // FIXME: do a load, blendvps, store here...
__masked_store_i32(p, val, mask); __masked_store_i32(p, val, mask);
} }
static FORCEINLINE void __masked_store_blend_float(void *p, __vec4_f val,
__vec4_i1 mask) {
__masked_store_i32(p, __vec4_i32(val), mask);
} }
static FORCEINLINE void __masked_store_blend_i64(void *p, __vec4_i64 val, static FORCEINLINE void __masked_store_blend_i64(void *p, __vec4_i64 val,
@@ -2608,6 +2631,12 @@ static FORCEINLINE void __masked_store_blend_i64(void *p, __vec4_i64 val,
__masked_store_i64(p, val, mask); __masked_store_i64(p, val, mask);
} }
// Blend-style masked store of 4 doubles: reinterprets the lanes as i64
// bits and forwards to the i64 masked store (no dedicated blend path on
// this target).
static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val,
__vec4_i1 mask) {
__masked_store_i64(p, __vec4_i64(val), mask);
}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// gather/scatter // gather/scatter
// offsets * offsetScale is in bytes (for all of these) // offsets * offsetScale is in bytes (for all of these)
@@ -2827,6 +2856,20 @@ __gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
delta, mask); delta, mask);
} }
// Masked gather of 4 floats: each enabled lane loads a float from
// p + offsets[i]*scale + delta[i] (32-bit offsets variant).
// Fix: the second offset parameter was declared `constOffset` while the
// body passed the undeclared name `delta` (compile error); rename the
// parameter to `delta` to match the sibling __gather_base_offsets32_*
// functions.
static FORCEINLINE __vec4_f
__gather_base_offsets32_float(unsigned char *p, __vec4_i32 offsets, uint32_t scale,
                              __vec4_i32 delta, __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale,
                                delta, mask);
}
// Masked gather of 4 floats: each enabled lane loads a float from
// p + offsets[i]*scale + delta[i] (64-bit offsets variant).
static FORCEINLINE __vec4_f
__gather_base_offsets64_float(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale,
delta, mask);
}
static FORCEINLINE __vec4_i64 static FORCEINLINE __vec4_i64
__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, __gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) { uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
@@ -2841,6 +2884,20 @@ __gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
delta, mask); delta, mask);
} }
// Masked gather of 4 doubles: each enabled lane loads a double from
// p + offsets[i]*scale + delta[i] (32-bit offsets variant).
static FORCEINLINE __vec4_d
__gather_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale,
delta, mask);
}
// Masked gather of 4 doubles: each enabled lane loads a double from
// p + offsets[i]*scale + delta[i] (64-bit offsets variant).
static FORCEINLINE __vec4_d
__gather_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale,
delta, mask);
}
template<typename RetVec, typename RetScalar> template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec lGather32(RetVec, RetScalar, __vec4_i32 ptrs, static FORCEINLINE RetVec lGather32(RetVec, RetScalar, __vec4_i32 ptrs,
__vec4_i1 mask) { __vec4_i1 mask) {
@@ -2975,6 +3032,14 @@ static FORCEINLINE __vec4_i32 __gather64_i32(__vec4_i64 ptrs, __vec4_i1 mask) {
return r; return r;
} }
// Masked gather of 4 floats through 32-bit pointers: bitwise reuse of the
// i32 gather, with the result reinterpreted as float lanes.
// Fix: the return expression was missing its closing parenthesis
// (compile error).
static FORCEINLINE __vec4_f __gather32_float(__vec4_i32 ptrs, __vec4_i1 mask) {
    return __vec4_f(__gather32_i32(ptrs, mask));
}
// Masked gather of 4 floats through 64-bit pointers: bitwise reuse of the
// i32 gather, with the result reinterpreted as float lanes.
// Fixes: (1) the pointer-vector parameter was declared __vec4_i32, but a
// 64-bit-pointer gather must take __vec4_i64 (it forwards to
// __gather64_i32 and matches every other __gather64_* variant);
// (2) the return expression was missing its closing parenthesis.
static FORCEINLINE __vec4_f __gather64_float(__vec4_i64 ptrs, __vec4_i1 mask) {
    return __vec4_f(__gather64_i32(ptrs, mask));
}
static FORCEINLINE __vec4_i64 __gather32_i64(__vec4_i32 ptrs, __vec4_i1 mask) { static FORCEINLINE __vec4_i64 __gather32_i64(__vec4_i32 ptrs, __vec4_i1 mask) {
return lGather32(__vec4_i64(), uint64_t(), ptrs, mask); return lGather32(__vec4_i64(), uint64_t(), ptrs, mask);
} }
@@ -2983,6 +3048,14 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) {
return lGather64(__vec4_i64(), uint64_t(), ptrs, mask); return lGather64(__vec4_i64(), uint64_t(), ptrs, mask);
} }
// Masked gather of 4 doubles through 32-bit pointers.
static FORCEINLINE __vec4_d __gather32_double(__vec4_i32 ptrs, __vec4_i1 mask) {
return lGather32(__vec4_d(), double(), ptrs, mask);
}
// Masked gather of 4 doubles through 64-bit pointers.
static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {
return lGather64(__vec4_d(), double(), ptrs, mask);
}
// scatter // scatter
#define SCATTER32_64(SUFFIX, TYPE, EXTRACT) \ #define SCATTER32_64(SUFFIX, TYPE, EXTRACT) \
@@ -3050,9 +3123,10 @@ __scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
} }
SCATTER32_64(i8, int8_t, _mm_extract_epi8) SCATTER32_64(i8, int8_t, _mm_extract_epi8)
SCATTER32_64(i16, int16_t, _mm_extract_epi16) SCATTER32_64(i16, int16_t, _mm_extract_epi16)
SCATTER32_64(i32, int32_t, _mm_extract_epi32) SCATTER32_64(i32, int32_t, _mm_extract_epi32)
SCATTER32_64(f, float, _mm_extract_epi32)
static FORCEINLINE void static FORCEINLINE void
@@ -3129,6 +3203,21 @@ __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
} }
} }
// Masked scatter of 4 doubles to p + offsets[i]*scale + constOffset[i]
// (32-bit offsets variant).  Doubles are stored bitwise through the i64
// scatter.
static FORCEINLINE void
__scatter_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
                                uint32_t scale, __vec4_i32 constOffset, __vec4_d val,
                                __vec4_i1 mask) {
    // Convert explicitly to the i64 vector type expected by the callee,
    // matching the explicit __vec4_i64(val) conversion used by
    // __scatter32_double / __scatter64_double.
    __scatter_base_offsets32_i64(p, offsets, scale, constOffset,
                                 __vec4_i64(val), mask);
}
// Masked scatter of 4 doubles to p + offsets[i]*scale + constOffset[i]
// (64-bit offsets variant).  Doubles are stored bitwise through the i64
// scatter.
static FORCEINLINE void
__scatter_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
                                uint32_t scale, __vec4_i64 constOffset, __vec4_d val,
                                __vec4_i1 mask) {
    // Convert explicitly to the i64 vector type expected by the callee,
    // matching the explicit __vec4_i64(val) conversion used by
    // __scatter32_double / __scatter64_double.
    __scatter_base_offsets64_i64(p, offsets, scale, constOffset,
                                 __vec4_i64(val), mask);
}
static FORCEINLINE void __scatter32_i8(__vec4_i32 ptrs, __vec4_i8 val, static FORCEINLINE void __scatter32_i8(__vec4_i32 ptrs, __vec4_i8 val,
__vec4_i1 mask) { __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0); uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -3291,6 +3380,16 @@ static FORCEINLINE void __scatter64_i32(__vec4_i64 ptrs, __vec4_i32 val,
} }
} }
// Masked scatter of 4 floats through 32-bit pointers: float lanes are
// reinterpreted as i32 bits and stored via the i32 scatter.
static FORCEINLINE void __scatter32_float(__vec4_i32 ptrs, __vec4_f val,
__vec4_i1 mask) {
__scatter32_i32(ptrs, __vec4_i32(val), mask);
}
// Masked scatter of 4 floats through 64-bit pointers: float lanes are
// reinterpreted as i32 bits and stored via the i32 scatter.
static FORCEINLINE void __scatter64_float(__vec4_i64 ptrs, __vec4_f val,
__vec4_i1 mask) {
__scatter64_i32(ptrs, __vec4_i32(val), mask);
}
static FORCEINLINE void __scatter32_i64(__vec4_i32 ptrs, __vec4_i64 val, static FORCEINLINE void __scatter32_i64(__vec4_i32 ptrs, __vec4_i64 val,
__vec4_i1 mask) { __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0); uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -3345,6 +3444,16 @@ static FORCEINLINE void __scatter64_i64(__vec4_i64 ptrs, __vec4_i64 val,
} }
} }
// Masked scatter of 4 doubles through 32-bit pointers: double lanes are
// reinterpreted as i64 bits and stored via the i64 scatter.
static FORCEINLINE void __scatter32_double(__vec4_i32 ptrs, __vec4_d val,
__vec4_i1 mask) {
__scatter32_i64(ptrs, __vec4_i64(val), mask);
}
// Masked scatter of 4 doubles through 64-bit pointers: double lanes are
// reinterpreted as i64 bits and stored via the i64 scatter.
static FORCEINLINE void __scatter64_double(__vec4_i64 ptrs, __vec4_d val,
__vec4_i1 mask) {
__scatter64_i64(ptrs, __vec4_i64(val), mask);
}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// packed load/store // packed load/store

96
opt.cpp
View File

@@ -1695,8 +1695,12 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
"__pseudo_gather_base_offsets32_i16", true), "__pseudo_gather_base_offsets32_i16", true),
GSInfo("__pseudo_gather32_i32", "__pseudo_gather_base_offsets32_i32", GSInfo("__pseudo_gather32_i32", "__pseudo_gather_base_offsets32_i32",
"__pseudo_gather_base_offsets32_i32", true), "__pseudo_gather_base_offsets32_i32", true),
GSInfo("__pseudo_gather32_float", "__pseudo_gather_base_offsets32_float",
"__pseudo_gather_base_offsets32_float", true),
GSInfo("__pseudo_gather32_i64", "__pseudo_gather_base_offsets32_i64", GSInfo("__pseudo_gather32_i64", "__pseudo_gather_base_offsets32_i64",
"__pseudo_gather_base_offsets32_i64", true), "__pseudo_gather_base_offsets32_i64", true),
GSInfo("__pseudo_gather32_double", "__pseudo_gather_base_offsets32_double",
"__pseudo_gather_base_offsets32_double", true),
GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_base_offsets32_i8", GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_base_offsets32_i8",
"__pseudo_scatter_base_offsets32_i8", false), "__pseudo_scatter_base_offsets32_i8", false),
@@ -1704,8 +1708,12 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
"__pseudo_scatter_base_offsets32_i16", false), "__pseudo_scatter_base_offsets32_i16", false),
GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_base_offsets32_i32", GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_base_offsets32_i32",
"__pseudo_scatter_base_offsets32_i32", false), "__pseudo_scatter_base_offsets32_i32", false),
GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_base_offsets32_float",
"__pseudo_scatter_base_offsets32_float", false),
GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_base_offsets32_i64", GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_base_offsets32_i64",
"__pseudo_scatter_base_offsets32_i64", false), "__pseudo_scatter_base_offsets32_i64", false),
GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_base_offsets32_double",
"__pseudo_scatter_base_offsets32_double", false),
GSInfo("__pseudo_gather64_i8", "__pseudo_gather_base_offsets64_i8", GSInfo("__pseudo_gather64_i8", "__pseudo_gather_base_offsets64_i8",
"__pseudo_gather_base_offsets32_i8", true), "__pseudo_gather_base_offsets32_i8", true),
@@ -1713,8 +1721,12 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
"__pseudo_gather_base_offsets32_i16", true), "__pseudo_gather_base_offsets32_i16", true),
GSInfo("__pseudo_gather64_i32", "__pseudo_gather_base_offsets64_i32", GSInfo("__pseudo_gather64_i32", "__pseudo_gather_base_offsets64_i32",
"__pseudo_gather_base_offsets32_i32", true), "__pseudo_gather_base_offsets32_i32", true),
GSInfo("__pseudo_gather64_float", "__pseudo_gather_base_offsets64_float",
"__pseudo_gather_base_offsets32_float", true),
GSInfo("__pseudo_gather64_i64", "__pseudo_gather_base_offsets64_i64", GSInfo("__pseudo_gather64_i64", "__pseudo_gather_base_offsets64_i64",
"__pseudo_gather_base_offsets32_i64", true), "__pseudo_gather_base_offsets32_i64", true),
GSInfo("__pseudo_gather64_double", "__pseudo_gather_base_offsets64_double",
"__pseudo_gather_base_offsets32_double", true),
GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_base_offsets64_i8", GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_base_offsets64_i8",
"__pseudo_scatter_base_offsets32_i8", false), "__pseudo_scatter_base_offsets32_i8", false),
@@ -1722,8 +1734,12 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
"__pseudo_scatter_base_offsets32_i16", false), "__pseudo_scatter_base_offsets32_i16", false),
GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_base_offsets64_i32", GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_base_offsets64_i32",
"__pseudo_scatter_base_offsets32_i32", false), "__pseudo_scatter_base_offsets32_i32", false),
GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_base_offsets64_float",
"__pseudo_scatter_base_offsets32_float", false),
GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_base_offsets64_i64", GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_base_offsets64_i64",
"__pseudo_scatter_base_offsets32_i64", false), "__pseudo_scatter_base_offsets32_i64", false),
GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_base_offsets64_double",
"__pseudo_scatter_base_offsets32_double", false),
}; };
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
for (int i = 0; i < numGSFuncs; ++i) for (int i = 0; i < numGSFuncs; ++i)
@@ -1883,15 +1899,21 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
MSInfo("__pseudo_masked_store_i8", 1), MSInfo("__pseudo_masked_store_i8", 1),
MSInfo("__pseudo_masked_store_i16", 2), MSInfo("__pseudo_masked_store_i16", 2),
MSInfo("__pseudo_masked_store_i32", 4), MSInfo("__pseudo_masked_store_i32", 4),
MSInfo("__pseudo_masked_store_float", 4),
MSInfo("__pseudo_masked_store_i64", 8), MSInfo("__pseudo_masked_store_i64", 8),
MSInfo("__pseudo_masked_store_double", 8),
MSInfo("__masked_store_blend_i8", 1), MSInfo("__masked_store_blend_i8", 1),
MSInfo("__masked_store_blend_i16", 2), MSInfo("__masked_store_blend_i16", 2),
MSInfo("__masked_store_blend_i32", 4), MSInfo("__masked_store_blend_i32", 4),
MSInfo("__masked_store_blend_float", 4),
MSInfo("__masked_store_blend_i64", 8), MSInfo("__masked_store_blend_i64", 8),
MSInfo("__masked_store_blend_double", 8),
MSInfo("__masked_store_i8", 1), MSInfo("__masked_store_i8", 1),
MSInfo("__masked_store_i16", 2), MSInfo("__masked_store_i16", 2),
MSInfo("__masked_store_i32", 4), MSInfo("__masked_store_i32", 4),
MSInfo("__masked_store_float", 4),
MSInfo("__masked_store_i64", 8), MSInfo("__masked_store_i64", 8),
MSInfo("__masked_store_double", 8)
}; };
bool modifiedAny = false; bool modifiedAny = false;
@@ -1998,7 +2020,9 @@ MaskedLoadOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
MLInfo("__masked_load_i8", 1), MLInfo("__masked_load_i8", 1),
MLInfo("__masked_load_i16", 2), MLInfo("__masked_load_i16", 2),
MLInfo("__masked_load_i32", 4), MLInfo("__masked_load_i32", 4),
MLInfo("__masked_load_float", 4),
MLInfo("__masked_load_i64", 8), MLInfo("__masked_load_i64", 8),
MLInfo("__masked_load_double", 8)
}; };
bool modifiedAny = false; bool modifiedAny = false;
@@ -2150,8 +2174,12 @@ PseudoMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
"__masked_store_i16"), "__masked_store_i16"),
LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32", LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32",
"__masked_store_i32"), "__masked_store_i32"),
LMSInfo("__pseudo_masked_store_float", "__masked_store_blend_float",
"__masked_store_float"),
LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64", LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64",
"__masked_store_i64"), "__masked_store_i64"),
LMSInfo("__pseudo_masked_store_double", "__masked_store_blend_double",
"__masked_store_double")
}; };
bool modifiedAny = false; bool modifiedAny = false;
@@ -2290,16 +2318,24 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
"__masked_load_i16", 2), "__masked_load_i16", 2),
GatherImpInfo("__pseudo_gather_base_offsets32_i32", "__load_and_broadcast_i32", GatherImpInfo("__pseudo_gather_base_offsets32_i32", "__load_and_broadcast_i32",
"__masked_load_i32", 4), "__masked_load_i32", 4),
GatherImpInfo("__pseudo_gather_base_offsets32_float", "__load_and_broadcast_float",
"__masked_load_float", 4),
GatherImpInfo("__pseudo_gather_base_offsets32_i64", "__load_and_broadcast_i64", GatherImpInfo("__pseudo_gather_base_offsets32_i64", "__load_and_broadcast_i64",
"__masked_load_i64", 8), "__masked_load_i64", 8),
GatherImpInfo("__pseudo_gather_base_offsets32_double", "__load_and_broadcast_double",
"__masked_load_double", 8),
GatherImpInfo("__pseudo_gather_base_offsets64_i8", "__load_and_broadcast_i8", GatherImpInfo("__pseudo_gather_base_offsets64_i8", "__load_and_broadcast_i8",
"__masked_load_i8", 1), "__masked_load_i8", 1),
GatherImpInfo("__pseudo_gather_base_offsets64_i16", "__load_and_broadcast_i16", GatherImpInfo("__pseudo_gather_base_offsets64_i16", "__load_and_broadcast_i16",
"__masked_load_i16", 2), "__masked_load_i16", 2),
GatherImpInfo("__pseudo_gather_base_offsets64_i32", "__load_and_broadcast_i32", GatherImpInfo("__pseudo_gather_base_offsets64_i32", "__load_and_broadcast_i32",
"__masked_load_i32", 4), "__masked_load_i32", 4),
GatherImpInfo("__pseudo_gather_base_offsets64_float", "__load_and_broadcast_float",
"__masked_load_float", 4),
GatherImpInfo("__pseudo_gather_base_offsets64_i64", "__load_and_broadcast_i64", GatherImpInfo("__pseudo_gather_base_offsets64_i64", "__load_and_broadcast_i64",
"__masked_load_i64", 8), "__masked_load_i64", 8),
GatherImpInfo("__pseudo_gather_base_offsets64_double", "__load_and_broadcast_double",
"__masked_load_double", 8)
}; };
ScatterImpInfo sInfo[] = { ScatterImpInfo sInfo[] = {
ScatterImpInfo("__pseudo_scatter_base_offsets32_i8", "__pseudo_masked_store_i8", ScatterImpInfo("__pseudo_scatter_base_offsets32_i8", "__pseudo_masked_store_i8",
@@ -2308,16 +2344,24 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
LLVMTypes::Int16VectorPointerType, 2), LLVMTypes::Int16VectorPointerType, 2),
ScatterImpInfo("__pseudo_scatter_base_offsets32_i32", "__pseudo_masked_store_i32", ScatterImpInfo("__pseudo_scatter_base_offsets32_i32", "__pseudo_masked_store_i32",
LLVMTypes::Int32VectorPointerType, 4), LLVMTypes::Int32VectorPointerType, 4),
ScatterImpInfo("__pseudo_scatter_base_offsets32_float", "__pseudo_masked_store_float",
LLVMTypes::FloatVectorPointerType, 4),
ScatterImpInfo("__pseudo_scatter_base_offsets32_i64", "__pseudo_masked_store_i64", ScatterImpInfo("__pseudo_scatter_base_offsets32_i64", "__pseudo_masked_store_i64",
LLVMTypes::Int64VectorPointerType, 8), LLVMTypes::Int64VectorPointerType, 8),
ScatterImpInfo("__pseudo_scatter_base_offsets32_double", "__pseudo_masked_store_double",
LLVMTypes::DoubleVectorPointerType, 8),
ScatterImpInfo("__pseudo_scatter_base_offsets64_i8", "__pseudo_masked_store_i8", ScatterImpInfo("__pseudo_scatter_base_offsets64_i8", "__pseudo_masked_store_i8",
LLVMTypes::Int8VectorPointerType, 1), LLVMTypes::Int8VectorPointerType, 1),
ScatterImpInfo("__pseudo_scatter_base_offsets64_i16", "__pseudo_masked_store_i16", ScatterImpInfo("__pseudo_scatter_base_offsets64_i16", "__pseudo_masked_store_i16",
LLVMTypes::Int16VectorPointerType, 2), LLVMTypes::Int16VectorPointerType, 2),
ScatterImpInfo("__pseudo_scatter_base_offsets64_i32", "__pseudo_masked_store_i32", ScatterImpInfo("__pseudo_scatter_base_offsets64_i32", "__pseudo_masked_store_i32",
LLVMTypes::Int32VectorPointerType, 4), LLVMTypes::Int32VectorPointerType, 4),
ScatterImpInfo("__pseudo_scatter_base_offsets64_float", "__pseudo_masked_store_float",
LLVMTypes::FloatVectorPointerType, 4),
ScatterImpInfo("__pseudo_scatter_base_offsets64_i64", "__pseudo_masked_store_i64", ScatterImpInfo("__pseudo_scatter_base_offsets64_i64", "__pseudo_masked_store_i64",
LLVMTypes::Int64VectorPointerType, 8), LLVMTypes::Int64VectorPointerType, 8),
ScatterImpInfo("__pseudo_scatter_base_offsets64_double", "__pseudo_masked_store_double",
LLVMTypes::DoubleVectorPointerType, 8)
}; };
bool modifiedAny = false; bool modifiedAny = false;
@@ -3297,9 +3341,11 @@ lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore); llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore);
int elementSize = 0; int elementSize = 0;
if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType) if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType ||
coalesceGroup[0]->getType() == LLVMTypes::FloatVectorType)
elementSize = 4; elementSize = 4;
else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType) else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType ||
coalesceGroup[0]->getType() == LLVMTypes::DoubleVectorType)
elementSize = 8; elementSize = 8;
else else
FATAL("Unexpected gather type in lCoalesceGathers"); FATAL("Unexpected gather type in lCoalesceGathers");
@@ -3336,14 +3382,20 @@ lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
// that gives the value from the coalescing process. // that gives the value from the coalescing process.
Assert(results.size() == coalesceGroup.size()); Assert(results.size() == coalesceGroup.size());
for (int i = 0; i < (int)results.size(); ++i) { for (int i = 0; i < (int)results.size(); ++i) {
llvm::Instruction *ir = llvm::dyn_cast<llvm::Instruction>(results[i]);
Assert(ir != NULL);
llvm::Type *origType = coalesceGroup[i]->getType();
if (origType != ir->getType())
ir = new llvm::BitCastInst(ir, origType, ir->getName(),
coalesceGroup[i]);
// Previously, all of the instructions to compute the final result // Previously, all of the instructions to compute the final result
// were into the basic block here; here we remove the very last one // were into the basic block here; here we remove the very last one
// of them (that holds the final result) from the basic block. // of them (that holds the final result) from the basic block.
// This way, the following ReplaceInstWithInst() call will operate // This way, the following ReplaceInstWithInst() call will operate
// successfully. (It expects that the second argument not be in any // successfully. (It expects that the second argument not be in any
// basic block.) // basic block.)
llvm::Instruction *ir = llvm::dyn_cast<llvm::Instruction>(results[i]);
Assert(ir != NULL);
ir->removeFromParent(); ir->removeFromParent();
llvm::ReplaceInstWithInst(coalesceGroup[i], ir); llvm::ReplaceInstWithInst(coalesceGroup[i], ir);
@@ -3391,7 +3443,9 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Function *gatherFuncs[] = { llvm::Function *gatherFuncs[] = {
m->module->getFunction("__pseudo_gather_base_offsets32_i32"), m->module->getFunction("__pseudo_gather_base_offsets32_i32"),
m->module->getFunction("__pseudo_gather_base_offsets32_float"),
m->module->getFunction("__pseudo_gather_base_offsets64_i32"), m->module->getFunction("__pseudo_gather_base_offsets64_i32"),
m->module->getFunction("__pseudo_gather_base_offsets64_float"),
}; };
int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]); int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]);
@@ -3401,7 +3455,7 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e;
++iter) { ++iter) {
// Iterate over all of the instructions and look for calls to // Iterate over all of the instructions and look for calls to
// __pseudo_*_base_offsets*_32 calls. // __pseudo_gather_base_offsets{32,64}_{i32,float} calls.
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter); llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
if (callInst == NULL) if (callInst == NULL)
continue; continue;
@@ -3576,42 +3630,58 @@ PseudoGSToGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true), LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true),
LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true), LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true),
LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true), LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true),
LowerGSInfo("__pseudo_gather_base_offsets32_float", "__gather_base_offsets32_float", true),
LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true), LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true),
LowerGSInfo("__pseudo_gather_base_offsets32_double", "__gather_base_offsets32_double", true),
LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true), LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true),
LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true), LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true),
LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true), LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true),
LowerGSInfo("__pseudo_gather_base_offsets64_float", "__gather_base_offsets64_float", true),
LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true), LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true),
LowerGSInfo("__pseudo_gather_base_offsets64_double", "__gather_base_offsets64_double", true),
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true),
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true),
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true), LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true),
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true),
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true), LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true),
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true),
LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true), LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true),
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true), LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true),
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true), LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true),
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true),
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true),
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true),
LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false), LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false), LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false), LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false),
LowerGSInfo("__pseudo_scatter_base_offsets32_float", "__scatter_base_offsets32_float", false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false), LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false),
LowerGSInfo("__pseudo_scatter_base_offsets32_double", "__scatter_base_offsets32_double", false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false), LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false), LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false), LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false),
LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false), LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false),
LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false),
LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false),
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false),
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false), LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false),
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false),
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false), LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false),
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false),
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false), LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false),
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false), LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false),
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false), LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false),
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false),
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false), LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false),
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false),
}; };
bool modifiedAny = false; bool modifiedAny = false;
@@ -3808,36 +3878,52 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
"__fast_masked_vload", "__fast_masked_vload",
"__gather_base_offsets32_i8", "__gather_base_offsets32_i16", "__gather_base_offsets32_i8", "__gather_base_offsets32_i16",
"__gather_base_offsets32_i32", "__gather_base_offsets32_i64", "__gather_base_offsets32_i32", "__gather_base_offsets32_i64",
"__gather_base_offsets32_float", "__gather_base_offsets32_double",
"__gather_base_offsets64_i8", "__gather_base_offsets64_i16", "__gather_base_offsets64_i8", "__gather_base_offsets64_i16",
"__gather_base_offsets64_i32", "__gather_base_offsets64_i64", "__gather_base_offsets64_i32", "__gather_base_offsets64_i64",
"__gather_base_offsets64_float", "__gather_base_offsets64_double",
"__gather32_i8", "__gather32_i16", "__gather32_i8", "__gather32_i16",
"__gather32_i32", "__gather32_i64", "__gather32_i32", "__gather32_i64",
"__gather32_float", "__gather32_double",
"__gather64_i8", "__gather64_i16", "__gather64_i8", "__gather64_i16",
"__gather64_i32", "__gather64_i64", "__gather64_i32", "__gather64_i64",
"__gather64_float", "__gather64_double",
"__gather_elt32_i8", "__gather_elt32_i16", "__gather_elt32_i8", "__gather_elt32_i16",
"__gather_elt32_i32", "__gather_elt32_i64", "__gather_elt32_i32", "__gather_elt32_i64",
"__gather_elt32_float", "__gather_elt32_double",
"__gather_elt64_i8", "__gather_elt64_i16", "__gather_elt64_i8", "__gather_elt64_i16",
"__gather_elt64_i32", "__gather_elt64_i64", "__gather_elt64_i32", "__gather_elt64_i64",
"__gather_elt64_float", "__gather_elt64_double",
"__load_and_broadcast_i8", "__load_and_broadcast_i16", "__load_and_broadcast_i8", "__load_and_broadcast_i16",
"__load_and_broadcast_i32", "__load_and_broadcast_i64", "__load_and_broadcast_i32", "__load_and_broadcast_i64",
"__load_and_broadcast_float", "__load_and_broadcast_double",
"__masked_load_i8", "__masked_load_i16", "__masked_load_i8", "__masked_load_i16",
"__masked_load_i32", "__masked_load_i64", "__masked_load_i32", "__masked_load_i64",
"__masked_load_float", "__masked_load_double",
"__masked_store_i8", "__masked_store_i16", "__masked_store_i8", "__masked_store_i16",
"__masked_store_i32", "__masked_store_i64", "__masked_store_i32", "__masked_store_i64",
"__masked_store_float", "__masked_store_double",
"__masked_store_blend_i8", "__masked_store_blend_i16", "__masked_store_blend_i8", "__masked_store_blend_i16",
"__masked_store_blend_i32", "__masked_store_blend_i64", "__masked_store_blend_i32", "__masked_store_blend_i64",
"__masked_store_blend_float", "__masked_store_blend_double",
"__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16", "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16",
"__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64", "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64",
"__scatter_base_offsets32_float", "__scatter_base_offsets32_double",
"__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16", "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16",
"__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64", "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64",
"__scatter_base_offsets64_float", "__scatter_base_offsets64_double",
"__scatter_elt32_i8", "__scatter_elt32_i16", "__scatter_elt32_i8", "__scatter_elt32_i16",
"__scatter_elt32_i32", "__scatter_elt32_i64", "__scatter_elt32_i32", "__scatter_elt32_i64",
"__scatter_elt32_float", "__scatter_elt32_double",
"__scatter_elt64_i8", "__scatter_elt64_i16", "__scatter_elt64_i8", "__scatter_elt64_i16",
"__scatter_elt64_i32", "__scatter_elt64_i64", "__scatter_elt64_i32", "__scatter_elt64_i64",
"__scatter_elt64_float", "__scatter_elt64_double",
"__scatter32_i8", "__scatter32_i16", "__scatter32_i8", "__scatter32_i16",
"__scatter32_i32", "__scatter32_i64", "__scatter32_i32", "__scatter32_i64",
"__scatter32_float", "__scatter32_double",
"__scatter64_i8", "__scatter64_i16", "__scatter64_i8", "__scatter64_i16",
"__scatter64_i32", "__scatter64_i64", "__scatter64_i32", "__scatter64_i64",
"__scatter64_float", "__scatter64_double",
"__keep_funcs_live", "__keep_funcs_live",
}; };