Add routines to standard library to do efficient AOS/SOA conversions.

Currently, we just support 3 and 4-wide variants (i.e. xyzxyz.. and xyzwxyzw..),
for int32 and float types.
This commit is contained in:
Matt Pharr
2011-10-10 10:56:06 -07:00
parent f5391747b9
commit 3cb0115dce
11 changed files with 1041 additions and 2 deletions

View File

@@ -1052,6 +1052,681 @@ define internal <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone al
ret <$1 x i32> %0
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; AOS/SOA conversion primitives
;; take 4 4-wide vectors laid out like <r0 g0 b0 a0> <r1 g1 b1 a1> ...
;; and reorder them to <r0 r1 r2 r3> <g0 g1 g2 g3> ...
define internal void
@__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
<4 x float> %v3, <4 x float> * noalias %out0,
<4 x float> * noalias %out1, <4 x float> * noalias %out2,
<4 x float> * noalias %out3) nounwind alwaysinline {
%t0 = shufflevector <4 x float> %v2, <4 x float> %v3, ; r2 r3 g2 g3
<4 x i32> <i32 0, i32 4, i32 1, i32 5>
%t1 = shufflevector <4 x float> %v2, <4 x float> %v3, ; b2 b3 a2 a3
<4 x i32> <i32 2, i32 6, i32 3, i32 7>
%t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; r0 r1 g0 g1
<4 x i32> <i32 0, i32 4, i32 1, i32 5>
%t3 = shufflevector <4 x float> %v0, <4 x float> %v1, ; b0 b1 a0 a1
<4 x i32> <i32 2, i32 6, i32 3, i32 7>
%r0 = shufflevector <4 x float> %t2, <4 x float> %t0, ; r0 r1 r2 r3
<4 x i32> <i32 0, i32 1, i32 4, i32 5>
store <4 x float> %r0, <4 x float> * %out0
%r1 = shufflevector <4 x float> %t2, <4 x float> %t0, ; g0 g1 g2 g3
<4 x i32> <i32 2, i32 3, i32 6, i32 7>
store <4 x float> %r1, <4 x float> * %out1
%r2 = shufflevector <4 x float> %t3, <4 x float> %t1, ; b0 b1 b2 b3
<4 x i32> <i32 0, i32 1, i32 4, i32 5>
store <4 x float> %r2, <4 x float> * %out2
%r3 = shufflevector <4 x float> %t3, <4 x float> %t1, ; a0 a1 a2 a3
<4 x i32> <i32 2, i32 3, i32 6, i32 7>
store <4 x float> %r3, <4 x float> * %out3
ret void
}
;; Do the reverse of __aos_to_soa4_float4--reorder <r0 r1 r2 r3> <g0 g1 g2 g3> ..
;; to <r0 g0 b0 a0> <r1 g1 b1 a1> ...
;; This is the exact same set of operations that __soa_to_soa4_float4 does
;; (a 4x4 transpose), so just call that...
define internal void
@__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
<4 x float> %v3, <4 x float> * noalias %out0,
<4 x float> * noalias %out1, <4 x float> * noalias %out2,
<4 x float> * noalias %out3) nounwind alwaysinline {
call void @__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1,
<4 x float> %v2, <4 x float> %v3, <4 x float> * %out0,
<4 x float> * %out1, <4 x float> * %out2, <4 x float> * %out3)
ret void
}
;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors
;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>, transpose to
;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3>.
define internal void
@__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
<4 x float> * noalias %out0, <4 x float> * noalias %out1,
<4 x float> * noalias %out2) nounwind alwaysinline {
%t0 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x0 x1 y0 y1
<4 x i32> <i32 0, i32 3, i32 1, i32 4>
%t1 = shufflevector <4 x float> %v1, <4 x float> %v2, ; x2 x3 y2 y3
<4 x i32> <i32 2, i32 5, i32 3, i32 6>
%r0 = shufflevector <4 x float> %t0, <4 x float> %t1, ; x0 x1 x1 x3
<4 x i32> <i32 0, i32 1, i32 4, i32 5>
store <4 x float> %r0, <4 x float> * %out0
%r1 = shufflevector <4 x float> %t0, <4 x float> %t1, ; y0 y1 y2 y3
<4 x i32> <i32 2, i32 3, i32 6, i32 7>
store <4 x float> %r1, <4 x float> * %out1
%t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; z0 z1 x x
<4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
%r2 = shufflevector <4 x float> %t2, <4 x float> %v2, ; z0 z1 z2 z3
<4 x i32> <i32 0, i32 1, i32 4, i32 7>
store <4 x float> %r2, <4 x float> * %out2
ret void
}
;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors
;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3> to
;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>.
define internal void
@__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
<4 x float> * noalias %out0, <4 x float> * noalias %out1,
<4 x float> * noalias %out2) nounwind alwaysinline {
%t0 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x0 x1 x2 y0
<4 x i32> <i32 0, i32 1, i32 2, i32 4>
%t1 = shufflevector <4 x float> %v1, <4 x float> %v2, ; y1 y2 z0 z1
<4 x i32> <i32 1, i32 2, i32 4, i32 5>
%r0 = shufflevector <4 x float> %t0, <4 x float> %t1, ; x0 y0 z0 x1
<4 x i32> <i32 0, i32 3, i32 6, i32 1>
store <4 x float> %r0, <4 x float> * %out0
%r1 = shufflevector <4 x float> %t0, <4 x float> %t1, ; y1 z1 x2 y2
<4 x i32> <i32 4, i32 7, i32 2, i32 5>
store <4 x float> %r1, <4 x float> * %out1
%t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x3 y3 x x
<4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
%r2 = shufflevector <4 x float> %t2, <4 x float> %v2, ; z2 x3 y3 z3
<4 x i32> <i32 6, i32 0, i32 1, i32 7>
store <4 x float> %r2, <4 x float> * %out2
ret void
}
;; 8-wide
;; These functions implement the 8-wide variants of the AOS/SOA conversion
;; routines above. These implementations are all built on top of the 4-wide
;; vector versions.
define internal void
@__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
<8 x float> %v3, <8 x float> * noalias %out0,
<8 x float> * noalias %out1, <8 x float> * noalias %out2,
<8 x float> * noalias %out3) nounwind alwaysinline {
;; Split each 8-vector into 2 4-vectors
%v0a = shufflevector <8 x float> %v0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0b = shufflevector <8 x float> %v0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1a = shufflevector <8 x float> %v1, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1b = shufflevector <8 x float> %v1, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2a = shufflevector <8 x float> %v2, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2b = shufflevector <8 x float> %v2, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v3a = shufflevector <8 x float> %v3, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v3b = shufflevector <8 x float> %v3, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
;; Similarly for the output pointers
%out0a = bitcast <8 x float> * %out0 to <4 x float> *
%out0b = getelementptr <4 x float> * %out0a, i32 1
%out1a = bitcast <8 x float> * %out1 to <4 x float> *
%out1b = getelementptr <4 x float> * %out1a, i32 1
%out2a = bitcast <8 x float> * %out2 to <4 x float> *
%out2b = getelementptr <4 x float> * %out2a, i32 1
%out3a = bitcast <8 x float> * %out3 to <4 x float> *
%out3b = getelementptr <4 x float> * %out3a, i32 1
;; Do the first part--given input vectors like
;; <x0 y0 z0 x1 y1 z1 x2 y2> <z2 x3 y3 z3 x4 y4 z4 x5> <y5 z5 x6 y6 z6 x7 y7 z7>,
;; pass 3 4-vectors <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 z3 y3 z3> to the 4-vec
;; version to compute the first 4 SOA values for the three output variables.
call void @__aos_to_soa4_float4(<4 x float> %v0a, <4 x float> %v0b,
<4 x float> %v1a, <4 x float> %v1b, <4 x float> * %out0a,
<4 x float> * %out1a, <4 x float> * %out2a, <4 x float> * %out3a)
;; And similarly pass <x4 y4 z4 x5> <y5 z5 x6 y6> <z6 x7 y7 z7> to the 4-wide
;; version to compute the second 4 SOA values for the three outputs
call void @__aos_to_soa4_float4(<4 x float> %v2a, <4 x float> %v2b,
<4 x float> %v3a, <4 x float> %v3b, <4 x float> * %out0b,
<4 x float> * %out1b, <4 x float> * %out2b, <4 x float> * %out3b)
ret void
}
define internal void
@__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
<8 x float> %v3, <8 x float> * noalias %out0,
<8 x float> * noalias %out1, <8 x float> * noalias %out2,
<8 x float> * noalias %out3) nounwind alwaysinline {
;; As above, split into 4-vectors and 4-wide outputs...
%v0a = shufflevector <8 x float> %v0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0b = shufflevector <8 x float> %v0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1a = shufflevector <8 x float> %v1, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1b = shufflevector <8 x float> %v1, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2a = shufflevector <8 x float> %v2, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2b = shufflevector <8 x float> %v2, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v3a = shufflevector <8 x float> %v3, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v3b = shufflevector <8 x float> %v3, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%out0a = bitcast <8 x float> * %out0 to <4 x float> *
%out0b = getelementptr <4 x float> * %out0a, i32 1
%out1a = bitcast <8 x float> * %out1 to <4 x float> *
%out1b = getelementptr <4 x float> * %out1a, i32 1
%out2a = bitcast <8 x float> * %out2 to <4 x float> *
%out2b = getelementptr <4 x float> * %out2a, i32 1
%out3a = bitcast <8 x float> * %out3 to <4 x float> *
%out3b = getelementptr <4 x float> * %out3a, i32 1
;; First part--given input vectors
;; <x0 x1 x2 x3 x4 x5 x6 x7> <y0 y1 y2 y3 y4 y5 y6 y7> <z0 z1 z2 z3 z4 z5 z6 z7>
;; pass 3 4-vectors <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3> to
;; compute the first 12 AOS output values.
call void @__soa_to_aos4_float4(<4 x float> %v0a, <4 x float> %v1a,
<4 x float> %v2a, <4 x float> %v3a, <4 x float> * %out0a,
<4 x float> * %out0b, <4 x float> * %out1a, <4 x float> * %out1b)
;; And then pass the 3 4-vectors <x4 x5 x6 x7> <y4 y5 y6 y7> <z4 z5 z6 z7>
;; To compute the next 12 AOS output values
call void @__soa_to_aos4_float4(<4 x float> %v0b, <4 x float> %v1b,
<4 x float> %v2b, <4 x float> %v3b, <4 x float> * %out2a,
<4 x float> * %out2b, <4 x float> * %out3a, <4 x float> * %out3b)
ret void
}
define internal void
@__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
<8 x float> * noalias %out0, <8 x float> * noalias %out1,
<8 x float> * noalias %out2) nounwind alwaysinline {
%v0a = shufflevector <8 x float> %v0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0b = shufflevector <8 x float> %v0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1a = shufflevector <8 x float> %v1, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1b = shufflevector <8 x float> %v1, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2a = shufflevector <8 x float> %v2, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2b = shufflevector <8 x float> %v2, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%out0a = bitcast <8 x float> * %out0 to <4 x float> *
%out0b = getelementptr <4 x float> * %out0a, i32 1
%out1a = bitcast <8 x float> * %out1 to <4 x float> *
%out1b = getelementptr <4 x float> * %out1a, i32 1
%out2a = bitcast <8 x float> * %out2 to <4 x float> *
%out2b = getelementptr <4 x float> * %out2a, i32 1
call void @__aos_to_soa3_float4(<4 x float> %v0a, <4 x float> %v0b,
<4 x float> %v1a, <4 x float> * %out0a, <4 x float> * %out1a,
<4 x float> * %out2a)
call void @__aos_to_soa3_float4(<4 x float> %v1b, <4 x float> %v2a,
<4 x float> %v2b, <4 x float> * %out0b, <4 x float> * %out1b,
<4 x float> * %out2b)
ret void
}
define internal void
@__soa_to_aos3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
<8 x float> * noalias %out0, <8 x float> * noalias %out1,
<8 x float> * noalias %out2) nounwind alwaysinline {
%v0a = shufflevector <8 x float> %v0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0b = shufflevector <8 x float> %v0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1a = shufflevector <8 x float> %v1, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1b = shufflevector <8 x float> %v1, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2a = shufflevector <8 x float> %v2, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2b = shufflevector <8 x float> %v2, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%out0a = bitcast <8 x float> * %out0 to <4 x float> *
%out0b = getelementptr <4 x float> * %out0a, i32 1
%out1a = bitcast <8 x float> * %out1 to <4 x float> *
%out1b = getelementptr <4 x float> * %out1a, i32 1
%out2a = bitcast <8 x float> * %out2 to <4 x float> *
%out2b = getelementptr <4 x float> * %out2a, i32 1
call void @__soa_to_aos3_float4(<4 x float> %v0a, <4 x float> %v1a,
<4 x float> %v2a, <4 x float> * %out0a, <4 x float> * %out0b,
<4 x float> * %out1a)
call void @__soa_to_aos3_float4(<4 x float> %v0b, <4 x float> %v1b,
<4 x float> %v2b, <4 x float> * %out1b, <4 x float> * %out2a,
<4 x float> * %out2b)
ret void
}
;; 16-wide
define internal void
@__aos_to_soa4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
<16 x float> %v3, <16 x float> * noalias %out0,
<16 x float> * noalias %out1, <16 x float> * noalias %out2,
<16 x float> * noalias %out3) nounwind alwaysinline {
%v0a = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0b = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v0c = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v0d = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v1a = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1b = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1c = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v1d = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v2a = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2b = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2c = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v2d = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v3a = shufflevector <16 x float> %v3, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v3b = shufflevector <16 x float> %v3, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v3c = shufflevector <16 x float> %v3, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v3d = shufflevector <16 x float> %v3, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%out0a = bitcast <16 x float> * %out0 to <4 x float> *
%out0b = getelementptr <4 x float> * %out0a, i32 1
%out0c = getelementptr <4 x float> * %out0a, i32 2
%out0d = getelementptr <4 x float> * %out0a, i32 3
%out1a = bitcast <16 x float> * %out1 to <4 x float> *
%out1b = getelementptr <4 x float> * %out1a, i32 1
%out1c = getelementptr <4 x float> * %out1a, i32 2
%out1d = getelementptr <4 x float> * %out1a, i32 3
%out2a = bitcast <16 x float> * %out2 to <4 x float> *
%out2b = getelementptr <4 x float> * %out2a, i32 1
%out2c = getelementptr <4 x float> * %out2a, i32 2
%out2d = getelementptr <4 x float> * %out2a, i32 3
%out3a = bitcast <16 x float> * %out3 to <4 x float> *
%out3b = getelementptr <4 x float> * %out3a, i32 1
%out3c = getelementptr <4 x float> * %out3a, i32 2
%out3d = getelementptr <4 x float> * %out3a, i32 3
call void @__aos_to_soa4_float4(<4 x float> %v0a, <4 x float> %v0b,
<4 x float> %v0c, <4 x float> %v0d, <4 x float> * %out0a,
<4 x float> * %out1a, <4 x float> * %out2a, <4 x float> * %out3a)
call void @__aos_to_soa4_float4(<4 x float> %v1a, <4 x float> %v1b,
<4 x float> %v1c, <4 x float> %v1d, <4 x float> * %out0b,
<4 x float> * %out1b, <4 x float> * %out2b, <4 x float> * %out3b)
call void @__aos_to_soa4_float4(<4 x float> %v2a, <4 x float> %v2b,
<4 x float> %v2c, <4 x float> %v2d, <4 x float> * %out0c,
<4 x float> * %out1c, <4 x float> * %out2c, <4 x float> * %out3c)
call void @__aos_to_soa4_float4(<4 x float> %v3a, <4 x float> %v3b,
<4 x float> %v3c, <4 x float> %v3d, <4 x float> * %out0d,
<4 x float> * %out1d, <4 x float> * %out2d, <4 x float> * %out3d)
ret void
}
define internal void
@__soa_to_aos4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
<16 x float> %v3, <16 x float> * noalias %out0,
<16 x float> * noalias %out1, <16 x float> * noalias %out2,
<16 x float> * noalias %out3) nounwind alwaysinline {
%v0a = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0b = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v0c = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v0d = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v1a = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1b = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1c = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v1d = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v2a = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2b = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2c = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v2d = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v3a = shufflevector <16 x float> %v3, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v3b = shufflevector <16 x float> %v3, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v3c = shufflevector <16 x float> %v3, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v3d = shufflevector <16 x float> %v3, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%out0a = bitcast <16 x float> * %out0 to <4 x float> *
%out0b = getelementptr <4 x float> * %out0a, i32 1
%out0c = getelementptr <4 x float> * %out0a, i32 2
%out0d = getelementptr <4 x float> * %out0a, i32 3
%out1a = bitcast <16 x float> * %out1 to <4 x float> *
%out1b = getelementptr <4 x float> * %out1a, i32 1
%out1c = getelementptr <4 x float> * %out1a, i32 2
%out1d = getelementptr <4 x float> * %out1a, i32 3
%out2a = bitcast <16 x float> * %out2 to <4 x float> *
%out2b = getelementptr <4 x float> * %out2a, i32 1
%out2c = getelementptr <4 x float> * %out2a, i32 2
%out2d = getelementptr <4 x float> * %out2a, i32 3
%out3a = bitcast <16 x float> * %out3 to <4 x float> *
%out3b = getelementptr <4 x float> * %out3a, i32 1
%out3c = getelementptr <4 x float> * %out3a, i32 2
%out3d = getelementptr <4 x float> * %out3a, i32 3
call void @__soa_to_aos4_float4(<4 x float> %v0a, <4 x float> %v1a,
<4 x float> %v2a, <4 x float> %v3a, <4 x float> * %out0a,
<4 x float> * %out0b, <4 x float> * %out0c, <4 x float> * %out0d)
call void @__soa_to_aos4_float4(<4 x float> %v0b, <4 x float> %v1b,
<4 x float> %v2b, <4 x float> %v3b, <4 x float> * %out1a,
<4 x float> * %out1b, <4 x float> * %out1c, <4 x float> * %out1d)
call void @__soa_to_aos4_float4(<4 x float> %v0c, <4 x float> %v1c,
<4 x float> %v2c, <4 x float> %v3c, <4 x float> * %out2a,
<4 x float> * %out2b, <4 x float> * %out2c, <4 x float> * %out2d)
call void @__soa_to_aos4_float4(<4 x float> %v0d, <4 x float> %v1d,
<4 x float> %v2d, <4 x float> %v3d, <4 x float> * %out3a,
<4 x float> * %out3b, <4 x float> * %out3c, <4 x float> * %out3d)
ret void
}
define internal void
@__aos_to_soa3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
<16 x float> * noalias %out0, <16 x float> * noalias %out1,
<16 x float> * noalias %out2) nounwind alwaysinline {
%v0a = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0b = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v0c = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v0d = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v1a = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1b = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1c = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v1d = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v2a = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2b = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2c = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v2d = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%out0a = bitcast <16 x float> * %out0 to <4 x float> *
%out0b = getelementptr <4 x float> * %out0a, i32 1
%out0c = getelementptr <4 x float> * %out0a, i32 2
%out0d = getelementptr <4 x float> * %out0a, i32 3
%out1a = bitcast <16 x float> * %out1 to <4 x float> *
%out1b = getelementptr <4 x float> * %out1a, i32 1
%out1c = getelementptr <4 x float> * %out1a, i32 2
%out1d = getelementptr <4 x float> * %out1a, i32 3
%out2a = bitcast <16 x float> * %out2 to <4 x float> *
%out2b = getelementptr <4 x float> * %out2a, i32 1
%out2c = getelementptr <4 x float> * %out2a, i32 2
%out2d = getelementptr <4 x float> * %out2a, i32 3
call void @__aos_to_soa3_float4(<4 x float> %v0a, <4 x float> %v0b,
<4 x float> %v0c, <4 x float> * %out0a, <4 x float> * %out1a,
<4 x float> * %out2a)
call void @__aos_to_soa3_float4(<4 x float> %v1a, <4 x float> %v1b,
<4 x float> %v1c, <4 x float> * %out0b, <4 x float> * %out1b,
<4 x float> * %out2b)
call void @__aos_to_soa3_float4(<4 x float> %v2a, <4 x float> %v2b,
<4 x float> %v2c, <4 x float> * %out0c, <4 x float> * %out1c,
<4 x float> * %out2c)
ret void
}
define internal void
@__soa_to_aos3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
<16 x float> * noalias %out0, <16 x float> * noalias %out1,
<16 x float> * noalias %out2) nounwind alwaysinline {
%v0a = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0b = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v0c = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v0d = shufflevector <16 x float> %v0, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v1a = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1b = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1c = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v1d = shufflevector <16 x float> %v1, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v2a = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2b = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2c = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v2d = shufflevector <16 x float> %v2, <16 x float> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%out0a = bitcast <16 x float> * %out0 to <4 x float> *
%out0b = getelementptr <4 x float> * %out0a, i32 1
%out0c = getelementptr <4 x float> * %out0a, i32 2
%out0d = getelementptr <4 x float> * %out0a, i32 3
%out1a = bitcast <16 x float> * %out1 to <4 x float> *
%out1b = getelementptr <4 x float> * %out1a, i32 1
%out1c = getelementptr <4 x float> * %out1a, i32 2
%out1d = getelementptr <4 x float> * %out1a, i32 3
%out2a = bitcast <16 x float> * %out2 to <4 x float> *
%out2b = getelementptr <4 x float> * %out2a, i32 1
%out2c = getelementptr <4 x float> * %out2a, i32 2
%out2d = getelementptr <4 x float> * %out2a, i32 3
call void @__soa_to_aos3_float4(<4 x float> %v0a, <4 x float> %v1a,
<4 x float> %v2a, <4 x float> * %out0a, <4 x float> * %out0b,
<4 x float> * %out0c)
call void @__soa_to_aos3_float4(<4 x float> %v0b, <4 x float> %v1b,
<4 x float> %v2b, <4 x float> * %out0d, <4 x float> * %out1a,
<4 x float> * %out1b)
call void @__soa_to_aos3_float4(<4 x float> %v0c, <4 x float> %v1c,
<4 x float> %v2c, <4 x float> * %out1c, <4 x float> * %out1d,
<4 x float> * %out2a)
call void @__soa_to_aos3_float4(<4 x float> %v0d, <4 x float> %v1d,
<4 x float> %v2d, <4 x float> * %out2b, <4 x float> * %out2c,
<4 x float> * %out2d)
ret void
}
;; versions to be called from stdlib
define internal void
@__aos_to_soa4_float([0 x float] * noalias %base, i32 %offset,
<$1 x float> * noalias %out0, <$1 x float> * noalias %out1,
<$1 x float> * noalias %out2, <$1 x float> * noalias %out3)
nounwind alwaysinline {
%pf = bitcast [0 x float] * %base to float *
%p = getelementptr float * %pf, i32 %offset
%p0 = bitcast float * %p to <$1 x float> *
%v0 = load <$1 x float> * %p0, align 4
%p1 = getelementptr <$1 x float> * %p0, i32 1
%v1 = load <$1 x float> * %p1, align 4
%p2 = getelementptr <$1 x float> * %p0, i32 2
%v2 = load <$1 x float> * %p2, align 4
%p3 = getelementptr <$1 x float> * %p0, i32 3
%v3 = load <$1 x float> * %p3, align 4
call void @__aos_to_soa4_float$1(<$1 x float> %v0, <$1 x float> %v1,
<$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0,
<$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3)
ret void
}
define internal void
@__aos_to_soa4_int32([0 x i32] * noalias %base, i32 %offset,
<$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
<$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3)
nounwind alwaysinline {
%fbase = bitcast [0 x i32] * %base to [0 x float] *
%fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> *
%fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> *
%fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> *
%fout3 = bitcast <$1 x i32> * %out3 to <$1 x float> *
call void @__aos_to_soa4_float([0 x float] * %fbase, i32 %offset,
<$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2,
<$1 x float> * %fout3)
ret void
}
define internal void
@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
<$1 x float> %v3, [0 x float] * noalias %base,
i32 %offset) nounwind alwaysinline {
%pf = bitcast [0 x float] * %base to float *
%p = getelementptr float * %pf, i32 %offset
%out0 = bitcast float * %p to <$1 x float> *
%out1 = getelementptr <$1 x float> * %out0, i32 1
%out2 = getelementptr <$1 x float> * %out0, i32 2
%out3 = getelementptr <$1 x float> * %out0, i32 3
call void @__soa_to_aos4_float$1(<$1 x float> %v0, <$1 x float> %v1,
<$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0,
<$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3)
ret void
}
define internal void
@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
<$1 x i32> %v3, [0 x i32] * noalias %base,
i32 %offset) nounwind alwaysinline {
%fv0 = bitcast <$1 x i32> %v0 to <$1 x float>
%fv1 = bitcast <$1 x i32> %v1 to <$1 x float>
%fv2 = bitcast <$1 x i32> %v2 to <$1 x float>
%fv3 = bitcast <$1 x i32> %v3 to <$1 x float>
%fbase = bitcast [0 x i32] * %base to [0 x float] *
call void @__soa_to_aos4_float(<$1 x float> %fv0, <$1 x float> %fv1,
<$1 x float> %fv2, <$1 x float> %fv3, [0 x float] * %fbase,
i32 %offset)
ret void
}
define internal void
@__aos_to_soa3_float([0 x float] * noalias %base, i32 %offset,
<$1 x float> * %out0, <$1 x float> * %out1,
<$1 x float> * %out2) nounwind alwaysinline {
%pf = bitcast [0 x float] * %base to float *
%p = getelementptr float * %pf, i32 %offset
%p0 = bitcast float * %p to <$1 x float> *
%v0 = load <$1 x float> * %p0, align 4
%p1 = getelementptr <$1 x float> * %p0, i32 1
%v1 = load <$1 x float> * %p1, align 4
%p2 = getelementptr <$1 x float> * %p0, i32 2
%v2 = load <$1 x float> * %p2, align 4
call void @__aos_to_soa3_float$1(<$1 x float> %v0, <$1 x float> %v1,
<$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1,
<$1 x float> * %out2)
ret void
}
define internal void
@__aos_to_soa3_int32([0 x i32] * noalias %base, i32 %offset,
<$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
<$1 x i32> * noalias %out2) nounwind alwaysinline {
%fbase = bitcast [0 x i32] * %base to [0 x float] *
%fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> *
%fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> *
%fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> *
call void @__aos_to_soa3_float([0 x float] * %fbase, i32 %offset,
<$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2)
ret void
}
define internal void
@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
[0 x float] * noalias %base, i32 %offset) nounwind alwaysinline {
%pf = bitcast [0 x float] * %base to float *
%p = getelementptr float * %pf, i32 %offset
%out0 = bitcast float * %p to <$1 x float> *
%out1 = getelementptr <$1 x float> * %out0, i32 1
%out2 = getelementptr <$1 x float> * %out0, i32 2
call void @__soa_to_aos3_float$1(<$1 x float> %v0, <$1 x float> %v1,
<$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1,
<$1 x float> * %out2)
ret void
}
define internal void
@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
[0 x i32] * noalias %base, i32 %offset) nounwind alwaysinline {
%fv0 = bitcast <$1 x i32> %v0 to <$1 x float>
%fv1 = bitcast <$1 x i32> %v1 to <$1 x float>
%fv2 = bitcast <$1 x i32> %v2 to <$1 x float>
%fbase = bitcast [0 x i32] * %base to [0 x float] *
call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1,
<$1 x float> %fv2, [0 x float] * %fbase, i32 %offset)
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetching