Add routines to standard library to do efficient AOS/SOA conversions.
Currently, we just support 3 and 4-wide variants (i.e. xyzxyz.. and xyzwxyzw..), for int32 and float types.
This commit is contained in:
675
builtins.m4
675
builtins.m4
@@ -1052,6 +1052,681 @@ define internal <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone al
|
||||
ret <$1 x i32> %0
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; AOS/SOA conversion primitives
|
||||
|
||||
;; take 4 4-wide vectors laid out like <r0 g0 b0 a0> <r1 g1 b1 a1> ...
|
||||
;; and reorder them to <r0 r1 r2 r3> <g0 g1 g2 g3> ...
|
||||
|
||||
define internal void
@__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
        <4 x float> %v3, <4 x float> * noalias %out0,
        <4 x float> * noalias %out1, <4 x float> * noalias %out2,
        <4 x float> * noalias %out3) nounwind alwaysinline {
  ;; 4x4 transpose.  Each of %v0..%v3 holds one <r g b a> element; on
  ;; return %out0 holds the four r values, %out1 the g values, %out2 the
  ;; b values and %out3 the a values.

  ;; Stage 1: interleave matching lanes of each pair of inputs.
  %t0 = shufflevector <4 x float> %v2, <4 x float> %v3,  ; r2 r3 g2 g3
          <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %t1 = shufflevector <4 x float> %v2, <4 x float> %v3,  ; b2 b3 a2 a3
          <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %t2 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; r0 r1 g0 g1
          <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %t3 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; b0 b1 a0 a1
          <4 x i32> <i32 2, i32 6, i32 3, i32 7>

  ;; Stage 2: combine the 2-element runs into the final rows.
  ;;
  ;; The stores are marked align 4 because this routine is also reached
  ;; from __soa_to_aos4_float, which passes destinations computed as
  ;; base + offset floats.  Those are only known to be float-aligned,
  ;; whereas a store with no align is assumed to have the full ABI
  ;; alignment of the vector type.
  %r0 = shufflevector <4 x float> %t2, <4 x float> %t0,  ; r0 r1 r2 r3
          <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  store <4 x float> %r0, <4 x float> * %out0, align 4
  %r1 = shufflevector <4 x float> %t2, <4 x float> %t0,  ; g0 g1 g2 g3
          <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  store <4 x float> %r1, <4 x float> * %out1, align 4
  %r2 = shufflevector <4 x float> %t3, <4 x float> %t1,  ; b0 b1 b2 b3
          <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  store <4 x float> %r2, <4 x float> * %out2, align 4
  %r3 = shufflevector <4 x float> %t3, <4 x float> %t1,  ; a0 a1 a2 a3
          <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  store <4 x float> %r3, <4 x float> * %out3, align 4
  ret void
}
|
||||
|
||||
|
||||
;; Do the reverse of __aos_to_soa4_float4--reorder <r0 r1 r2 r3> <g0 g1 g2 g3> ..
|
||||
;; to <r0 g0 b0 a0> <r1 g1 b1 a1> ...
|
||||
;; This is the exact same set of operations that __aos_to_soa4_float4 does
|
||||
;; (a 4x4 transpose), so just call that...
|
||||
|
||||
define internal void
@__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
        <4 x float> %v3, <4 x float> * noalias %out0,
        <4 x float> * noalias %out1, <4 x float> * noalias %out2,
        <4 x float> * noalias %out3) nounwind alwaysinline {
  ;; SOA -> AOS for four 4-wide vectors is the same 4x4 transpose as the
  ;; AOS -> SOA direction, so forward all arguments unchanged.
  call void @__aos_to_soa4_float4(
         <4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> %v3,
         <4 x float> * %out0, <4 x float> * %out1,
         <4 x float> * %out2, <4 x float> * %out3)
  ret void
}
|
||||
|
||||
|
||||
;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors
|
||||
;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>, transpose to
|
||||
;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3>.
|
||||
|
||||
define internal void
@__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
        <4 x float> * noalias %out0, <4 x float> * noalias %out1,
        <4 x float> * noalias %out2) nounwind alwaysinline {
  ;; Inputs:  %v0 = <x0 y0 z0 x1>, %v1 = <y1 z1 x2 y2>, %v2 = <z2 x3 y3 z3>
  ;; Outputs: %out0 = all x, %out1 = all y, %out2 = all z

  ;; Gather the x/y pairs and the first two z values.
  %xy01 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; x0 x1 y0 y1
          <4 x i32> <i32 0, i32 3, i32 1, i32 4>
  %xy23 = shufflevector <4 x float> %v1, <4 x float> %v2,  ; x2 x3 y2 y3
          <4 x i32> <i32 2, i32 5, i32 3, i32 6>
  %z01 = shufflevector <4 x float> %v0, <4 x float> %v1,   ; z0 z1 -- --
          <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>

  ;; Assemble the three result vectors.
  %xs = shufflevector <4 x float> %xy01, <4 x float> %xy23,  ; x0 x1 x2 x3
          <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %ys = shufflevector <4 x float> %xy01, <4 x float> %xy23,  ; y0 y1 y2 y3
          <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ;; z2 and z3 come straight out of %v2 (lanes 0 and 3).
  %zs = shufflevector <4 x float> %z01, <4 x float> %v2,     ; z0 z1 z2 z3
          <4 x i32> <i32 0, i32 1, i32 4, i32 7>

  store <4 x float> %xs, <4 x float> * %out0
  store <4 x float> %ys, <4 x float> * %out1
  store <4 x float> %zs, <4 x float> * %out2
  ret void
}
|
||||
|
||||
|
||||
;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors
|
||||
;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3> to
|
||||
;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>.
|
||||
|
||||
define internal void
@__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
        <4 x float> * noalias %out0, <4 x float> * noalias %out1,
        <4 x float> * noalias %out2) nounwind alwaysinline {
  ;; Inputs:  %v0 = <x0 x1 x2 x3>, %v1 = <y0 y1 y2 y3>, %v2 = <z0 z1 z2 z3>
  ;; Outputs: %out0 = <x0 y0 z0 x1>, %out1 = <y1 z1 x2 y2>,
  ;;          %out2 = <z2 x3 y3 z3>
  ;;
  ;; The stores are marked align 4 because this routine is reached from
  ;; __soa_to_aos3_float, which passes destinations computed as
  ;; base + offset floats.  Those are only known to be float-aligned,
  ;; whereas a store with no align is assumed to have the full ABI
  ;; alignment of the vector type.
  %t0 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; x0 x1 x2 y0
          <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %t1 = shufflevector <4 x float> %v1, <4 x float> %v2,  ; y1 y2 z0 z1
          <4 x i32> <i32 1, i32 2, i32 4, i32 5>

  %r0 = shufflevector <4 x float> %t0, <4 x float> %t1,  ; x0 y0 z0 x1
          <4 x i32> <i32 0, i32 3, i32 6, i32 1>
  store <4 x float> %r0, <4 x float> * %out0, align 4
  %r1 = shufflevector <4 x float> %t0, <4 x float> %t1,  ; y1 z1 x2 y2
          <4 x i32> <i32 4, i32 7, i32 2, i32 5>
  store <4 x float> %r1, <4 x float> * %out1, align 4

  ;; Only lanes 0 and 1 of %t2 are consumed below; the rest are undef.
  %t2 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; x3 y3 -- --
          <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>

  %r2 = shufflevector <4 x float> %t2, <4 x float> %v2,  ; z2 x3 y3 z3
          <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  store <4 x float> %r2, <4 x float> * %out2, align 4
  ret void
}
|
||||
|
||||
;; 8-wide
|
||||
;; These functions implement the 8-wide variants of the AOS/SOA conversion
|
||||
;; routines above. These implementations are all built on top of the 4-wide
|
||||
;; vector versions.
|
||||
|
||||
define internal void
@__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
        <8 x float> %v3, <8 x float> * noalias %out0,
        <8 x float> * noalias %out1, <8 x float> * noalias %out2,
        <8 x float> * noalias %out3) nounwind alwaysinline {
  ;; Built on the 4-wide transpose.  The four 8-wide inputs hold eight
  ;; 4-component elements e0..e7 back to back, two per input vector.

  ;; Treat each 8-wide output as a low and a high 4-wide half.
  %dst0_lo = bitcast <8 x float> * %out0 to <4 x float> *
  %dst1_lo = bitcast <8 x float> * %out1 to <4 x float> *
  %dst2_lo = bitcast <8 x float> * %out2 to <4 x float> *
  %dst3_lo = bitcast <8 x float> * %out3 to <4 x float> *
  %dst0_hi = getelementptr <4 x float> * %dst0_lo, i32 1
  %dst1_hi = getelementptr <4 x float> * %dst1_lo, i32 1
  %dst2_hi = getelementptr <4 x float> * %dst2_lo, i32 1
  %dst3_hi = getelementptr <4 x float> * %dst3_lo, i32 1

  ;; Elements e0..e3 (all of %v0 and %v1) produce the low half of
  ;; every output.
  %e0 = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e1 = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %e2 = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e3 = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__aos_to_soa4_float4(<4 x float> %e0, <4 x float> %e1,
         <4 x float> %e2, <4 x float> %e3, <4 x float> * %dst0_lo,
         <4 x float> * %dst1_lo, <4 x float> * %dst2_lo,
         <4 x float> * %dst3_lo)

  ;; Elements e4..e7 (all of %v2 and %v3) produce the high half of
  ;; every output.
  %e4 = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e5 = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %e6 = shufflevector <8 x float> %v3, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e7 = shufflevector <8 x float> %v3, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__aos_to_soa4_float4(<4 x float> %e4, <4 x float> %e5,
         <4 x float> %e6, <4 x float> %e7, <4 x float> * %dst0_hi,
         <4 x float> * %dst1_hi, <4 x float> * %dst2_hi,
         <4 x float> * %dst3_hi)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
        <8 x float> %v3, <8 x float> * noalias %out0,
        <8 x float> * noalias %out1, <8 x float> * noalias %out2,
        <8 x float> * noalias %out3) nounwind alwaysinline {
  ;; Built on the 4-wide routine.  %v0..%v3 each carry eight values of one
  ;; component; the four outputs together receive eight interleaved
  ;; 4-component elements, two per output vector.

  ;; Treat each 8-wide output as a low and a high 4-wide half.
  %dst0_lo = bitcast <8 x float> * %out0 to <4 x float> *
  %dst1_lo = bitcast <8 x float> * %out1 to <4 x float> *
  %dst2_lo = bitcast <8 x float> * %out2 to <4 x float> *
  %dst3_lo = bitcast <8 x float> * %out3 to <4 x float> *
  %dst0_hi = getelementptr <4 x float> * %dst0_lo, i32 1
  %dst1_hi = getelementptr <4 x float> * %dst1_lo, i32 1
  %dst2_hi = getelementptr <4 x float> * %dst2_lo, i32 1
  %dst3_hi = getelementptr <4 x float> * %dst3_lo, i32 1

  ;; Lanes 0..3 of every component give the first 16 output floats,
  ;; which fill %out0 and %out1.
  %c0_lo = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c1_lo = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c2_lo = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c3_lo = shufflevector <8 x float> %v3, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__soa_to_aos4_float4(<4 x float> %c0_lo, <4 x float> %c1_lo,
         <4 x float> %c2_lo, <4 x float> %c3_lo, <4 x float> * %dst0_lo,
         <4 x float> * %dst0_hi, <4 x float> * %dst1_lo,
         <4 x float> * %dst1_hi)

  ;; Lanes 4..7 of every component give the next 16 output floats,
  ;; which fill %out2 and %out3.
  %c0_hi = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c1_hi = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c2_hi = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c3_hi = shufflevector <8 x float> %v3, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__soa_to_aos4_float4(<4 x float> %c0_hi, <4 x float> %c1_hi,
         <4 x float> %c2_hi, <4 x float> %c3_hi, <4 x float> * %dst2_lo,
         <4 x float> * %dst2_hi, <4 x float> * %dst3_lo,
         <4 x float> * %dst3_hi)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
        <8 x float> * noalias %out0, <8 x float> * noalias %out1,
        <8 x float> * noalias %out2) nounwind alwaysinline {
  ;; The 24 input floats are handled as six consecutive 4-wide chunks
  ;; k0..k5; each run of three chunks is one 4-wide AOS -> SOA problem.

  ;; Treat each 8-wide output as a low and a high 4-wide half.
  %dst0_lo = bitcast <8 x float> * %out0 to <4 x float> *
  %dst1_lo = bitcast <8 x float> * %out1 to <4 x float> *
  %dst2_lo = bitcast <8 x float> * %out2 to <4 x float> *
  %dst0_hi = getelementptr <4 x float> * %dst0_lo, i32 1
  %dst1_hi = getelementptr <4 x float> * %dst1_lo, i32 1
  %dst2_hi = getelementptr <4 x float> * %dst2_lo, i32 1

  ;; Chunks k0..k2 (%v0 and the low half of %v1) -> low output halves.
  %k0 = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %k1 = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %k2 = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__aos_to_soa3_float4(<4 x float> %k0, <4 x float> %k1,
         <4 x float> %k2, <4 x float> * %dst0_lo, <4 x float> * %dst1_lo,
         <4 x float> * %dst2_lo)

  ;; Chunks k3..k5 (the high half of %v1 and %v2) -> high output halves.
  %k3 = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %k4 = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %k5 = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__aos_to_soa3_float4(<4 x float> %k3, <4 x float> %k4,
         <4 x float> %k5, <4 x float> * %dst0_hi, <4 x float> * %dst1_hi,
         <4 x float> * %dst2_hi)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__soa_to_aos3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
        <8 x float> * noalias %out0, <8 x float> * noalias %out1,
        <8 x float> * noalias %out2) nounwind alwaysinline {
  ;; The 24 output floats are written as six consecutive 4-wide chunks
  ;; d0..d5 (two per output vector); each 4-wide SOA -> AOS call fills
  ;; three of them.
  %d0 = bitcast <8 x float> * %out0 to <4 x float> *
  %d1 = getelementptr <4 x float> * %d0, i32 1
  %d2 = bitcast <8 x float> * %out1 to <4 x float> *
  %d3 = getelementptr <4 x float> * %d2, i32 1
  %d4 = bitcast <8 x float> * %out2 to <4 x float> *
  %d5 = getelementptr <4 x float> * %d4, i32 1

  ;; Lanes 0..3 of each component -> chunks d0..d2.
  %c0_lo = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c1_lo = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c2_lo = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__soa_to_aos3_float4(<4 x float> %c0_lo, <4 x float> %c1_lo,
         <4 x float> %c2_lo, <4 x float> * %d0, <4 x float> * %d1,
         <4 x float> * %d2)

  ;; Lanes 4..7 of each component -> chunks d3..d5.
  %c0_hi = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c1_hi = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c2_hi = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__soa_to_aos3_float4(<4 x float> %c0_hi, <4 x float> %c1_hi,
         <4 x float> %c2_hi, <4 x float> * %d3, <4 x float> * %d4,
         <4 x float> * %d5)
  ret void
}
|
||||
|
||||
;; 16-wide
|
||||
|
||||
define internal void
@__aos_to_soa4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
        <16 x float> %v3, <16 x float> * noalias %out0,
        <16 x float> * noalias %out1, <16 x float> * noalias %out2,
        <16 x float> * noalias %out3) nounwind alwaysinline {
  ;; Built on the 4-wide transpose.  Each 16-wide input holds four
  ;; 4-component elements, giving e0..e15 overall; input number n
  ;; produces quarter n of every output.

  ;; Split each 16-wide output into four 4-wide quarters q0..q3.
  %dst0_q0 = bitcast <16 x float> * %out0 to <4 x float> *
  %dst0_q1 = getelementptr <4 x float> * %dst0_q0, i32 1
  %dst0_q2 = getelementptr <4 x float> * %dst0_q0, i32 2
  %dst0_q3 = getelementptr <4 x float> * %dst0_q0, i32 3
  %dst1_q0 = bitcast <16 x float> * %out1 to <4 x float> *
  %dst1_q1 = getelementptr <4 x float> * %dst1_q0, i32 1
  %dst1_q2 = getelementptr <4 x float> * %dst1_q0, i32 2
  %dst1_q3 = getelementptr <4 x float> * %dst1_q0, i32 3
  %dst2_q0 = bitcast <16 x float> * %out2 to <4 x float> *
  %dst2_q1 = getelementptr <4 x float> * %dst2_q0, i32 1
  %dst2_q2 = getelementptr <4 x float> * %dst2_q0, i32 2
  %dst2_q3 = getelementptr <4 x float> * %dst2_q0, i32 3
  %dst3_q0 = bitcast <16 x float> * %out3 to <4 x float> *
  %dst3_q1 = getelementptr <4 x float> * %dst3_q0, i32 1
  %dst3_q2 = getelementptr <4 x float> * %dst3_q0, i32 2
  %dst3_q3 = getelementptr <4 x float> * %dst3_q0, i32 3

  ;; %v0 = e0..e3 -> quarter 0 of each output
  %e0 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e1 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %e2 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %e3 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__aos_to_soa4_float4(<4 x float> %e0, <4 x float> %e1,
         <4 x float> %e2, <4 x float> %e3, <4 x float> * %dst0_q0,
         <4 x float> * %dst1_q0, <4 x float> * %dst2_q0,
         <4 x float> * %dst3_q0)

  ;; %v1 = e4..e7 -> quarter 1 of each output
  %e4 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e5 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %e6 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %e7 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__aos_to_soa4_float4(<4 x float> %e4, <4 x float> %e5,
         <4 x float> %e6, <4 x float> %e7, <4 x float> * %dst0_q1,
         <4 x float> * %dst1_q1, <4 x float> * %dst2_q1,
         <4 x float> * %dst3_q1)

  ;; %v2 = e8..e11 -> quarter 2 of each output
  %e8 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e9 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %e10 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %e11 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__aos_to_soa4_float4(<4 x float> %e8, <4 x float> %e9,
         <4 x float> %e10, <4 x float> %e11, <4 x float> * %dst0_q2,
         <4 x float> * %dst1_q2, <4 x float> * %dst2_q2,
         <4 x float> * %dst3_q2)

  ;; %v3 = e12..e15 -> quarter 3 of each output
  %e12 = shufflevector <16 x float> %v3, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e13 = shufflevector <16 x float> %v3, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %e14 = shufflevector <16 x float> %v3, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %e15 = shufflevector <16 x float> %v3, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__aos_to_soa4_float4(<4 x float> %e12, <4 x float> %e13,
         <4 x float> %e14, <4 x float> %e15, <4 x float> * %dst0_q3,
         <4 x float> * %dst1_q3, <4 x float> * %dst2_q3,
         <4 x float> * %dst3_q3)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__soa_to_aos4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
        <16 x float> %v3, <16 x float> * noalias %out0,
        <16 x float> * noalias %out1, <16 x float> * noalias %out2,
        <16 x float> * noalias %out3) nounwind alwaysinline {
  ;; Built on the 4-wide routine.  %v0..%v3 each carry sixteen values of
  ;; one component.  Quarter n of all four inputs is transposed into the
  ;; four 4-wide pieces of output number n.

  ;; Quarter 0 of every component -> all of %out0
  %d0 = bitcast <16 x float> * %out0 to <4 x float> *
  %d1 = getelementptr <4 x float> * %d0, i32 1
  %d2 = getelementptr <4 x float> * %d0, i32 2
  %d3 = getelementptr <4 x float> * %d0, i32 3
  %c0_q0 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c1_q0 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c2_q0 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c3_q0 = shufflevector <16 x float> %v3, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__soa_to_aos4_float4(<4 x float> %c0_q0, <4 x float> %c1_q0,
         <4 x float> %c2_q0, <4 x float> %c3_q0, <4 x float> * %d0,
         <4 x float> * %d1, <4 x float> * %d2, <4 x float> * %d3)

  ;; Quarter 1 of every component -> all of %out1
  %d4 = bitcast <16 x float> * %out1 to <4 x float> *
  %d5 = getelementptr <4 x float> * %d4, i32 1
  %d6 = getelementptr <4 x float> * %d4, i32 2
  %d7 = getelementptr <4 x float> * %d4, i32 3
  %c0_q1 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c1_q1 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c2_q1 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c3_q1 = shufflevector <16 x float> %v3, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__soa_to_aos4_float4(<4 x float> %c0_q1, <4 x float> %c1_q1,
         <4 x float> %c2_q1, <4 x float> %c3_q1, <4 x float> * %d4,
         <4 x float> * %d5, <4 x float> * %d6, <4 x float> * %d7)

  ;; Quarter 2 of every component -> all of %out2
  %d8 = bitcast <16 x float> * %out2 to <4 x float> *
  %d9 = getelementptr <4 x float> * %d8, i32 1
  %d10 = getelementptr <4 x float> * %d8, i32 2
  %d11 = getelementptr <4 x float> * %d8, i32 3
  %c0_q2 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %c1_q2 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %c2_q2 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %c3_q2 = shufflevector <16 x float> %v3, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  call void @__soa_to_aos4_float4(<4 x float> %c0_q2, <4 x float> %c1_q2,
         <4 x float> %c2_q2, <4 x float> %c3_q2, <4 x float> * %d8,
         <4 x float> * %d9, <4 x float> * %d10, <4 x float> * %d11)

  ;; Quarter 3 of every component -> all of %out3
  %d12 = bitcast <16 x float> * %out3 to <4 x float> *
  %d13 = getelementptr <4 x float> * %d12, i32 1
  %d14 = getelementptr <4 x float> * %d12, i32 2
  %d15 = getelementptr <4 x float> * %d12, i32 3
  %c0_q3 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %c1_q3 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %c2_q3 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %c3_q3 = shufflevector <16 x float> %v3, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__soa_to_aos4_float4(<4 x float> %c0_q3, <4 x float> %c1_q3,
         <4 x float> %c2_q3, <4 x float> %c3_q3, <4 x float> * %d12,
         <4 x float> * %d13, <4 x float> * %d14, <4 x float> * %d15)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__aos_to_soa3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
        <16 x float> * noalias %out0, <16 x float> * noalias %out1,
        <16 x float> * noalias %out2) nounwind alwaysinline {
  ;; 16-wide AOS -> SOA for 3-component data, built on the 4-wide version.
  ;;
  ;; The 48 input floats x0 y0 z0 x1 y1 z1 ... x15 y15 z15 are split into
  ;; twelve consecutive 4-wide chunks, in memory order:
  ;;   v0a v0b v0c v0d  v1a v1b v1c v1d  v2a v2b v2c v2d
  ;; Every run of three consecutive chunks holds exactly four xyz
  ;; elements, so four calls to the 4-wide routine are needed, and each
  ;; one fills one quarter of every output vector.
  %v0a = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v0b = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v0c = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %v0d = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %v1a = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1b = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1c = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %v1d = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %v2a = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2b = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v2c = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %v2d = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>

  ;; Split each 16-wide output into four 4-wide quarters a..d.
  %out0a = bitcast <16 x float> * %out0 to <4 x float> *
  %out0b = getelementptr <4 x float> * %out0a, i32 1
  %out0c = getelementptr <4 x float> * %out0a, i32 2
  %out0d = getelementptr <4 x float> * %out0a, i32 3
  %out1a = bitcast <16 x float> * %out1 to <4 x float> *
  %out1b = getelementptr <4 x float> * %out1a, i32 1
  %out1c = getelementptr <4 x float> * %out1a, i32 2
  %out1d = getelementptr <4 x float> * %out1a, i32 3
  %out2a = bitcast <16 x float> * %out2 to <4 x float> *
  %out2b = getelementptr <4 x float> * %out2a, i32 1
  %out2c = getelementptr <4 x float> * %out2a, i32 2
  %out2d = getelementptr <4 x float> * %out2a, i32 3

  ;; Elements 0..3: chunks v0a v0b v0c -> quarter a of each output
  call void @__aos_to_soa3_float4(<4 x float> %v0a, <4 x float> %v0b,
         <4 x float> %v0c, <4 x float> * %out0a, <4 x float> * %out1a,
         <4 x float> * %out2a)
  ;; Elements 4..7: chunks v0d v1a v1b -> quarter b of each output
  call void @__aos_to_soa3_float4(<4 x float> %v0d, <4 x float> %v1a,
         <4 x float> %v1b, <4 x float> * %out0b, <4 x float> * %out1b,
         <4 x float> * %out2b)
  ;; Elements 8..11: chunks v1c v1d v2a -> quarter c of each output
  call void @__aos_to_soa3_float4(<4 x float> %v1c, <4 x float> %v1d,
         <4 x float> %v2a, <4 x float> * %out0c, <4 x float> * %out1c,
         <4 x float> * %out2c)
  ;; Elements 12..15: chunks v2b v2c v2d -> quarter d of each output
  call void @__aos_to_soa3_float4(<4 x float> %v2b, <4 x float> %v2c,
         <4 x float> %v2d, <4 x float> * %out0d, <4 x float> * %out1d,
         <4 x float> * %out2d)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__soa_to_aos3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
        <16 x float> * noalias %out0, <16 x float> * noalias %out1,
        <16 x float> * noalias %out2) nounwind alwaysinline {
  ;; Built on the 4-wide routine.  The 48 output floats are written as
  ;; twelve consecutive 4-wide chunks d0..d11 (four per output vector);
  ;; quarter n of the three inputs fills chunks 3n, 3n+1 and 3n+2.
  %d0 = bitcast <16 x float> * %out0 to <4 x float> *
  %d1 = getelementptr <4 x float> * %d0, i32 1
  %d2 = getelementptr <4 x float> * %d0, i32 2
  %d3 = getelementptr <4 x float> * %d0, i32 3
  %d4 = bitcast <16 x float> * %out1 to <4 x float> *
  %d5 = getelementptr <4 x float> * %d4, i32 1
  %d6 = getelementptr <4 x float> * %d4, i32 2
  %d7 = getelementptr <4 x float> * %d4, i32 3
  %d8 = bitcast <16 x float> * %out2 to <4 x float> *
  %d9 = getelementptr <4 x float> * %d8, i32 1
  %d10 = getelementptr <4 x float> * %d8, i32 2
  %d11 = getelementptr <4 x float> * %d8, i32 3

  ;; Lanes 0..3 -> chunks d0..d2
  %c0_q0 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c1_q0 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c2_q0 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__soa_to_aos3_float4(<4 x float> %c0_q0, <4 x float> %c1_q0,
         <4 x float> %c2_q0, <4 x float> * %d0, <4 x float> * %d1,
         <4 x float> * %d2)

  ;; Lanes 4..7 -> chunks d3..d5
  %c0_q1 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c1_q1 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c2_q1 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__soa_to_aos3_float4(<4 x float> %c0_q1, <4 x float> %c1_q1,
         <4 x float> %c2_q1, <4 x float> * %d3, <4 x float> * %d4,
         <4 x float> * %d5)

  ;; Lanes 8..11 -> chunks d6..d8
  %c0_q2 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %c1_q2 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %c2_q2 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  call void @__soa_to_aos3_float4(<4 x float> %c0_q2, <4 x float> %c1_q2,
         <4 x float> %c2_q2, <4 x float> * %d6, <4 x float> * %d7,
         <4 x float> * %d8)

  ;; Lanes 12..15 -> chunks d9..d11
  %c0_q3 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %c1_q3 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %c2_q3 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__soa_to_aos3_float4(<4 x float> %c0_q3, <4 x float> %c1_q3,
         <4 x float> %c2_q3, <4 x float> * %d9, <4 x float> * %d10,
         <4 x float> * %d11)
  ret void
}
|
||||
|
||||
;; versions to be called from stdlib
|
||||
|
||||
define internal void
@__aos_to_soa4_float([0 x float] * noalias %base, i32 %offset,
        <$1 x float> * noalias %out0, <$1 x float> * noalias %out1,
        <$1 x float> * noalias %out2, <$1 x float> * noalias %out3)
        nounwind alwaysinline {
  ;; Entry point for the standard library: read four consecutive vectors
  ;; of AOS data starting %offset floats past %base and hand them to the
  ;; width-specific transpose routine.
  %elts = bitcast [0 x float] * %base to float *
  %first = getelementptr float * %elts, i32 %offset

  %src0 = bitcast float * %first to <$1 x float> *
  %src1 = getelementptr <$1 x float> * %src0, i32 1
  %src2 = getelementptr <$1 x float> * %src0, i32 2
  %src3 = getelementptr <$1 x float> * %src0, i32 3

  ;; The loads only claim float alignment for the source memory.
  %aos0 = load <$1 x float> * %src0, align 4
  %aos1 = load <$1 x float> * %src1, align 4
  %aos2 = load <$1 x float> * %src2, align 4
  %aos3 = load <$1 x float> * %src3, align 4

  call void @__aos_to_soa4_float$1(<$1 x float> %aos0, <$1 x float> %aos1,
         <$1 x float> %aos2, <$1 x float> %aos3, <$1 x float> * %out0,
         <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__aos_to_soa4_int32([0 x i32] * noalias %base, i32 %offset,
        <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
        <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3)
        nounwind alwaysinline {
  ;; int32 variant: the reordering only moves 32-bit lanes around, so
  ;; reinterpret every pointer as its float counterpart and reuse the
  ;; float implementation.
  %out0_f = bitcast <$1 x i32> * %out0 to <$1 x float> *
  %out1_f = bitcast <$1 x i32> * %out1 to <$1 x float> *
  %out2_f = bitcast <$1 x i32> * %out2 to <$1 x float> *
  %out3_f = bitcast <$1 x i32> * %out3 to <$1 x float> *
  %base_f = bitcast [0 x i32] * %base to [0 x float] *
  call void @__aos_to_soa4_float([0 x float] * %base_f, i32 %offset,
         <$1 x float> * %out0_f, <$1 x float> * %out1_f,
         <$1 x float> * %out2_f, <$1 x float> * %out3_f)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
        <$1 x float> %v3, [0 x float] * noalias %base,
        i32 %offset) nounwind alwaysinline {
  ;; Entry point for the standard library: write the four SOA vectors as
  ;; interleaved AOS data into four consecutive vectors of memory starting
  ;; %offset floats past %base.
  %elts = bitcast [0 x float] * %base to float *
  %first = getelementptr float * %elts, i32 %offset

  ;; The destination is base + offset floats, so nothing stronger than
  ;; float alignment is established for these pointers here.
  %dst0 = bitcast float * %first to <$1 x float> *
  %dst1 = getelementptr <$1 x float> * %dst0, i32 1
  %dst2 = getelementptr <$1 x float> * %dst0, i32 2
  %dst3 = getelementptr <$1 x float> * %dst0, i32 3

  call void @__soa_to_aos4_float$1(<$1 x float> %v0, <$1 x float> %v1,
         <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %dst0,
         <$1 x float> * %dst1, <$1 x float> * %dst2, <$1 x float> * %dst3)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
        <$1 x i32> %v3, [0 x i32] * noalias %base,
        i32 %offset) nounwind alwaysinline {
  ;; int32 variant: bit-reinterpret the vectors and the base pointer as
  ;; float and reuse the float implementation; the lanes are only moved,
  ;; never computed on.
  %base_f = bitcast [0 x i32] * %base to [0 x float] *
  %v0_f = bitcast <$1 x i32> %v0 to <$1 x float>
  %v1_f = bitcast <$1 x i32> %v1 to <$1 x float>
  %v2_f = bitcast <$1 x i32> %v2 to <$1 x float>
  %v3_f = bitcast <$1 x i32> %v3 to <$1 x float>
  call void @__soa_to_aos4_float(<$1 x float> %v0_f, <$1 x float> %v1_f,
         <$1 x float> %v2_f, <$1 x float> %v3_f, [0 x float] * %base_f,
         i32 %offset)
  ret void
}
|
||||
|
||||
|
||||
define internal void
@__aos_to_soa3_float([0 x float] * noalias %base, i32 %offset,
        <$1 x float> * %out0, <$1 x float> * %out1,
        <$1 x float> * %out2) nounwind alwaysinline {
  ;; Entry point for the standard library: read three consecutive vectors
  ;; of 3-component AOS data starting %offset floats past %base and hand
  ;; them to the width-specific routine.
  %elts = bitcast [0 x float] * %base to float *
  %first = getelementptr float * %elts, i32 %offset

  %src0 = bitcast float * %first to <$1 x float> *
  %src1 = getelementptr <$1 x float> * %src0, i32 1
  %src2 = getelementptr <$1 x float> * %src0, i32 2

  ;; The loads only claim float alignment for the source memory.
  %aos0 = load <$1 x float> * %src0, align 4
  %aos1 = load <$1 x float> * %src1, align 4
  %aos2 = load <$1 x float> * %src2, align 4

  call void @__aos_to_soa3_float$1(<$1 x float> %aos0, <$1 x float> %aos1,
         <$1 x float> %aos2, <$1 x float> * %out0, <$1 x float> * %out1,
         <$1 x float> * %out2)
  ret void
}
|
||||
|
||||
|
||||
;; Load path for 3-component AOS data, int32 version.
;; Bitcasts the base pointer and the three result pointers from i32 to
;; float types and forwards to @__aos_to_soa3_float. Only pointer types
;; change; the loaded bits are not converted.
define internal void
|
||||
@__aos_to_soa3_int32([0 x i32] * noalias %base, i32 %offset,
|
||||
<$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
|
||||
<$1 x i32> * noalias %out2) nounwind alwaysinline {
|
||||
%fbase = bitcast [0 x i32] * %base to [0 x float] *
|
||||
%fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> *
|
||||
%fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> *
|
||||
%fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> *
|
||||
call void @__aos_to_soa3_float([0 x float] * %fbase, i32 %offset,
|
||||
<$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;; Store path for 3-component AOS data, float version.
;; Computes the address of float element %offset inside %base, views it as
;; the first of three consecutive <$1 x float> vectors, and passes the three
;; input vectors plus those destination pointers to the width-specific
;; helper @__soa_to_aos3_float$1.
;; The helper is defined outside this section; presumably it performs the
;; interleave and the stores.
define internal void
|
||||
@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
|
||||
[0 x float] * noalias %base, i32 %offset) nounwind alwaysinline {
|
||||
%pf = bitcast [0 x float] * %base to float *
|
||||
%p = getelementptr float * %pf, i32 %offset
|
||||
%out0 = bitcast float * %p to <$1 x float> *
|
||||
;; the steps below are counted in whole <$1 x float> vectors, not in floats
%out1 = getelementptr <$1 x float> * %out0, i32 1
|
||||
%out2 = getelementptr <$1 x float> * %out0, i32 2
|
||||
call void @__soa_to_aos3_float$1(<$1 x float> %v0, <$1 x float> %v1,
|
||||
<$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1,
|
||||
<$1 x float> * %out2)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;; Store path for 3-component AOS data, int32 version.
;; Bitcasts the three i32 vectors and the base pointer to their float
;; counterparts and forwards everything to @__soa_to_aos3_float.
;; bitcast only reinterprets the bits; no int to float value conversion
;; takes place.
define internal void
|
||||
@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
|
||||
[0 x i32] * noalias %base, i32 %offset) nounwind alwaysinline {
|
||||
%fv0 = bitcast <$1 x i32> %v0 to <$1 x float>
|
||||
%fv1 = bitcast <$1 x i32> %v1 to <$1 x float>
|
||||
%fv2 = bitcast <$1 x i32> %v2 to <$1 x float>
|
||||
%fbase = bitcast [0 x i32] * %base to [0 x float] *
|
||||
call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1,
|
||||
<$1 x float> %fv2, [0 x float] * %fbase, i32 %offset)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; prefetching
|
||||
|
||||
|
||||
101
docs/ispc.txt
101
docs/ispc.txt
@@ -89,6 +89,7 @@ Contents:
|
||||
+ `Math Functions`_
|
||||
+ `Output Functions`_
|
||||
+ `Cross-Program Instance Operations`_
|
||||
+ `Converting Between Array-of-Structures and Structure-of-Arrays Layout`_
|
||||
+ `Packed Load and Store Operations`_
|
||||
+ `Conversions To and From Half-Precision Floats`_
|
||||
+ `Atomic Operations and Memory Fences`_
|
||||
@@ -2022,6 +2023,97 @@ bitwise-or are available:
|
||||
unsigned int64 exclusive_scan_or(unsigned int64 v)
|
||||
|
||||
|
||||
Converting Between Array-of-Structures and Structure-of-Arrays Layout
|
||||
---------------------------------------------------------------------
|
||||
|
||||
Applications often lay data out in memory in "array of structures" form.
|
||||
Though convenient in C/C++ code, this layout can make ``ispc`` programs
|
||||
less efficient than they would be if the data were laid out in "structure of
|
||||
arrays" form. (See the section `Understanding How to Interoperate With the
|
||||
Application's Data`_ for extended discussion of this topic.)
|
||||
|
||||
The standard library does provide a few functions that efficiently convert
|
||||
between these two formats, for cases where it's not possible to change the
|
||||
application to use "structure of arrays" layout. Consider an array of 3D
|
||||
(x,y,z) position data laid out in a C array like:
|
||||
|
||||
::
|
||||
|
||||
// C++ code
|
||||
float pos[] = { x0, y0, z0, x1, y1, z1, x2, ... };
|
||||
|
||||
|
||||
In an ``ispc`` program, we might want to load a set of (x,y,z) values and
|
||||
do a computation based on them. The natural expression of this:
|
||||
|
||||
::
|
||||
|
||||
extern uniform float pos[];
|
||||
uniform int base = ...;
|
||||
float x = pos[base + 3 * programIndex]; // x = { x0 x1 x2 ... }
|
||||
float y = pos[base + 1 + 3 * programIndex]; // y = { y0 y1 y2 ... }
|
||||
float z = pos[base + 2 + 3 * programIndex]; // z = { z0 z1 z2 ... }
|
||||
|
||||
leads to irregular memory accesses and reduced performance. Alternatively,
|
||||
the ``aos_to_soa3`` standard library function could be used:
|
||||
|
||||
::
|
||||
|
||||
extern uniform float pos[];
|
||||
uniform int base = ...;
|
||||
float x, y, z;
|
||||
aos_to_soa3(pos, base, x, y, z);
|
||||
|
||||
This routine loads ``3*programCount`` values from the given array starting
|
||||
at the given offset, returning three ``varying`` results. There are both
|
||||
``int32`` and ``float`` variants of this function:
|
||||
|
||||
::
|
||||
|
||||
void aos_to_soa3(uniform float a[], uniform int offset, reference float v0,
|
||||
reference float v1, reference float v2)
|
||||
void aos_to_soa3(uniform int32 a[], uniform int offset, reference int32 v0,
|
||||
reference int32 v1, reference int32 v2)
|
||||
|
||||
After computation is done, corresponding functions convert back from the
|
||||
SoA values in ``ispc`` ``varying`` variables and write the values back to
|
||||
the given array, starting at the given offset.
|
||||
|
||||
::
|
||||
|
||||
extern uniform float pos[];
|
||||
uniform int base = ...;
|
||||
float x, y, z;
|
||||
aos_to_soa3(pos, base, x, y, z);
|
||||
// do computation with x, y, z
|
||||
soa_to_aos3(x, y, z, pos, base);
|
||||
|
||||
::
|
||||
|
||||
void soa_to_aos3(float v0, float v1, float v2, uniform float a[],
|
||||
uniform int offset)
|
||||
void soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[],
|
||||
uniform int offset)
|
||||
|
||||
There are also variants of these functions that convert 4-wide values
|
||||
between AoS and SoA layouts. In other words, ``aos_to_soa4`` converts AoS
|
||||
data in memory laid out like ``r0 g0 b0 a0 r1 g1 b1 a1 ...`` to four ``varying``
|
||||
variables with values ``r0 r1...``, ``g0 g1...``, ``b0 b1...``, and ``a0
|
||||
a1...``, reading a total of ``4*programCount`` values from the given array,
|
||||
starting at the given offset.
|
||||
|
||||
::
|
||||
|
||||
void aos_to_soa4(uniform float a[], uniform int offset, reference float v0,
|
||||
reference float v1, reference float v2, reference float v3)
|
||||
void aos_to_soa4(uniform int32 a[], uniform int offset, reference int32 v0,
|
||||
reference int32 v1, reference int32 v2, reference int32 v3)
|
||||
void soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[],
|
||||
uniform int offset)
|
||||
void soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[],
|
||||
uniform int offset)
|
||||
|
||||
|
||||
Packed Load and Store Operations
|
||||
--------------------------------
|
||||
|
||||
@@ -2653,8 +2745,13 @@ values are loaded into the local ``x``, ``y``, and ``z`` variables,
|
||||
SIMD-efficient computation can proceed; getting to that point is
|
||||
relatively inefficient.
|
||||
|
||||
An alternative would be the "structure of arrays" (SoA) layout. In C, the
|
||||
data would be declared as:
|
||||
(As described previously in `Converting Between Array-of-Structures and
|
||||
Structure-of-Arrays Layout`_, this computation could be written more
|
||||
efficiently using standard library routines to convert from the AoS layout,
|
||||
if we were given a flat array of ``float`` values.)
|
||||
|
||||
An alternative data layout would be the "structure of arrays" (SoA). In C,
|
||||
the data would be declared as:
|
||||
|
||||
::
|
||||
|
||||
|
||||
27
failing_tests/aossoa-7.ispc
Normal file
27
failing_tests/aossoa-7.ispc
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
// Exported helper that returns programCount.
export uniform int width() { return programCount; }
|
||||
|
||||
// Test body for soa_to_aos4() on int32 data (this file is under
// failing_tests/).  Fills a[] with -1, builds lane vectors x, y, z, w whose
// values are 4*programIndex + {0,1,2,3}, writes them to a[] at offset 0 with
// soa_to_aos4(), then counts the elements where a[i] != i.  Every program
// instance stores that single uniform mismatch count into RET.
export void f_v(uniform float RET[]) {
|
||||
// From here on the identifier width is the macro value 4, not the exported
// width() function defined earlier in this file.
#define width 4
|
||||
//CO const uniform int width = 4;
|
||||
// r is declared but never used in this test.
uniform int a[width*programCount], r[width*programCount];
|
||||
for (uniform int i = 0; i < width*programCount; ++i)
|
||||
a[i] = -1;
|
||||
|
||||
int x = width * programIndex;
|
||||
int y = 1 + width * programIndex;
|
||||
int z = 2 + width * programIndex;
|
||||
int w = 3 + width * programIndex;
|
||||
|
||||
soa_to_aos4(x, y, z, w, a, 0);
|
||||
uniform int errs = 0;
|
||||
for (uniform int i = 0; i < width * programCount; ++i)
|
||||
if (a[i] != i) ++errs;
|
||||
|
||||
RET[programIndex] = errs;
|
||||
}
|
||||
|
||||
// Writes 0 for every program instance; presumably the reference output the
// test harness compares against f_v (harness not shown here).
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
|
||||
26
failing_tests/aossoa-8.ispc
Normal file
26
failing_tests/aossoa-8.ispc
Normal file
@@ -0,0 +1,26 @@
|
||||
|
||||
// Exported helper that returns programCount.
export uniform int width() { return programCount; }
|
||||
|
||||
// Test body for soa_to_aos3() on int32 data (this file is under
// failing_tests/).  Fills a[] with -1, builds lane vectors x, y, z whose
// values are 3*programIndex + {0,1,2}, writes them to a[] at offset 0 with
// soa_to_aos3(), then counts the elements where a[i] != i.  Every program
// instance stores that single uniform mismatch count into RET.
export void f_v(uniform float RET[]) {
|
||||
// From here on the identifier width is the macro value 3, not the exported
// width() function defined earlier in this file.
#define width 3
|
||||
//CO const uniform int width = 3;
|
||||
// r is declared but never used in this test.
uniform int a[width*programCount], r[width*programCount];
|
||||
for (uniform int i = 0; i < width*programCount; ++i)
|
||||
a[i] = -1;
|
||||
|
||||
int x = width * programIndex;
|
||||
int y = 1 + width * programIndex;
|
||||
int z = 2 + width * programIndex;
|
||||
|
||||
soa_to_aos3(x, y, z, a, 0);
|
||||
uniform int errs = 0;
|
||||
for (uniform int i = 0; i < width * programCount; ++i)
|
||||
if (a[i] != i) ++errs;
|
||||
|
||||
RET[programIndex] = errs;
|
||||
}
|
||||
|
||||
// Writes 0 for every program instance; presumably the reference output the
// test harness compares against f_v (harness not shown here).
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
|
||||
51
stdlib.ispc
51
stdlib.ispc
@@ -315,6 +315,57 @@ static inline uniform int lanemask() {
|
||||
return __movmsk(__mask);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// AOS/SOA conversion
|
||||
|
||||
// AoS -> SoA, 3 components, float.  Reads 3*programCount values from a[]
// starting at element offset and returns them in the varying outputs v0, v1
// and v2.  Thin wrapper over the __aos_to_soa3_float builtin.
static inline void
|
||||
aos_to_soa3(uniform float a[], uniform int offset, reference float v0,
|
||||
reference float v1, reference float v2) {
|
||||
__aos_to_soa3_float(a, offset, v0, v1, v2);
|
||||
}
|
||||
|
||||
// SoA -> AoS, 3 components, float.  Writes the varying values v0, v1 and v2
// back to a[] starting at element offset.  Thin wrapper over the
// __soa_to_aos3_float builtin.
static inline void
|
||||
soa_to_aos3(float v0, float v1, float v2, uniform float a[],
|
||||
uniform int offset) {
|
||||
__soa_to_aos3_float(v0, v1, v2, a, offset);
|
||||
}
|
||||
|
||||
// AoS -> SoA, 4 components, float.  Reads 4*programCount values from a[]
// starting at element offset and returns them in the varying outputs v0, v1,
// v2 and v3.  Thin wrapper over the __aos_to_soa4_float builtin.
static inline void
|
||||
aos_to_soa4(uniform float a[], uniform int offset, reference float v0,
|
||||
reference float v1, reference float v2, reference float v3) {
|
||||
__aos_to_soa4_float(a, offset, v0, v1, v2, v3);
|
||||
}
|
||||
|
||||
// SoA -> AoS, 4 components, float.  Writes the varying values v0, v1, v2 and
// v3 back to a[] starting at element offset.  Thin wrapper over the
// __soa_to_aos4_float builtin.
static inline void
|
||||
soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[],
|
||||
uniform int offset) {
|
||||
__soa_to_aos4_float(v0, v1, v2, v3, a, offset);
|
||||
}
|
||||
|
||||
// AoS -> SoA, 3 components, int32.  Reads 3*programCount values from a[]
// starting at element offset and returns them in the varying outputs v0, v1
// and v2.  Thin wrapper over the __aos_to_soa3_int32 builtin.
static inline void
|
||||
aos_to_soa3(uniform int32 a[], uniform int offset, reference int32 v0,
|
||||
reference int32 v1, reference int32 v2) {
|
||||
__aos_to_soa3_int32(a, offset, v0, v1, v2);
|
||||
}
|
||||
|
||||
// SoA -> AoS, 3 components, int32.  Writes the varying values v0, v1 and v2
// back to a[] starting at element offset.  Thin wrapper over the
// __soa_to_aos3_int32 builtin.
static inline void
|
||||
soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[],
|
||||
uniform int offset) {
|
||||
__soa_to_aos3_int32(v0, v1, v2, a, offset);
|
||||
}
|
||||
|
||||
// AoS -> SoA, 4 components, int32.  Reads 4*programCount values from a[]
// starting at element offset and returns them in the varying outputs v0, v1,
// v2 and v3.  Thin wrapper over the __aos_to_soa4_int32 builtin.
static inline void
|
||||
aos_to_soa4(uniform int32 a[], uniform int offset, reference int32 v0,
|
||||
reference int32 v1, reference int32 v2, reference int32 v3) {
|
||||
__aos_to_soa4_int32(a, offset, v0, v1, v2, v3);
|
||||
}
|
||||
|
||||
// SoA -> AoS, 4 components, int32.  Writes the varying values v0, v1, v2 and
// v3 back to a[] starting at element offset.  Thin wrapper over the
// __soa_to_aos4_int32 builtin.
static inline void
|
||||
soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[],
|
||||
uniform int offset) {
|
||||
__soa_to_aos4_int32(v0, v1, v2, v3, a, offset);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Prefetching
|
||||
|
||||
|
||||
27
tests/aossoa-1.ispc
Normal file
27
tests/aossoa-1.ispc
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
// Exported helper that returns programCount.
export uniform int width() { return programCount; }
|
||||
|
||||
// Test body for aos_to_soa3() on float data.  Fills a[] with a[i] = i, loads
// x, y, z from offset 0 with aos_to_soa3(), and has each program instance
// check that it received 3*programIndex + {0,1,2}.  Each instance stores its
// own mismatch count into RET.
export void f_v(uniform float RET[]) {
|
||||
// From here on the identifier width is the macro value 3, not the exported
// width() function defined earlier in this file.
#define width 3
|
||||
// The array is sized for 16 program instances regardless of programCount.
#define maxProgramCount 16
|
||||
//CO const uniform int width = 3;
|
||||
//CO const uniform int maxProgramCount = 16;
|
||||
// r is declared but never used in this test.
uniform float a[width*maxProgramCount], r[width*maxProgramCount];
|
||||
for (uniform int i = 0; i < width*maxProgramCount; ++i)
|
||||
a[i] = i;
|
||||
|
||||
// Initialized to -1 so an output that is never written shows up as an error.
float x=-1, y=-1, z=-1;
|
||||
aos_to_soa3(a, 0, x, y, z);
|
||||
|
||||
int errs = 0;
|
||||
if (x != width * programIndex) ++errs;
|
||||
if (y != 1 + width * programIndex) ++errs;
|
||||
if (z != 2 + width * programIndex) ++errs;
|
||||
|
||||
RET[programIndex] = errs;
|
||||
}
|
||||
|
||||
// Writes 0 for every program instance; presumably the reference output the
// test harness compares against f_v (harness not shown here).
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
|
||||
28
tests/aossoa-2.ispc
Normal file
28
tests/aossoa-2.ispc
Normal file
@@ -0,0 +1,28 @@
|
||||
|
||||
// Exported helper that returns programCount.
export uniform int width() { return programCount; }
|
||||
|
||||
// Test body for aos_to_soa4() on float data.  Fills a[] with a[i] = i, loads
// x, y, z, w from offset 0 with aos_to_soa4(), and has each program instance
// check that it received 4*programIndex + {0,1,2,3}.  Each instance stores
// its own mismatch count into RET.
export void f_v(uniform float RET[]) {
|
||||
// From here on the identifier width is the macro value 4, not the exported
// width() function defined earlier in this file.
#define width 4
|
||||
// The array is sized for 16 program instances regardless of programCount.
#define maxProgramCount 16
|
||||
//CO const uniform int width = 4;
|
||||
//CO const uniform int maxProgramCount = 16;
|
||||
// r is declared but never used in this test.
uniform float a[width*maxProgramCount], r[width*maxProgramCount];
|
||||
for (uniform int i = 0; i < width*maxProgramCount; ++i)
|
||||
a[i] = i;
|
||||
|
||||
// Initialized to -1 so an output that is never written shows up as an error.
float x=-1, y=-1, z=-1, w=-1;
|
||||
aos_to_soa4(a, 0, x, y, z, w);
|
||||
|
||||
int errs = 0;
|
||||
if (x != width * programIndex) ++errs;
|
||||
if (y != 1 + width * programIndex) ++errs;
|
||||
if (z != 2 + width * programIndex) ++errs;
|
||||
if (w != 3 + width * programIndex) ++errs;
|
||||
|
||||
RET[programIndex] = errs;
|
||||
}
|
||||
|
||||
// Writes 0 for every program instance; presumably the reference output the
// test harness compares against f_v (harness not shown here).
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
|
||||
27
tests/aossoa-3.ispc
Normal file
27
tests/aossoa-3.ispc
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
// Exported helper that returns programCount.
export uniform int width() { return programCount; }
|
||||
|
||||
// Test body for soa_to_aos4() on float data.  Fills a[] with -1, builds lane
// vectors x, y, z, w whose values are 4*programIndex + {0,1,2,3}, writes them
// to a[] at offset 0 with soa_to_aos4(), then counts the elements where
// a[i] != i.  Every program instance stores that single uniform mismatch
// count into RET.
export void f_v(uniform float RET[]) {
|
||||
// From here on the identifier width is the macro value 4, not the exported
// width() function defined earlier in this file.
#define width 4
|
||||
//CO const uniform int width = 4;
|
||||
// r is declared but never used in this test.
uniform float a[width*programCount], r[width*programCount];
|
||||
for (uniform int i = 0; i < width*programCount; ++i)
|
||||
a[i] = -1;
|
||||
|
||||
float x = width * programIndex;
|
||||
float y = 1 + width * programIndex;
|
||||
float z = 2 + width * programIndex;
|
||||
float w = 3 + width * programIndex;
|
||||
|
||||
soa_to_aos4(x, y, z, w, a, 0);
|
||||
uniform int errs = 0;
|
||||
for (uniform int i = 0; i < width * programCount; ++i)
|
||||
if (a[i] != i) ++errs;
|
||||
|
||||
RET[programIndex] = errs;
|
||||
}
|
||||
|
||||
// Writes 0 for every program instance; presumably the reference output the
// test harness compares against f_v (harness not shown here).
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
|
||||
26
tests/aossoa-4.ispc
Normal file
26
tests/aossoa-4.ispc
Normal file
@@ -0,0 +1,26 @@
|
||||
|
||||
// Exported helper that returns programCount.
export uniform int width() { return programCount; }
|
||||
|
||||
// Test body for soa_to_aos3() on float data.  Fills a[] with -1, builds lane
// vectors x, y, z whose values are 3*programIndex + {0,1,2}, writes them to
// a[] at offset 0 with soa_to_aos3(), then counts the elements where
// a[i] != i.  Every program instance stores that single uniform mismatch
// count into RET.
export void f_v(uniform float RET[]) {
|
||||
// From here on the identifier width is the macro value 3, not the exported
// width() function defined earlier in this file.
#define width 3
|
||||
//CO const uniform int width = 3;
|
||||
// r is declared but never used in this test.
uniform float a[width*programCount], r[width*programCount];
|
||||
for (uniform int i = 0; i < width*programCount; ++i)
|
||||
a[i] = -1;
|
||||
|
||||
float x = width * programIndex;
|
||||
float y = 1 + width * programIndex;
|
||||
float z = 2 + width * programIndex;
|
||||
|
||||
soa_to_aos3(x, y, z, a, 0);
|
||||
uniform int errs = 0;
|
||||
for (uniform int i = 0; i < width * programCount; ++i)
|
||||
if (a[i] != i) ++errs;
|
||||
|
||||
RET[programIndex] = errs;
|
||||
}
|
||||
|
||||
// Writes 0 for every program instance; presumably the reference output the
// test harness compares against f_v (harness not shown here).
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
|
||||
27
tests/aossoa-5.ispc
Normal file
27
tests/aossoa-5.ispc
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
// Exported helper that returns programCount.
export uniform int width() { return programCount; }
|
||||
|
||||
// Test body for aos_to_soa3() on int data.  Fills a[] with a[i] = i, loads
// x, y, z from offset 0 with aos_to_soa3(), and has each program instance
// check that it received 3*programIndex + {0,1,2}.  Each instance stores its
// own mismatch count into RET.
export void f_v(uniform float RET[]) {
|
||||
// From here on the identifier width is the macro value 3, not the exported
// width() function defined earlier in this file.
#define width 3
|
||||
// The array is sized for 16 program instances regardless of programCount.
#define maxProgramCount 16
|
||||
//CO const uniform int width = 3;
|
||||
//CO const uniform int maxProgramCount = 16;
|
||||
// r is declared but never used in this test.
uniform int a[width*maxProgramCount], r[width*maxProgramCount];
|
||||
for (uniform int i = 0; i < width*maxProgramCount; ++i)
|
||||
a[i] = i;
|
||||
|
||||
// Initialized to -1 so an output that is never written shows up as an error.
int x=-1, y=-1, z=-1;
|
||||
aos_to_soa3(a, 0, x, y, z);
|
||||
|
||||
int errs = 0;
|
||||
if (x != width * programIndex) ++errs;
|
||||
if (y != 1 + width * programIndex) ++errs;
|
||||
if (z != 2 + width * programIndex) ++errs;
|
||||
|
||||
RET[programIndex] = errs;
|
||||
}
|
||||
|
||||
// Writes 0 for every program instance; presumably the reference output the
// test harness compares against f_v (harness not shown here).
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
|
||||
28
tests/aossoa-6.ispc
Normal file
28
tests/aossoa-6.ispc
Normal file
@@ -0,0 +1,28 @@
|
||||
|
||||
// Exported helper that returns programCount.
export uniform int width() { return programCount; }
|
||||
|
||||
// Test body for aos_to_soa4() on int data.  Fills a[] with a[i] = i, loads
// x, y, z, w from offset 0 with aos_to_soa4(), and has each program instance
// check that it received 4*programIndex + {0,1,2,3}.  Each instance stores
// its own mismatch count into RET.
export void f_v(uniform float RET[]) {
|
||||
// From here on the identifier width is the macro value 4, not the exported
// width() function defined earlier in this file.
#define width 4
|
||||
// The array is sized for 16 program instances regardless of programCount.
#define maxProgramCount 16
|
||||
//CO const uniform int width = 4;
|
||||
//CO const uniform int maxProgramCount = 16;
|
||||
// r is declared but never used in this test.
uniform int a[width*maxProgramCount], r[width*maxProgramCount];
|
||||
for (uniform int i = 0; i < width*maxProgramCount; ++i)
|
||||
a[i] = i;
|
||||
|
||||
// Initialized to -1 so an output that is never written shows up as an error.
int x=-1, y=-1, z=-1, w=-1;
|
||||
aos_to_soa4(a, 0, x, y, z, w);
|
||||
|
||||
int errs = 0;
|
||||
if (x != width * programIndex) ++errs;
|
||||
if (y != 1 + width * programIndex) ++errs;
|
||||
if (z != 2 + width * programIndex) ++errs;
|
||||
if (w != 3 + width * programIndex) ++errs;
|
||||
|
||||
RET[programIndex] = errs;
|
||||
}
|
||||
|
||||
// Writes 0 for every program instance; presumably the reference output the
// test harness compares against f_v (harness not shown here).
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user