Add routines to standard library to do efficient AOS/SOA conversions.

Currently, we just support 3 and 4-wide variants (i.e. xyzxyz.. and xyzwxyzw..),
for int32 and float types.
This commit is contained in:
Matt Pharr
2011-10-10 10:56:06 -07:00
parent f5391747b9
commit 3cb0115dce
11 changed files with 1041 additions and 2 deletions

View File

@@ -1052,6 +1052,681 @@ define internal <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone al
ret <$1 x i32> %0
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; AOS/SOA conversion primitives
;; 4x4 transpose: take 4 4-wide vectors laid out like
;; <r0 g0 b0 a0> <r1 g1 b1 a1> <r2 g2 b2 a2> <r3 g3 b3 a3>
;; and reorder them to
;; <r0 r1 r2 r3> <g0 g1 g2 g3> <b0 b1 b2 b3> <a0 a1 a2 a3>.
;; The r, g, b and a results are stored to %out0, %out1, %out2 and %out3.
;; In each mask, indices 0-3 pick lanes of the first shufflevector operand
;; and indices 4-7 pick lanes of the second one.
define internal void
@__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
<4 x float> %v3, <4 x float> * noalias %out0,
<4 x float> * noalias %out1, <4 x float> * noalias %out2,
<4 x float> * noalias %out3) nounwind alwaysinline {
;; First pass: interleave matching lanes of each pair of input vectors.
%t0 = shufflevector <4 x float> %v2, <4 x float> %v3, ; r2 r3 g2 g3
<4 x i32> <i32 0, i32 4, i32 1, i32 5>
%t1 = shufflevector <4 x float> %v2, <4 x float> %v3, ; b2 b3 a2 a3
<4 x i32> <i32 2, i32 6, i32 3, i32 7>
%t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; r0 r1 g0 g1
<4 x i32> <i32 0, i32 4, i32 1, i32 5>
%t3 = shufflevector <4 x float> %v0, <4 x float> %v1, ; b0 b1 a0 a1
<4 x i32> <i32 2, i32 6, i32 3, i32 7>
;; Second pass: join the two halves that hold the same component and
;; store each finished component vector.
%r0 = shufflevector <4 x float> %t2, <4 x float> %t0, ; r0 r1 r2 r3
<4 x i32> <i32 0, i32 1, i32 4, i32 5>
store <4 x float> %r0, <4 x float> * %out0
%r1 = shufflevector <4 x float> %t2, <4 x float> %t0, ; g0 g1 g2 g3
<4 x i32> <i32 2, i32 3, i32 6, i32 7>
store <4 x float> %r1, <4 x float> * %out1
%r2 = shufflevector <4 x float> %t3, <4 x float> %t1, ; b0 b1 b2 b3
<4 x i32> <i32 0, i32 1, i32 4, i32 5>
store <4 x float> %r2, <4 x float> * %out2
%r3 = shufflevector <4 x float> %t3, <4 x float> %t1, ; a0 a1 a2 a3
<4 x i32> <i32 2, i32 3, i32 6, i32 7>
store <4 x float> %r3, <4 x float> * %out3
ret void
}
;; Inverse of __aos_to_soa4_float4: reorder <r0 r1 r2 r3> <g0 g1 g2 g3> ...
;; into <r0 g0 b0 a0> <r1 g1 b1 a1> ...
;; Both directions are the same 4x4 transpose, so this routine simply
;; forwards its arguments, unchanged and in order, to __aos_to_soa4_float4.
define internal void
@__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
        <4 x float> %v3, <4 x float> * noalias %out0,
        <4 x float> * noalias %out1, <4 x float> * noalias %out2,
        <4 x float> * noalias %out3) nounwind alwaysinline {
  call void @__aos_to_soa4_float4(
         <4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> %v3,
         <4 x float> * %out0, <4 x float> * %out1,
         <4 x float> * %out2, <4 x float> * %out3)
  ret void
}
;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors
;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>, transpose to
;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3>.
;; The x, y and z results are stored to %out0, %out1 and %out2.
;; In the lane comments below, a dash marks a lane selected as undef.
define internal void
@__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
<4 x float> * noalias %out0, <4 x float> * noalias %out1,
<4 x float> * noalias %out2) nounwind alwaysinline {
;; Gather the x and y values pairwise, then merge the pairs.
%t0 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x0 x1 y0 y1
<4 x i32> <i32 0, i32 3, i32 1, i32 4>
%t1 = shufflevector <4 x float> %v1, <4 x float> %v2, ; x2 x3 y2 y3
<4 x i32> <i32 2, i32 5, i32 3, i32 6>
%r0 = shufflevector <4 x float> %t0, <4 x float> %t1, ; x0 x1 x2 x3
<4 x i32> <i32 0, i32 1, i32 4, i32 5>
store <4 x float> %r0, <4 x float> * %out0
%r1 = shufflevector <4 x float> %t0, <4 x float> %t1, ; y0 y1 y2 y3
<4 x i32> <i32 2, i32 3, i32 6, i32 7>
store <4 x float> %r1, <4 x float> * %out1
;; The z values need one partial gather; z2 and z3 are then taken
;; directly from %v2.
%t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; z0 z1 - -
<4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
%r2 = shufflevector <4 x float> %t2, <4 x float> %v2, ; z0 z1 z2 z3
<4 x i32> <i32 0, i32 1, i32 4, i32 7>
store <4 x float> %r2, <4 x float> * %out2
ret void
}
;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors
;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3> to
;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>.
;; The three interleaved result vectors are stored, in that order, to
;; %out0, %out1 and %out2.
;; In the lane comments below, a dash marks a lane selected as undef.
define internal void
@__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
<4 x float> * noalias %out0, <4 x float> * noalias %out1,
<4 x float> * noalias %out2) nounwind alwaysinline {
;; Two gathers hold every value needed for the first two outputs.
%t0 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x0 x1 x2 y0
<4 x i32> <i32 0, i32 1, i32 2, i32 4>
%t1 = shufflevector <4 x float> %v1, <4 x float> %v2, ; y1 y2 z0 z1
<4 x i32> <i32 1, i32 2, i32 4, i32 5>
%r0 = shufflevector <4 x float> %t0, <4 x float> %t1, ; x0 y0 z0 x1
<4 x i32> <i32 0, i32 3, i32 6, i32 1>
store <4 x float> %r0, <4 x float> * %out0
%r1 = shufflevector <4 x float> %t0, <4 x float> %t1, ; y1 z1 x2 y2
<4 x i32> <i32 4, i32 7, i32 2, i32 5>
store <4 x float> %r1, <4 x float> * %out1
;; The last output combines x3 and y3 with z2 and z3 read directly
;; from %v2.
%t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x3 y3 - -
<4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
%r2 = shufflevector <4 x float> %t2, <4 x float> %v2, ; z2 x3 y3 z3
<4 x i32> <i32 6, i32 0, i32 1, i32 7>
store <4 x float> %r2, <4 x float> * %out2
ret void
}
;; 8-wide
;; These functions implement the 8-wide variants of the AOS/SOA conversion
;; routines above. These implementations are all built on top of the 4-wide
;; vector versions.
;; 8-wide AOS to SOA conversion of 4-component data.  Read in order, the
;; inputs %v0..%v3 hold eight <r g b a> elements, two per 8-vector.  Each
;; element is extracted as a 4-vector and the 4-wide routine transposes
;; four elements at a time: elements 0-3 fill the low half of every
;; output and elements 4-7 fill the high half.
define internal void
@__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
        <8 x float> %v3, <8 x float> * noalias %out0,
        <8 x float> * noalias %out1, <8 x float> * noalias %out2,
        <8 x float> * noalias %out3) nounwind alwaysinline {
  ;; Address each 8-wide output as two adjacent 4-wide halves.
  %out0_lo = bitcast <8 x float> * %out0 to <4 x float> *
  %out0_hi = getelementptr <4 x float> * %out0_lo, i32 1
  %out1_lo = bitcast <8 x float> * %out1 to <4 x float> *
  %out1_hi = getelementptr <4 x float> * %out1_lo, i32 1
  %out2_lo = bitcast <8 x float> * %out2 to <4 x float> *
  %out2_hi = getelementptr <4 x float> * %out2_lo, i32 1
  %out3_lo = bitcast <8 x float> * %out3 to <4 x float> *
  %out3_hi = getelementptr <4 x float> * %out3_lo, i32 1

  ;; Elements 0-3 live in %v0 and %v1 and produce lanes 0-3 of the outputs.
  %elt0 = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %elt1 = shufflevector <8 x float> %v0, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %elt2 = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %elt3 = shufflevector <8 x float> %v1, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__aos_to_soa4_float4(<4 x float> %elt0, <4 x float> %elt1,
         <4 x float> %elt2, <4 x float> %elt3, <4 x float> * %out0_lo,
         <4 x float> * %out1_lo, <4 x float> * %out2_lo, <4 x float> * %out3_lo)

  ;; Elements 4-7 live in %v2 and %v3 and produce lanes 4-7 of the outputs.
  %elt4 = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %elt5 = shufflevector <8 x float> %v2, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %elt6 = shufflevector <8 x float> %v3, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %elt7 = shufflevector <8 x float> %v3, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__aos_to_soa4_float4(<4 x float> %elt4, <4 x float> %elt5,
         <4 x float> %elt6, <4 x float> %elt7, <4 x float> * %out0_hi,
         <4 x float> * %out1_hi, <4 x float> * %out2_hi, <4 x float> * %out3_hi)
  ret void
}
;; 8-wide SOA to AOS conversion of 4-component data.  The inputs are
;; <r0 .. r7> <g0 .. g7> <b0 .. b7> <a0 .. a7>.  Lanes 0-3 of the four
;; components become elements 0-3, which fill %out0 and %out1; lanes 4-7
;; become elements 4-7, which fill %out2 and %out3.  Each call to the
;; 4-wide routine produces 16 AOS values.
define internal void
@__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
        <8 x float> %v3, <8 x float> * noalias %out0,
        <8 x float> * noalias %out1, <8 x float> * noalias %out2,
        <8 x float> * noalias %out3) nounwind alwaysinline {
  ;; Address each 8-wide output as two adjacent 4-wide halves.
  %out0_lo = bitcast <8 x float> * %out0 to <4 x float> *
  %out0_hi = getelementptr <4 x float> * %out0_lo, i32 1
  %out1_lo = bitcast <8 x float> * %out1 to <4 x float> *
  %out1_hi = getelementptr <4 x float> * %out1_lo, i32 1
  %out2_lo = bitcast <8 x float> * %out2 to <4 x float> *
  %out2_hi = getelementptr <4 x float> * %out2_lo, i32 1
  %out3_lo = bitcast <8 x float> * %out3 to <4 x float> *
  %out3_hi = getelementptr <4 x float> * %out3_lo, i32 1

  ;; Lanes 0-3 of every component: <r0 r1 r2 r3> <g0 ..> <b0 ..> <a0 ..>
  %v0_lo = shufflevector <8 x float> %v0, <8 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1_lo = shufflevector <8 x float> %v1, <8 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_lo = shufflevector <8 x float> %v2, <8 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v3_lo = shufflevector <8 x float> %v3, <8 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__soa_to_aos4_float4(<4 x float> %v0_lo, <4 x float> %v1_lo,
         <4 x float> %v2_lo, <4 x float> %v3_lo, <4 x float> * %out0_lo,
         <4 x float> * %out0_hi, <4 x float> * %out1_lo, <4 x float> * %out1_hi)

  ;; Lanes 4-7 of every component: <r4 r5 r6 r7> <g4 ..> <b4 ..> <a4 ..>
  %v0_hi = shufflevector <8 x float> %v0, <8 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1_hi = shufflevector <8 x float> %v1, <8 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v2_hi = shufflevector <8 x float> %v2, <8 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v3_hi = shufflevector <8 x float> %v3, <8 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__soa_to_aos4_float4(<4 x float> %v0_hi, <4 x float> %v1_hi,
         <4 x float> %v2_hi, <4 x float> %v3_hi, <4 x float> * %out2_lo,
         <4 x float> * %out2_hi, <4 x float> * %out3_lo, <4 x float> * %out3_hi)
  ret void
}
;; 8-wide AOS to SOA conversion of 3-component data.  Read in order, the
;; inputs hold 24 floats x0 y0 z0 x1 y1 z1 ... x7 y7 z7.  They are cut into
;; six 4-wide chunks; the first three chunks yield lanes 0-3 of the x, y
;; and z outputs and the last three chunks yield lanes 4-7.
define internal void
@__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
        <8 x float> * noalias %out0, <8 x float> * noalias %out1,
        <8 x float> * noalias %out2) nounwind alwaysinline {
  ;; Address each 8-wide output as two adjacent 4-wide halves.
  %out0_lo = bitcast <8 x float> * %out0 to <4 x float> *
  %out0_hi = getelementptr <4 x float> * %out0_lo, i32 1
  %out1_lo = bitcast <8 x float> * %out1 to <4 x float> *
  %out1_hi = getelementptr <4 x float> * %out1_lo, i32 1
  %out2_lo = bitcast <8 x float> * %out2 to <4 x float> *
  %out2_hi = getelementptr <4 x float> * %out2_lo, i32 1

  ;; Chunks 0-2: <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>
  %c0 = shufflevector <8 x float> %v0, <8 x float> undef,
        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c1 = shufflevector <8 x float> %v0, <8 x float> undef,
        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c2 = shufflevector <8 x float> %v1, <8 x float> undef,
        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__aos_to_soa3_float4(<4 x float> %c0, <4 x float> %c1,
         <4 x float> %c2, <4 x float> * %out0_lo, <4 x float> * %out1_lo,
         <4 x float> * %out2_lo)

  ;; Chunks 3-5: <x4 y4 z4 x5> <y5 z5 x6 y6> <z6 x7 y7 z7>
  %c3 = shufflevector <8 x float> %v1, <8 x float> undef,
        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c4 = shufflevector <8 x float> %v2, <8 x float> undef,
        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c5 = shufflevector <8 x float> %v2, <8 x float> undef,
        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__aos_to_soa3_float4(<4 x float> %c3, <4 x float> %c4,
         <4 x float> %c5, <4 x float> * %out0_hi, <4 x float> * %out1_hi,
         <4 x float> * %out2_hi)
  ret void
}
;; 8-wide SOA to AOS conversion of 3-component data.  The inputs are
;; <x0 .. x7> <y0 .. y7> <z0 .. z7>.  Lanes 0-3 of the three components
;; produce the first 12 AOS values, written to %out0 and the low half of
;; %out1; lanes 4-7 produce the next 12, written to the high half of
;; %out1 and to %out2.
define internal void
@__soa_to_aos3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
        <8 x float> * noalias %out0, <8 x float> * noalias %out1,
        <8 x float> * noalias %out2) nounwind alwaysinline {
  ;; Address each 8-wide output as two adjacent 4-wide halves.
  %out0_lo = bitcast <8 x float> * %out0 to <4 x float> *
  %out0_hi = getelementptr <4 x float> * %out0_lo, i32 1
  %out1_lo = bitcast <8 x float> * %out1 to <4 x float> *
  %out1_hi = getelementptr <4 x float> * %out1_lo, i32 1
  %out2_lo = bitcast <8 x float> * %out2 to <4 x float> *
  %out2_hi = getelementptr <4 x float> * %out2_lo, i32 1

  ;; Lanes 0-3: <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3>
  %v0_lo = shufflevector <8 x float> %v0, <8 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1_lo = shufflevector <8 x float> %v1, <8 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_lo = shufflevector <8 x float> %v2, <8 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__soa_to_aos3_float4(<4 x float> %v0_lo, <4 x float> %v1_lo,
         <4 x float> %v2_lo, <4 x float> * %out0_lo, <4 x float> * %out0_hi,
         <4 x float> * %out1_lo)

  ;; Lanes 4-7: <x4 x5 x6 x7> <y4 y5 y6 y7> <z4 z5 z6 z7>
  %v0_hi = shufflevector <8 x float> %v0, <8 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1_hi = shufflevector <8 x float> %v1, <8 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v2_hi = shufflevector <8 x float> %v2, <8 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__soa_to_aos3_float4(<4 x float> %v0_hi, <4 x float> %v1_hi,
         <4 x float> %v2_hi, <4 x float> * %out1_hi, <4 x float> * %out2_lo,
         <4 x float> * %out2_hi)
  ret void
}
;; 16-wide
;; 16-wide AOS to SOA conversion of 4-component data.  Read in order, the
;; inputs %v0..%v3 hold sixteen <r g b a> elements, four per 16-vector.
;; Each input vector is cut into its four elements and transposed by the
;; 4-wide routine; the result of input vector number k lands in quarter
;; number k (lanes 4k through 4k+3) of every output.
define internal void
@__aos_to_soa4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
        <16 x float> %v3, <16 x float> * noalias %out0,
        <16 x float> * noalias %out1, <16 x float> * noalias %out2,
        <16 x float> * noalias %out3) nounwind alwaysinline {
  ;; Address each 16-wide output as four adjacent 4-wide quarters.
  %out0_q0 = bitcast <16 x float> * %out0 to <4 x float> *
  %out0_q1 = getelementptr <4 x float> * %out0_q0, i32 1
  %out0_q2 = getelementptr <4 x float> * %out0_q0, i32 2
  %out0_q3 = getelementptr <4 x float> * %out0_q0, i32 3
  %out1_q0 = bitcast <16 x float> * %out1 to <4 x float> *
  %out1_q1 = getelementptr <4 x float> * %out1_q0, i32 1
  %out1_q2 = getelementptr <4 x float> * %out1_q0, i32 2
  %out1_q3 = getelementptr <4 x float> * %out1_q0, i32 3
  %out2_q0 = bitcast <16 x float> * %out2 to <4 x float> *
  %out2_q1 = getelementptr <4 x float> * %out2_q0, i32 1
  %out2_q2 = getelementptr <4 x float> * %out2_q0, i32 2
  %out2_q3 = getelementptr <4 x float> * %out2_q0, i32 3
  %out3_q0 = bitcast <16 x float> * %out3 to <4 x float> *
  %out3_q1 = getelementptr <4 x float> * %out3_q0, i32 1
  %out3_q2 = getelementptr <4 x float> * %out3_q0, i32 2
  %out3_q3 = getelementptr <4 x float> * %out3_q0, i32 3

  ;; Elements 0-3 (from %v0) produce lanes 0-3 of the outputs.
  %elt0 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %elt1 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %elt2 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %elt3 = shufflevector <16 x float> %v0, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__aos_to_soa4_float4(<4 x float> %elt0, <4 x float> %elt1,
         <4 x float> %elt2, <4 x float> %elt3, <4 x float> * %out0_q0,
         <4 x float> * %out1_q0, <4 x float> * %out2_q0, <4 x float> * %out3_q0)

  ;; Elements 4-7 (from %v1) produce lanes 4-7 of the outputs.
  %elt4 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %elt5 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %elt6 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %elt7 = shufflevector <16 x float> %v1, <16 x float> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__aos_to_soa4_float4(<4 x float> %elt4, <4 x float> %elt5,
         <4 x float> %elt6, <4 x float> %elt7, <4 x float> * %out0_q1,
         <4 x float> * %out1_q1, <4 x float> * %out2_q1, <4 x float> * %out3_q1)

  ;; Elements 8-11 (from %v2) produce lanes 8-11 of the outputs.
  %elt8 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %elt9 = shufflevector <16 x float> %v2, <16 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %elt10 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %elt11 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__aos_to_soa4_float4(<4 x float> %elt8, <4 x float> %elt9,
         <4 x float> %elt10, <4 x float> %elt11, <4 x float> * %out0_q2,
         <4 x float> * %out1_q2, <4 x float> * %out2_q2, <4 x float> * %out3_q2)

  ;; Elements 12-15 (from %v3) produce lanes 12-15 of the outputs.
  %elt12 = shufflevector <16 x float> %v3, <16 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %elt13 = shufflevector <16 x float> %v3, <16 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %elt14 = shufflevector <16 x float> %v3, <16 x float> undef,
           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %elt15 = shufflevector <16 x float> %v3, <16 x float> undef,
           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__aos_to_soa4_float4(<4 x float> %elt12, <4 x float> %elt13,
         <4 x float> %elt14, <4 x float> %elt15, <4 x float> * %out0_q3,
         <4 x float> * %out1_q3, <4 x float> * %out2_q3, <4 x float> * %out3_q3)
  ret void
}
;; 16-wide SOA to AOS conversion of 4-component data.  The inputs are
;; <r0 .. r15> <g0 .. g15> <b0 .. b15> <a0 .. a15>.  Quarter number k
;; (lanes 4k through 4k+3) of the four components is transposed by the
;; 4-wide routine into four <r g b a> elements, which fill all four
;; quarters of output number k.
define internal void
@__soa_to_aos4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
        <16 x float> %v3, <16 x float> * noalias %out0,
        <16 x float> * noalias %out1, <16 x float> * noalias %out2,
        <16 x float> * noalias %out3) nounwind alwaysinline {
  ;; Address each 16-wide output as four adjacent 4-wide quarters.
  %out0_q0 = bitcast <16 x float> * %out0 to <4 x float> *
  %out0_q1 = getelementptr <4 x float> * %out0_q0, i32 1
  %out0_q2 = getelementptr <4 x float> * %out0_q0, i32 2
  %out0_q3 = getelementptr <4 x float> * %out0_q0, i32 3
  %out1_q0 = bitcast <16 x float> * %out1 to <4 x float> *
  %out1_q1 = getelementptr <4 x float> * %out1_q0, i32 1
  %out1_q2 = getelementptr <4 x float> * %out1_q0, i32 2
  %out1_q3 = getelementptr <4 x float> * %out1_q0, i32 3
  %out2_q0 = bitcast <16 x float> * %out2 to <4 x float> *
  %out2_q1 = getelementptr <4 x float> * %out2_q0, i32 1
  %out2_q2 = getelementptr <4 x float> * %out2_q0, i32 2
  %out2_q3 = getelementptr <4 x float> * %out2_q0, i32 3
  %out3_q0 = bitcast <16 x float> * %out3 to <4 x float> *
  %out3_q1 = getelementptr <4 x float> * %out3_q0, i32 1
  %out3_q2 = getelementptr <4 x float> * %out3_q0, i32 2
  %out3_q3 = getelementptr <4 x float> * %out3_q0, i32 3

  ;; Lanes 0-3 of every component fill %out0.
  %v0_q0 = shufflevector <16 x float> %v0, <16 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1_q0 = shufflevector <16 x float> %v1, <16 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_q0 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v3_q0 = shufflevector <16 x float> %v3, <16 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__soa_to_aos4_float4(<4 x float> %v0_q0, <4 x float> %v1_q0,
         <4 x float> %v2_q0, <4 x float> %v3_q0, <4 x float> * %out0_q0,
         <4 x float> * %out0_q1, <4 x float> * %out0_q2, <4 x float> * %out0_q3)

  ;; Lanes 4-7 of every component fill %out1.
  %v0_q1 = shufflevector <16 x float> %v0, <16 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1_q1 = shufflevector <16 x float> %v1, <16 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v2_q1 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v3_q1 = shufflevector <16 x float> %v3, <16 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__soa_to_aos4_float4(<4 x float> %v0_q1, <4 x float> %v1_q1,
         <4 x float> %v2_q1, <4 x float> %v3_q1, <4 x float> * %out1_q0,
         <4 x float> * %out1_q1, <4 x float> * %out1_q2, <4 x float> * %out1_q3)

  ;; Lanes 8-11 of every component fill %out2.
  %v0_q2 = shufflevector <16 x float> %v0, <16 x float> undef,
           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %v1_q2 = shufflevector <16 x float> %v1, <16 x float> undef,
           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %v2_q2 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %v3_q2 = shufflevector <16 x float> %v3, <16 x float> undef,
           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  call void @__soa_to_aos4_float4(<4 x float> %v0_q2, <4 x float> %v1_q2,
         <4 x float> %v2_q2, <4 x float> %v3_q2, <4 x float> * %out2_q0,
         <4 x float> * %out2_q1, <4 x float> * %out2_q2, <4 x float> * %out2_q3)

  ;; Lanes 12-15 of every component fill %out3.
  %v0_q3 = shufflevector <16 x float> %v0, <16 x float> undef,
           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %v1_q3 = shufflevector <16 x float> %v1, <16 x float> undef,
           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %v2_q3 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %v3_q3 = shufflevector <16 x float> %v3, <16 x float> undef,
           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__soa_to_aos4_float4(<4 x float> %v0_q3, <4 x float> %v1_q3,
         <4 x float> %v2_q3, <4 x float> %v3_q3, <4 x float> * %out3_q0,
         <4 x float> * %out3_q1, <4 x float> * %out3_q2, <4 x float> * %out3_q3)
  ret void
}
;; 16-wide AOS to SOA conversion of 3-component data.  Read in order, the
;; inputs %v0 %v1 %v2 hold 48 floats x0 y0 z0 x1 y1 z1 ... x15 y15 z15.
;; They are cut into twelve consecutive 4-wide chunks c0..c11 (c0-c3 from
;; %v0, c4-c7 from %v1, c8-c11 from %v2).  Every run of three consecutive
;; chunks holds four complete xyz elements, so the 4-wide routine is
;; applied to (c0 c1 c2), (c3 c4 c5), (c6 c7 c8) and (c9 c10 c11), and
;; call number k writes quarter number k (lanes 4k through 4k+3) of the
;; x, y and z outputs %out0, %out1 and %out2.
;; Note that the chunk triples straddle the input vectors: each chunk must
;; be consumed exactly once and all four output quarters must be written.
define internal void
@__aos_to_soa3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
        <16 x float> * noalias %out0, <16 x float> * noalias %out1,
        <16 x float> * noalias %out2) nounwind alwaysinline {
  ;; Address each 16-wide output as four adjacent 4-wide quarters.
  %out0_q0 = bitcast <16 x float> * %out0 to <4 x float> *
  %out0_q1 = getelementptr <4 x float> * %out0_q0, i32 1
  %out0_q2 = getelementptr <4 x float> * %out0_q0, i32 2
  %out0_q3 = getelementptr <4 x float> * %out0_q0, i32 3
  %out1_q0 = bitcast <16 x float> * %out1 to <4 x float> *
  %out1_q1 = getelementptr <4 x float> * %out1_q0, i32 1
  %out1_q2 = getelementptr <4 x float> * %out1_q0, i32 2
  %out1_q3 = getelementptr <4 x float> * %out1_q0, i32 3
  %out2_q0 = bitcast <16 x float> * %out2 to <4 x float> *
  %out2_q1 = getelementptr <4 x float> * %out2_q0, i32 1
  %out2_q2 = getelementptr <4 x float> * %out2_q0, i32 2
  %out2_q3 = getelementptr <4 x float> * %out2_q0, i32 3

  ;; Chunks 0-2: <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>
  %c0 = shufflevector <16 x float> %v0, <16 x float> undef,
        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c1 = shufflevector <16 x float> %v0, <16 x float> undef,
        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c2 = shufflevector <16 x float> %v0, <16 x float> undef,
        <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  call void @__aos_to_soa3_float4(<4 x float> %c0, <4 x float> %c1,
         <4 x float> %c2, <4 x float> * %out0_q0, <4 x float> * %out1_q0,
         <4 x float> * %out2_q0)

  ;; Chunks 3-5: <x4 y4 z4 x5> <y5 z5 x6 y6> <z6 x7 y7 z7>
  ;; (the last chunk of %v0 followed by the first two chunks of %v1)
  %c3 = shufflevector <16 x float> %v0, <16 x float> undef,
        <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %c4 = shufflevector <16 x float> %v1, <16 x float> undef,
        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c5 = shufflevector <16 x float> %v1, <16 x float> undef,
        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__aos_to_soa3_float4(<4 x float> %c3, <4 x float> %c4,
         <4 x float> %c5, <4 x float> * %out0_q1, <4 x float> * %out1_q1,
         <4 x float> * %out2_q1)

  ;; Chunks 6-8: <x8 y8 z8 x9> <y9 z9 x10 y10> <z10 x11 y11 z11>
  ;; (the last two chunks of %v1 followed by the first chunk of %v2)
  %c6 = shufflevector <16 x float> %v1, <16 x float> undef,
        <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %c7 = shufflevector <16 x float> %v1, <16 x float> undef,
        <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %c8 = shufflevector <16 x float> %v2, <16 x float> undef,
        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__aos_to_soa3_float4(<4 x float> %c6, <4 x float> %c7,
         <4 x float> %c8, <4 x float> * %out0_q2, <4 x float> * %out1_q2,
         <4 x float> * %out2_q2)

  ;; Chunks 9-11: <x12 y12 z12 x13> <y13 z13 x14 y14> <z14 x15 y15 z15>
  %c9 = shufflevector <16 x float> %v2, <16 x float> undef,
        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c10 = shufflevector <16 x float> %v2, <16 x float> undef,
         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %c11 = shufflevector <16 x float> %v2, <16 x float> undef,
         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__aos_to_soa3_float4(<4 x float> %c9, <4 x float> %c10,
         <4 x float> %c11, <4 x float> * %out0_q3, <4 x float> * %out1_q3,
         <4 x float> * %out2_q3)
  ret void
}
;; 16-wide SOA to AOS conversion of 3-component data.  The inputs are
;; <x0 .. x15> <y0 .. y15> <z0 .. z15>.  Quarter number k (lanes 4k
;; through 4k+3) of the three components is interleaved by the 4-wide
;; routine into 12 AOS values, i.e. three 4-wide chunks.  The twelve
;; resulting chunks are written in order to the four quarters of %out0,
;; then of %out1, then of %out2.
define internal void
@__soa_to_aos3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
        <16 x float> * noalias %out0, <16 x float> * noalias %out1,
        <16 x float> * noalias %out2) nounwind alwaysinline {
  ;; Address each 16-wide output as four adjacent 4-wide quarters.
  %out0_q0 = bitcast <16 x float> * %out0 to <4 x float> *
  %out0_q1 = getelementptr <4 x float> * %out0_q0, i32 1
  %out0_q2 = getelementptr <4 x float> * %out0_q0, i32 2
  %out0_q3 = getelementptr <4 x float> * %out0_q0, i32 3
  %out1_q0 = bitcast <16 x float> * %out1 to <4 x float> *
  %out1_q1 = getelementptr <4 x float> * %out1_q0, i32 1
  %out1_q2 = getelementptr <4 x float> * %out1_q0, i32 2
  %out1_q3 = getelementptr <4 x float> * %out1_q0, i32 3
  %out2_q0 = bitcast <16 x float> * %out2 to <4 x float> *
  %out2_q1 = getelementptr <4 x float> * %out2_q0, i32 1
  %out2_q2 = getelementptr <4 x float> * %out2_q0, i32 2
  %out2_q3 = getelementptr <4 x float> * %out2_q0, i32 3

  ;; Lanes 0-3 produce output chunks 0-2.
  %v0_q0 = shufflevector <16 x float> %v0, <16 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1_q0 = shufflevector <16 x float> %v1, <16 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_q0 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @__soa_to_aos3_float4(<4 x float> %v0_q0, <4 x float> %v1_q0,
         <4 x float> %v2_q0, <4 x float> * %out0_q0, <4 x float> * %out0_q1,
         <4 x float> * %out0_q2)

  ;; Lanes 4-7 produce output chunks 3-5.
  %v0_q1 = shufflevector <16 x float> %v0, <16 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1_q1 = shufflevector <16 x float> %v1, <16 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v2_q1 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @__soa_to_aos3_float4(<4 x float> %v0_q1, <4 x float> %v1_q1,
         <4 x float> %v2_q1, <4 x float> * %out0_q3, <4 x float> * %out1_q0,
         <4 x float> * %out1_q1)

  ;; Lanes 8-11 produce output chunks 6-8.
  %v0_q2 = shufflevector <16 x float> %v0, <16 x float> undef,
           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %v1_q2 = shufflevector <16 x float> %v1, <16 x float> undef,
           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %v2_q2 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  call void @__soa_to_aos3_float4(<4 x float> %v0_q2, <4 x float> %v1_q2,
         <4 x float> %v2_q2, <4 x float> * %out1_q2, <4 x float> * %out1_q3,
         <4 x float> * %out2_q0)

  ;; Lanes 12-15 produce output chunks 9-11.
  %v0_q3 = shufflevector <16 x float> %v0, <16 x float> undef,
           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %v1_q3 = shufflevector <16 x float> %v1, <16 x float> undef,
           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %v2_q3 = shufflevector <16 x float> %v2, <16 x float> undef,
           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @__soa_to_aos3_float4(<4 x float> %v0_q3, <4 x float> %v1_q3,
         <4 x float> %v2_q3, <4 x float> * %out2_q1, <4 x float> * %out2_q2,
         <4 x float> * %out2_q3)
  ret void
}
;; versions to be called from stdlib
;; Entry point for 4-component float data: loads four consecutive vectors
;; of AOS data starting at float element %offset of %base (the loads only
;; assume 4-byte alignment) and hands them to the width-specific routine,
;; which stores the SOA results through %out0..%out3.
define internal void
@__aos_to_soa4_float([0 x float] * noalias %base, i32 %offset,
        <$1 x float> * noalias %out0, <$1 x float> * noalias %out1,
        <$1 x float> * noalias %out2, <$1 x float> * noalias %out3)
        nounwind alwaysinline {
  ;; Address of the first float to read, then of the four source vectors.
  %elts = bitcast [0 x float] * %base to float *
  %first = getelementptr float * %elts, i32 %offset
  %src0 = bitcast float * %first to <$1 x float> *
  %src1 = getelementptr <$1 x float> * %src0, i32 1
  %src2 = getelementptr <$1 x float> * %src0, i32 2
  %src3 = getelementptr <$1 x float> * %src0, i32 3

  %aos0 = load <$1 x float> * %src0, align 4
  %aos1 = load <$1 x float> * %src1, align 4
  %aos2 = load <$1 x float> * %src2, align 4
  %aos3 = load <$1 x float> * %src3, align 4

  call void @__aos_to_soa4_float$1(<$1 x float> %aos0, <$1 x float> %aos1,
         <$1 x float> %aos2, <$1 x float> %aos3, <$1 x float> * %out0,
         <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3)
  ret void
}
;; int32 flavor of __aos_to_soa4_float.  The conversion only moves 32-bit
;; lanes around, so the array and output pointers are reinterpreted as
;; float pointers and the float routine does all the work.
define internal void
@__aos_to_soa4_int32([0 x i32] * noalias %base, i32 %offset,
        <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
        <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3)
        nounwind alwaysinline {
  %base_f = bitcast [0 x i32] * %base to [0 x float] *
  %out0_f = bitcast <$1 x i32> * %out0 to <$1 x float> *
  %out1_f = bitcast <$1 x i32> * %out1 to <$1 x float> *
  %out2_f = bitcast <$1 x i32> * %out2 to <$1 x float> *
  %out3_f = bitcast <$1 x i32> * %out3 to <$1 x float> *
  call void @__aos_to_soa4_float([0 x float] * %base_f, i32 %offset,
         <$1 x float> * %out0_f, <$1 x float> * %out1_f,
         <$1 x float> * %out2_f, <$1 x float> * %out3_f)
  ret void
}
;; Entry point for writing 4-component float data back in AOS form: the
;; width-specific routine stores its four result vectors to consecutive
;; vector slots starting at float element %offset of %base.
define internal void
@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
        <$1 x float> %v3, [0 x float] * noalias %base,
        i32 %offset) nounwind alwaysinline {
  ;; Address of the first float to write, then of the four destination
  ;; vectors.
  %elts = bitcast [0 x float] * %base to float *
  %first = getelementptr float * %elts, i32 %offset
  %dst0 = bitcast float * %first to <$1 x float> *
  %dst1 = getelementptr <$1 x float> * %dst0, i32 1
  %dst2 = getelementptr <$1 x float> * %dst0, i32 2
  %dst3 = getelementptr <$1 x float> * %dst0, i32 3

  call void @__soa_to_aos4_float$1(<$1 x float> %v0, <$1 x float> %v1,
         <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %dst0,
         <$1 x float> * %dst1, <$1 x float> * %dst2, <$1 x float> * %dst3)
  ret void
}
;; int32 flavor of __soa_to_aos4_float.  The four value vectors and the
;; array pointer are bitcast to their float counterparts (the bits are
;; unchanged) and passed on to the float routine.
define internal void
@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
        <$1 x i32> %v3, [0 x i32] * noalias %base,
        i32 %offset) nounwind alwaysinline {
  %v0_f = bitcast <$1 x i32> %v0 to <$1 x float>
  %v1_f = bitcast <$1 x i32> %v1 to <$1 x float>
  %v2_f = bitcast <$1 x i32> %v2 to <$1 x float>
  %v3_f = bitcast <$1 x i32> %v3 to <$1 x float>
  %base_f = bitcast [0 x i32] * %base to [0 x float] *
  call void @__soa_to_aos4_float(<$1 x float> %v0_f, <$1 x float> %v1_f,
         <$1 x float> %v2_f, <$1 x float> %v3_f, [0 x float] * %base_f,
         i32 %offset)
  ret void
}
;; Entry point for 3-component float data: loads three consecutive vectors
;; of AOS data starting at float element %offset of %base (the loads only
;; assume 4-byte alignment) and hands them to the width-specific routine,
;; which stores the SOA results through %out0, %out1 and %out2.
;; The output pointers are marked noalias to match the 4-component entry
;; point, the int32 wrapper and the width-specific routine they are
;; forwarded to, all of which already require distinct outputs.
define internal void
@__aos_to_soa3_float([0 x float] * noalias %base, i32 %offset,
        <$1 x float> * noalias %out0, <$1 x float> * noalias %out1,
        <$1 x float> * noalias %out2) nounwind alwaysinline {
  ;; Address of the first float to read, then of the three source vectors.
  %pf = bitcast [0 x float] * %base to float *
  %p = getelementptr float * %pf, i32 %offset
  %p0 = bitcast float * %p to <$1 x float> *
  %v0 = load <$1 x float> * %p0, align 4
  %p1 = getelementptr <$1 x float> * %p0, i32 1
  %v1 = load <$1 x float> * %p1, align 4
  %p2 = getelementptr <$1 x float> * %p0, i32 2
  %v2 = load <$1 x float> * %p2, align 4
  call void @__aos_to_soa3_float$1(<$1 x float> %v0, <$1 x float> %v1,
         <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1,
         <$1 x float> * %out2)
  ret void
}
;; int32 flavor of __aos_to_soa3_float.  The conversion only moves 32-bit
;; lanes around, so the array and output pointers are reinterpreted as
;; float pointers and the float routine does all the work.
define internal void
@__aos_to_soa3_int32([0 x i32] * noalias %base, i32 %offset,
        <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
        <$1 x i32> * noalias %out2) nounwind alwaysinline {
  %base_f = bitcast [0 x i32] * %base to [0 x float] *
  %out0_f = bitcast <$1 x i32> * %out0 to <$1 x float> *
  %out1_f = bitcast <$1 x i32> * %out1 to <$1 x float> *
  %out2_f = bitcast <$1 x i32> * %out2 to <$1 x float> *
  call void @__aos_to_soa3_float([0 x float] * %base_f, i32 %offset,
         <$1 x float> * %out0_f, <$1 x float> * %out1_f,
         <$1 x float> * %out2_f)
  ret void
}
;; Entry point for writing 3-component float data back in AOS form: the
;; width-specific routine stores its three result vectors to consecutive
;; vector slots starting at float element %offset of %base.
define internal void
@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
        [0 x float] * noalias %base, i32 %offset) nounwind alwaysinline {
  ;; Address of the first float to write, then of the three destination
  ;; vectors.
  %elts = bitcast [0 x float] * %base to float *
  %first = getelementptr float * %elts, i32 %offset
  %dst0 = bitcast float * %first to <$1 x float> *
  %dst1 = getelementptr <$1 x float> * %dst0, i32 1
  %dst2 = getelementptr <$1 x float> * %dst0, i32 2

  call void @__soa_to_aos3_float$1(<$1 x float> %v0, <$1 x float> %v1,
         <$1 x float> %v2, <$1 x float> * %dst0, <$1 x float> * %dst1,
         <$1 x float> * %dst2)
  ret void
}
;; int32 flavor of __soa_to_aos3_float.  The three value vectors and the
;; array pointer are bitcast to their float counterparts (the bits are
;; unchanged) and passed on to the float routine.
define internal void
@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
        [0 x i32] * noalias %base, i32 %offset) nounwind alwaysinline {
  %v0_f = bitcast <$1 x i32> %v0 to <$1 x float>
  %v1_f = bitcast <$1 x i32> %v1 to <$1 x float>
  %v2_f = bitcast <$1 x i32> %v2 to <$1 x float>
  %base_f = bitcast [0 x i32] * %base to [0 x float] *
  call void @__soa_to_aos3_float(<$1 x float> %v0_f, <$1 x float> %v1_f,
         <$1 x float> %v2_f, [0 x float] * %base_f, i32 %offset)
  ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetching

View File

@@ -89,6 +89,7 @@ Contents:
+ `Math Functions`_
+ `Output Functions`_
+ `Cross-Program Instance Operations`_
+ `Converting Between Array-of-Structures and Structure-of-Arrays Layout`_
+ `Packed Load and Store Operations`_
+ `Conversions To and From Half-Precision Floats`_
+ `Atomic Operations and Memory Fences`_
@@ -2022,6 +2023,97 @@ bitwise-or are available:
unsigned int64 exclusive_scan_or(unsigned int64 v)
Converting Between Array-of-Structures and Structure-of-Arrays Layout
---------------------------------------------------------------------
Applications often lay data out in memory in "array of structures" form.
Though convenient in C/C++ code, this layout can make ``ispc`` programs
less efficient than they would be if the data was laid out in "structure of
arrays" form. (See the section `Understanding How to Interoperate With the
Application's Data`_ for extended discussion of this topic.)
The standard library does provide a few functions that efficiently convert
between these two formats, for cases where it's not possible to change the
application to use "structure of arrays" layout. Consider an array of 3D
(x,y,z) position data laid out in a C array like:
::
// C++ code
float pos[] = { x0, y0, z0, x1, y1, z1, x2, ... };
In an ``ispc`` program, we might want to load a set of (x,y,z) values and
do a computation based on them. The natural expression of this:
::
extern uniform float pos[];
uniform int base = ...;
float x = pos[base + 3 * programIndex]; // x = { x0 x1 x2 ... }
float y = pos[base + 1 + 3 * programIndex]; // y = { y0 y1 y2 ... }
float z = pos[base + 2 + 3 * programIndex]; // z = { z0 z1 z2 ... }
leads to irregular memory accesses and reduced performance. Alternatively,
the ``aos_to_soa3`` standard library function could be used:
::
extern uniform float pos[];
uniform int base = ...;
float x, y, z;
aos_to_soa3(pos, base, x, y, z);
This routine loads ``3*programCount`` values from the given array starting
at the given offset, returning three ``varying`` results. There are both
``int32`` and ``float`` variants of this function:
::
void aos_to_soa3(uniform float a[], uniform int offset, reference float v0,
reference float v1, reference float v2)
void aos_to_soa3(uniform int32 a[], uniform int offset, reference int32 v0,
reference int32 v1, reference int32 v2)
After computation is done, corresponding functions convert back from the
SoA values in ``ispc`` ``varying`` variables and write the values back to
the given array, starting at the given offset.
::
extern uniform float pos[];
uniform int base = ...;
float x, y, z;
aos_to_soa3(pos, base, x, y, z);
// do computation with x, y, z
soa_to_aos3(x, y, z, pos, base);
::
void soa_to_aos3(float v0, float v1, float v2, uniform float a[],
uniform int offset)
void soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[],
uniform int offset)
There are also variants of these functions that convert 4-wide values
between AoS and SoA layouts. In other words, ``aos_to_soa4`` converts AoS
data in memory laid out like ``r0 g0 b0 a0 r1 g1 b1 a1 ...`` to four ``varying``
variables with values ``r0 r1...``, ``g0 g1...``, ``b0 b1...``, and
``a0 a1...``, reading a total of ``4*programCount`` values from the given array,
starting at the given offset.
::
void aos_to_soa4(uniform float a[], uniform int offset, reference float v0,
reference float v1, reference float v2, reference float v3)
void aos_to_soa4(uniform int32 a[], uniform int offset, reference int32 v0,
reference int32 v1, reference int32 v2, reference int32 v3)
void soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[],
uniform int offset)
void soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[],
uniform int offset)
Packed Load and Store Operations
--------------------------------
@@ -2653,8 +2745,13 @@ values are loaded into the local ``x``, ``y``, and ``z`` variables,
SIMD-efficient computation can proceed; getting to that point is
relatively inefficient.
An alternative would be the "structure of arrays" (SoA) layout. In C, the
data would be declared as:
(As described previously in `Converting Between Array-of-Structures and
Structure-of-Arrays Layout`_, this computation could be written more
efficiently using standard library routines to convert from the AoS layout,
if we were given a flat array of ``float`` values.)
An alternative data layout would be the "structure of arrays" (SoA). In C,
the data would be declared as:
::

View File

@@ -0,0 +1,27 @@
// Returns the built-in programCount value to the caller.
export uniform int width() {
    return programCount;
}
// Exercises the int32 overload of soa_to_aos4(): program instance i passes
// the four consecutive values 4*i, 4*i+1, 4*i+2, 4*i+3, so after the call
// a[k] is expected to equal k for every k in [0, 4*programCount).
// RET[] receives the number of elements that did not match.
export void f_v(uniform float RET[]) {
#define width 4
//CO    const uniform int width = 4;
    // Pre-fill with -1 so that elements the routine fails to write are
    // detected by the check below.
    uniform int a[width*programCount];
    for (uniform int i = 0; i < width*programCount; ++i)
        a[i] = -1;

    int x = width * programIndex;
    int y = 1 + width * programIndex;
    int z = 2 + width * programIndex;
    int w = 3 + width * programIndex;
    soa_to_aos4(x, y, z, w, a, 0);

    uniform int errs = 0;
    for (uniform int i = 0; i < width * programCount; ++i)
        if (a[i] != i) ++errs;
    RET[programIndex] = errs;
}
// Stores 0 in each program instance's RET slot.  Presumably this is the
// reference output the test harness compares f_v() against (f_v reports an
// error count, so 0 would mean success) -- confirm against the test runner.
export void result(uniform float RET[])
{
    RET[programIndex] = 0;
}

View File

@@ -0,0 +1,26 @@
// Returns the built-in programCount value to the caller.
export uniform int width() {
    return programCount;
}
// Exercises the int32 overload of soa_to_aos3(): program instance i passes
// the three consecutive values 3*i, 3*i+1, 3*i+2, so after the call a[k] is
// expected to equal k for every k in [0, 3*programCount).
// RET[] receives the number of elements that did not match.
export void f_v(uniform float RET[]) {
#define width 3
//CO    const uniform int width = 3;
    // Pre-fill with -1 so that elements the routine fails to write are
    // detected by the check below.
    uniform int a[width*programCount];
    for (uniform int i = 0; i < width*programCount; ++i)
        a[i] = -1;

    int x = width * programIndex;
    int y = 1 + width * programIndex;
    int z = 2 + width * programIndex;
    soa_to_aos3(x, y, z, a, 0);

    uniform int errs = 0;
    for (uniform int i = 0; i < width * programCount; ++i)
        if (a[i] != i) ++errs;
    RET[programIndex] = errs;
}
// Stores 0 in each program instance's RET slot.  Presumably this is the
// reference output the test harness compares f_v() against (f_v reports an
// error count, so 0 would mean success) -- confirm against the test runner.
export void result(uniform float RET[])
{
    RET[programIndex] = 0;
}

View File

@@ -315,6 +315,57 @@ static inline uniform int lanemask() {
return __movmsk(__mask);
}
///////////////////////////////////////////////////////////////////////////
// AOS/SOA conversion
// float variant of aos_to_soa3(): hands the array, the starting offset and
// the three output references straight to __aos_to_soa3_float.  Per the
// user's guide this reads 3*programCount values from a[] beginning at
// offset and returns them de-interleaved in v0, v1 and v2.
static inline void aos_to_soa3(uniform float a[],
                               uniform int offset,
                               reference float v0,
                               reference float v1,
                               reference float v2)
{
    __aos_to_soa3_float(a, offset, v0, v1, v2);
}
// float variant of soa_to_aos3(): forwards the three varying inputs, the
// destination array and the starting offset to __soa_to_aos3_float.  Per
// the user's guide the values are written back interleaved into a[]
// beginning at offset.
static inline void soa_to_aos3(float v0,
                               float v1,
                               float v2,
                               uniform float a[],
                               uniform int offset)
{
    __soa_to_aos3_float(v0, v1, v2, a, offset);
}
// float variant of aos_to_soa4(): hands the array, the starting offset and
// the four output references straight to __aos_to_soa4_float.  Per the
// user's guide this reads 4*programCount values from a[] beginning at
// offset and returns them de-interleaved in v0, v1, v2 and v3.
static inline void aos_to_soa4(uniform float a[],
                               uniform int offset,
                               reference float v0,
                               reference float v1,
                               reference float v2,
                               reference float v3)
{
    __aos_to_soa4_float(a, offset, v0, v1, v2, v3);
}
// float variant of soa_to_aos4(): forwards the four varying inputs, the
// destination array and the starting offset to __soa_to_aos4_float.  Per
// the user's guide the values are written back interleaved into a[]
// beginning at offset.
static inline void soa_to_aos4(float v0,
                               float v1,
                               float v2,
                               float v3,
                               uniform float a[],
                               uniform int offset)
{
    __soa_to_aos4_float(v0, v1, v2, v3, a, offset);
}
// int32 variant of aos_to_soa3(): hands the array, the starting offset and
// the three output references straight to __aos_to_soa3_int32.  Per the
// user's guide this reads 3*programCount values from a[] beginning at
// offset and returns them de-interleaved in v0, v1 and v2.
static inline void aos_to_soa3(uniform int32 a[],
                               uniform int offset,
                               reference int32 v0,
                               reference int32 v1,
                               reference int32 v2)
{
    __aos_to_soa3_int32(a, offset, v0, v1, v2);
}
// int32 variant of soa_to_aos3(): forwards the three varying inputs, the
// destination array and the starting offset to __soa_to_aos3_int32.  Per
// the user's guide the values are written back interleaved into a[]
// beginning at offset.
static inline void soa_to_aos3(int32 v0,
                               int32 v1,
                               int32 v2,
                               uniform int32 a[],
                               uniform int offset)
{
    __soa_to_aos3_int32(v0, v1, v2, a, offset);
}
// int32 variant of aos_to_soa4(): hands the array, the starting offset and
// the four output references straight to __aos_to_soa4_int32.  Per the
// user's guide this reads 4*programCount values from a[] beginning at
// offset and returns them de-interleaved in v0, v1, v2 and v3.
static inline void aos_to_soa4(uniform int32 a[],
                               uniform int offset,
                               reference int32 v0,
                               reference int32 v1,
                               reference int32 v2,
                               reference int32 v3)
{
    __aos_to_soa4_int32(a, offset, v0, v1, v2, v3);
}
// int32 variant of soa_to_aos4(): forwards the four varying inputs, the
// destination array and the starting offset to __soa_to_aos4_int32.  Per
// the user's guide the values are written back interleaved into a[]
// beginning at offset.
static inline void soa_to_aos4(int32 v0,
                               int32 v1,
                               int32 v2,
                               int32 v3,
                               uniform int32 a[],
                               uniform int offset)
{
    __soa_to_aos4_int32(v0, v1, v2, v3, a, offset);
}
///////////////////////////////////////////////////////////////////////////
// Prefetching

27
tests/aossoa-1.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Returns the built-in programCount value to the caller.
export uniform int width() {
    return programCount;
}
// Exercises the float overload of aos_to_soa3(): a[] is filled with
// 0, 1, 2, ..., so after the call program instance i is expected to hold
// x = 3*i, y = 3*i+1 and z = 3*i+2.  Each instance writes its own count of
// mismatching values to RET[].
export void f_v(uniform float RET[]) {
#define width 3
#define maxProgramCount 16
//CO    const uniform int width = 3;
//CO    const uniform int maxProgramCount = 16;
    // The buffer is sized for maxProgramCount instances rather than
    // programCount.
    uniform float a[width*maxProgramCount];
    for (uniform int i = 0; i < width*maxProgramCount; ++i)
        a[i] = i;

    // Start from -1 so that outputs the routine leaves untouched fail the
    // comparisons below.
    float x=-1, y=-1, z=-1;
    aos_to_soa3(a, 0, x, y, z);

    int errs = 0;
    if (x != width * programIndex) ++errs;
    if (y != 1 + width * programIndex) ++errs;
    if (z != 2 + width * programIndex) ++errs;
    RET[programIndex] = errs;
}
// Stores 0 in each program instance's RET slot.  Presumably this is the
// reference output the test harness compares f_v() against (f_v reports an
// error count, so 0 would mean success) -- confirm against the test runner.
export void result(uniform float RET[])
{
    RET[programIndex] = 0;
}

28
tests/aossoa-2.ispc Normal file
View File

@@ -0,0 +1,28 @@
// Returns the built-in programCount value to the caller.
export uniform int width() {
    return programCount;
}
// Exercises the float overload of aos_to_soa4(): a[] is filled with
// 0, 1, 2, ..., so after the call program instance i is expected to hold
// x = 4*i, y = 4*i+1, z = 4*i+2 and w = 4*i+3.  Each instance writes its
// own count of mismatching values to RET[].
export void f_v(uniform float RET[]) {
#define width 4
#define maxProgramCount 16
//CO    const uniform int width = 4;
//CO    const uniform int maxProgramCount = 16;
    // The buffer is sized for maxProgramCount instances rather than
    // programCount.
    uniform float a[width*maxProgramCount];
    for (uniform int i = 0; i < width*maxProgramCount; ++i)
        a[i] = i;

    // Start from -1 so that outputs the routine leaves untouched fail the
    // comparisons below.
    float x=-1, y=-1, z=-1, w=-1;
    aos_to_soa4(a, 0, x, y, z, w);

    int errs = 0;
    if (x != width * programIndex) ++errs;
    if (y != 1 + width * programIndex) ++errs;
    if (z != 2 + width * programIndex) ++errs;
    if (w != 3 + width * programIndex) ++errs;
    RET[programIndex] = errs;
}
// Stores 0 in each program instance's RET slot.  Presumably this is the
// reference output the test harness compares f_v() against (f_v reports an
// error count, so 0 would mean success) -- confirm against the test runner.
export void result(uniform float RET[])
{
    RET[programIndex] = 0;
}

27
tests/aossoa-3.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Returns the built-in programCount value to the caller.
export uniform int width() {
    return programCount;
}
// Exercises the float overload of soa_to_aos4(): program instance i passes
// the four consecutive values 4*i, 4*i+1, 4*i+2, 4*i+3, so after the call
// a[k] is expected to equal k for every k in [0, 4*programCount).
// RET[] receives the number of elements that did not match.
export void f_v(uniform float RET[]) {
#define width 4
//CO    const uniform int width = 4;
    // Pre-fill with -1 so that elements the routine fails to write are
    // detected by the check below.
    uniform float a[width*programCount];
    for (uniform int i = 0; i < width*programCount; ++i)
        a[i] = -1;

    float x = width * programIndex;
    float y = 1 + width * programIndex;
    float z = 2 + width * programIndex;
    float w = 3 + width * programIndex;
    soa_to_aos4(x, y, z, w, a, 0);

    uniform int errs = 0;
    for (uniform int i = 0; i < width * programCount; ++i)
        if (a[i] != i) ++errs;
    RET[programIndex] = errs;
}
// Stores 0 in each program instance's RET slot.  Presumably this is the
// reference output the test harness compares f_v() against (f_v reports an
// error count, so 0 would mean success) -- confirm against the test runner.
export void result(uniform float RET[])
{
    RET[programIndex] = 0;
}

26
tests/aossoa-4.ispc Normal file
View File

@@ -0,0 +1,26 @@
// Returns the built-in programCount value to the caller.
export uniform int width() {
    return programCount;
}
// Exercises the float overload of soa_to_aos3(): program instance i passes
// the three consecutive values 3*i, 3*i+1, 3*i+2, so after the call a[k] is
// expected to equal k for every k in [0, 3*programCount).
// RET[] receives the number of elements that did not match.
export void f_v(uniform float RET[]) {
#define width 3
//CO    const uniform int width = 3;
    // Pre-fill with -1 so that elements the routine fails to write are
    // detected by the check below.
    uniform float a[width*programCount];
    for (uniform int i = 0; i < width*programCount; ++i)
        a[i] = -1;

    float x = width * programIndex;
    float y = 1 + width * programIndex;
    float z = 2 + width * programIndex;
    soa_to_aos3(x, y, z, a, 0);

    uniform int errs = 0;
    for (uniform int i = 0; i < width * programCount; ++i)
        if (a[i] != i) ++errs;
    RET[programIndex] = errs;
}
// Stores 0 in each program instance's RET slot.  Presumably this is the
// reference output the test harness compares f_v() against (f_v reports an
// error count, so 0 would mean success) -- confirm against the test runner.
export void result(uniform float RET[])
{
    RET[programIndex] = 0;
}

27
tests/aossoa-5.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Returns the built-in programCount value to the caller.
export uniform int width() {
    return programCount;
}
// Exercises the int32 overload of aos_to_soa3(): a[] is filled with
// 0, 1, 2, ..., so after the call program instance i is expected to hold
// x = 3*i, y = 3*i+1 and z = 3*i+2.  Each instance writes its own count of
// mismatching values to RET[].
export void f_v(uniform float RET[]) {
#define width 3
#define maxProgramCount 16
//CO    const uniform int width = 3;
//CO    const uniform int maxProgramCount = 16;
    // The buffer is sized for maxProgramCount instances rather than
    // programCount.
    uniform int a[width*maxProgramCount];
    for (uniform int i = 0; i < width*maxProgramCount; ++i)
        a[i] = i;

    // Start from -1 so that outputs the routine leaves untouched fail the
    // comparisons below.
    int x=-1, y=-1, z=-1;
    aos_to_soa3(a, 0, x, y, z);

    int errs = 0;
    if (x != width * programIndex) ++errs;
    if (y != 1 + width * programIndex) ++errs;
    if (z != 2 + width * programIndex) ++errs;
    RET[programIndex] = errs;
}
// Stores 0 in each program instance's RET slot.  Presumably this is the
// reference output the test harness compares f_v() against (f_v reports an
// error count, so 0 would mean success) -- confirm against the test runner.
export void result(uniform float RET[])
{
    RET[programIndex] = 0;
}

28
tests/aossoa-6.ispc Normal file
View File

@@ -0,0 +1,28 @@
// Returns the built-in programCount value to the caller.
export uniform int width() {
    return programCount;
}
// Exercises the int32 overload of aos_to_soa4(): a[] is filled with
// 0, 1, 2, ..., so after the call program instance i is expected to hold
// x = 4*i, y = 4*i+1, z = 4*i+2 and w = 4*i+3.  Each instance writes its
// own count of mismatching values to RET[].
export void f_v(uniform float RET[]) {
#define width 4
#define maxProgramCount 16
//CO    const uniform int width = 4;
//CO    const uniform int maxProgramCount = 16;
    // The buffer is sized for maxProgramCount instances rather than
    // programCount.
    uniform int a[width*maxProgramCount];
    for (uniform int i = 0; i < width*maxProgramCount; ++i)
        a[i] = i;

    // Start from -1 so that outputs the routine leaves untouched fail the
    // comparisons below.
    int x=-1, y=-1, z=-1, w=-1;
    aos_to_soa4(a, 0, x, y, z, w);

    int errs = 0;
    if (x != width * programIndex) ++errs;
    if (y != 1 + width * programIndex) ++errs;
    if (z != 2 + width * programIndex) ++errs;
    if (w != 3 + width * programIndex) ++errs;
    RET[programIndex] = errs;
}
// Stores 0 in each program instance's RET slot.  Presumably this is the
// reference output the test harness compares f_v() against (f_v reports an
// error count, so 0 would mean success) -- confirm against the test runner.
export void result(uniform float RET[])
{
    RET[programIndex] = 0;
}