From 3cb0115dce00539b834b6841e301c90c71756db7 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 10 Oct 2011 10:56:06 -0700 Subject: [PATCH] Add routines to standard library to do efficient AOS/SOA conversions. Currently, we just support 3 and 4-wide variants (i.e. xyzxyz.. and xyzwxyzw..), for int32 and float types. --- builtins.m4 | 675 ++++++++++++++++++++++++++++++++++++ docs/ispc.txt | 101 +++++- failing_tests/aossoa-7.ispc | 27 ++ failing_tests/aossoa-8.ispc | 26 ++ stdlib.ispc | 51 +++ tests/aossoa-1.ispc | 27 ++ tests/aossoa-2.ispc | 28 ++ tests/aossoa-3.ispc | 27 ++ tests/aossoa-4.ispc | 26 ++ tests/aossoa-5.ispc | 27 ++ tests/aossoa-6.ispc | 28 ++ 11 files changed, 1041 insertions(+), 2 deletions(-) create mode 100644 failing_tests/aossoa-7.ispc create mode 100644 failing_tests/aossoa-8.ispc create mode 100644 tests/aossoa-1.ispc create mode 100644 tests/aossoa-2.ispc create mode 100644 tests/aossoa-3.ispc create mode 100644 tests/aossoa-4.ispc create mode 100644 tests/aossoa-5.ispc create mode 100644 tests/aossoa-6.ispc diff --git a/builtins.m4 b/builtins.m4 index cd704a0b..723dd800 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -1052,6 +1052,681 @@ define internal <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone al ret <$1 x i32> %0 } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; AOS/SOA conversion primitives + +;; take 4 4-wide vectors laid out like ... +;; and reorder them to ... 
+
+;; 4x4 transpose.  Each of %v0..%v3 holds one AOS element <rN gN bN aN>;
+;; the four channel vectors <r0 r1 r2 r3>, <g0 g1 g2 g3>, <b0 b1 b2 b3> and
+;; <a0 a1 a2 a3> are stored through %out0..%out3 respectively.
+define internal void
+@__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
+        <4 x float> %v3, <4 x float> * noalias %out0,
+        <4 x float> * noalias %out1, <4 x float> * noalias %out2,
+        <4 x float> * noalias %out3) nounwind alwaysinline {
+  ;; First pass: for each pair of inputs, interleave their first two lanes
+  ;; (t0, t2) and their last two lanes (t1, t3).
+  %t0 = shufflevector <4 x float> %v2, <4 x float> %v3,  ; r2 r3 g2 g3
+          <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %t1 = shufflevector <4 x float> %v2, <4 x float> %v3,  ; b2 b3 a2 a3
+          <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %t2 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; r0 r1 g0 g1
+          <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %t3 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; b0 b1 a0 a1
+          <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+
+  ;; Second pass: pick the matching halves of two intermediates so that
+  ;; each result holds a single channel for all four elements.
+  %r0 = shufflevector <4 x float> %t2, <4 x float> %t0,  ; r0 r1 r2 r3
+          <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  store <4 x float> %r0, <4 x float> * %out0
+  %r1 = shufflevector <4 x float> %t2, <4 x float> %t0,  ; g0 g1 g2 g3
+          <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  store <4 x float> %r1, <4 x float> * %out1
+  %r2 = shufflevector <4 x float> %t3, <4 x float> %t1,  ; b0 b1 b2 b3
+          <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  store <4 x float> %r2, <4 x float> * %out2
+  %r3 = shufflevector <4 x float> %t3, <4 x float> %t1,  ; a0 a1 a2 a3
+          <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  store <4 x float> %r3, <4 x float> * %out3
+  ret void
+}
+
+
+;; Do the reverse of __aos_to_soa4_float4--reorder <r0 r1 r2 r3> <g0 g1 g2 g3> ..
+;; to <r0 g0 b0 a0> <r1 g1 b1 a1> ...
+;; This is the exact same set of operations that __aos_to_soa4_float4 does
+;; (a 4x4 transpose), so just call that...
+
+define internal void
+@__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
+        <4 x float> %v3, <4 x float> * noalias %out0,
+        <4 x float> * noalias %out1, <4 x float> * noalias %out2,
+        <4 x float> * noalias %out3) nounwind alwaysinline {
+  call void @__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1,
+         <4 x float> %v2, <4 x float> %v3, <4 x float> * %out0,
+         <4 x float> * %out1, <4 x float> * %out2, <4 x float> * %out3)
+  ret void
+}
+
+
+;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors
+;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>, transpose to
+;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3>.
+
+define internal void
+@__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
+        <4 x float> * noalias %out0, <4 x float> * noalias %out1,
+        <4 x float> * noalias %out2) nounwind alwaysinline {
+  ;; Inputs: %v0 = x0 y0 z0 x1, %v1 = y1 z1 x2 y2, %v2 = z2 x3 y3 z3.
+  ;; Gather the x and y pairs first; they feed both %r0 and %r1.
+  %t0 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; x0 x1 y0 y1
+          <4 x i32> <i32 0, i32 3, i32 1, i32 4>
+  %t1 = shufflevector <4 x float> %v1, <4 x float> %v2,  ; x2 x3 y2 y3
+          <4 x i32> <i32 2, i32 5, i32 3, i32 6>
+
+  %r0 = shufflevector <4 x float> %t0, <4 x float> %t1,  ; x0 x1 x2 x3
+          <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  store <4 x float> %r0, <4 x float> * %out0
+
+  %r1 = shufflevector <4 x float> %t0, <4 x float> %t1,  ; y0 y1 y2 y3
+          <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  store <4 x float> %r1, <4 x float> * %out1
+
+  ;; z0 and z1 come from %v0 and %v1; the upper two lanes are never read.
+  %t2 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; z0 z1 x x
+          <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
+
+  %r2 = shufflevector <4 x float> %t2, <4 x float> %v2,  ; z0 z1 z2 z3
+          <4 x i32> <i32 0, i32 1, i32 4, i32 7>
+  store <4 x float> %r2, <4 x float> * %out2
+  ret void
+}
+
+
+;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors
+;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3> to
+;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>.
+
+define internal void
+@__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
+        <4 x float> * noalias %out0, <4 x float> * noalias %out1,
+        <4 x float> * noalias %out2) nounwind alwaysinline {
+  ;; Inputs: %v0 = x0 x1 x2 x3, %v1 = y0 y1 y2 y3, %v2 = z0 z1 z2 z3.
+  %t0 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; x0 x1 x2 y0
+          <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  %t1 = shufflevector <4 x float> %v1, <4 x float> %v2,  ; y1 y2 z0 z1
+          <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+
+  %r0 = shufflevector <4 x float> %t0, <4 x float> %t1,  ; x0 y0 z0 x1
+          <4 x i32> <i32 0, i32 3, i32 6, i32 1>
+  store <4 x float> %r0, <4 x float> * %out0
+  %r1 = shufflevector <4 x float> %t0, <4 x float> %t1,  ; y1 z1 x2 y2
+          <4 x i32> <i32 4, i32 7, i32 2, i32 5>
+  store <4 x float> %r1, <4 x float> * %out1
+
+  ;; x3 and y3 are staged in the low lanes; the upper two are never read.
+  %t2 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; x3 y3 x x
+          <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+
+  %r2 = shufflevector <4 x float> %t2, <4 x float> %v2,  ; z2 x3 y3 z3
+          <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  store <4 x float> %r2, <4 x float> * %out2
+  ret void
+}
+
+;; 8-wide
+;; These functions implement the 8-wide variants of the AOS/SOA conversion
+;; routines above.  These implementations are all built on top of the 4-wide
+;; vector versions.
+ +define internal void +@__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> %v3, <8 x float> * noalias %out0, + <8 x float> * noalias %out1, <8 x float> * noalias %out2, + <8 x float> * noalias %out3) nounwind alwaysinline { + ;; Split each 8-vector into 2 4-vectors + %v0a = shufflevector <8 x float> %v0, <8 x float> undef, + <4 x i32> + %v0b = shufflevector <8 x float> %v0, <8 x float> undef, + <4 x i32> + %v1a = shufflevector <8 x float> %v1, <8 x float> undef, + <4 x i32> + %v1b = shufflevector <8 x float> %v1, <8 x float> undef, + <4 x i32> + %v2a = shufflevector <8 x float> %v2, <8 x float> undef, + <4 x i32> + %v2b = shufflevector <8 x float> %v2, <8 x float> undef, + <4 x i32> + %v3a = shufflevector <8 x float> %v3, <8 x float> undef, + <4 x i32> + %v3b = shufflevector <8 x float> %v3, <8 x float> undef, + <4 x i32> + + ;; Similarly for the output pointers + %out0a = bitcast <8 x float> * %out0 to <4 x float> * + %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out1a = bitcast <8 x float> * %out1 to <4 x float> * + %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out2a = bitcast <8 x float> * %out2 to <4 x float> * + %out2b = getelementptr <4 x float> * %out2a, i32 1 + %out3a = bitcast <8 x float> * %out3 to <4 x float> * + %out3b = getelementptr <4 x float> * %out3a, i32 1 + + ;; Do the first part--given input vectors like + ;; , + ;; pass 3 4-vectors to the 4-vec + ;; version to compute the first 4 SOA values for the three output variables. 
+ call void @__aos_to_soa4_float4(<4 x float> %v0a, <4 x float> %v0b, + <4 x float> %v1a, <4 x float> %v1b, <4 x float> * %out0a, + <4 x float> * %out1a, <4 x float> * %out2a, <4 x float> * %out3a) + + ;; And similarly pass to the 4-wide + ;; version to compute the second 4 SOA values for the three outputs + call void @__aos_to_soa4_float4(<4 x float> %v2a, <4 x float> %v2b, + <4 x float> %v3a, <4 x float> %v3b, <4 x float> * %out0b, + <4 x float> * %out1b, <4 x float> * %out2b, <4 x float> * %out3b) + ret void +} + + +define internal void +@__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> %v3, <8 x float> * noalias %out0, + <8 x float> * noalias %out1, <8 x float> * noalias %out2, + <8 x float> * noalias %out3) nounwind alwaysinline { + ;; As above, split into 4-vectors and 4-wide outputs... + %v0a = shufflevector <8 x float> %v0, <8 x float> undef, + <4 x i32> + %v0b = shufflevector <8 x float> %v0, <8 x float> undef, + <4 x i32> + %v1a = shufflevector <8 x float> %v1, <8 x float> undef, + <4 x i32> + %v1b = shufflevector <8 x float> %v1, <8 x float> undef, + <4 x i32> + %v2a = shufflevector <8 x float> %v2, <8 x float> undef, + <4 x i32> + %v2b = shufflevector <8 x float> %v2, <8 x float> undef, + <4 x i32> + %v3a = shufflevector <8 x float> %v3, <8 x float> undef, + <4 x i32> + %v3b = shufflevector <8 x float> %v3, <8 x float> undef, + <4 x i32> + + %out0a = bitcast <8 x float> * %out0 to <4 x float> * + %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out1a = bitcast <8 x float> * %out1 to <4 x float> * + %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out2a = bitcast <8 x float> * %out2 to <4 x float> * + %out2b = getelementptr <4 x float> * %out2a, i32 1 + %out3a = bitcast <8 x float> * %out3 to <4 x float> * + %out3b = getelementptr <4 x float> * %out3a, i32 1 + + ;; First part--given input vectors + ;; + ;; pass 3 4-vectors to + ;; compute the first 12 AOS output values. 
+ call void @__soa_to_aos4_float4(<4 x float> %v0a, <4 x float> %v1a, + <4 x float> %v2a, <4 x float> %v3a, <4 x float> * %out0a, + <4 x float> * %out0b, <4 x float> * %out1a, <4 x float> * %out1b) + + ;; And then pass the 3 4-vectors + ;; To compute the next 12 AOS output values + call void @__soa_to_aos4_float4(<4 x float> %v0b, <4 x float> %v1b, + <4 x float> %v2b, <4 x float> %v3b, <4 x float> * %out2a, + <4 x float> * %out2b, <4 x float> * %out3a, <4 x float> * %out3b) + ret void +} + + +define internal void +@__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> * noalias %out0, <8 x float> * noalias %out1, + <8 x float> * noalias %out2) nounwind alwaysinline { + %v0a = shufflevector <8 x float> %v0, <8 x float> undef, + <4 x i32> + %v0b = shufflevector <8 x float> %v0, <8 x float> undef, + <4 x i32> + %v1a = shufflevector <8 x float> %v1, <8 x float> undef, + <4 x i32> + %v1b = shufflevector <8 x float> %v1, <8 x float> undef, + <4 x i32> + %v2a = shufflevector <8 x float> %v2, <8 x float> undef, + <4 x i32> + %v2b = shufflevector <8 x float> %v2, <8 x float> undef, + <4 x i32> + + %out0a = bitcast <8 x float> * %out0 to <4 x float> * + %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out1a = bitcast <8 x float> * %out1 to <4 x float> * + %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out2a = bitcast <8 x float> * %out2 to <4 x float> * + %out2b = getelementptr <4 x float> * %out2a, i32 1 + + call void @__aos_to_soa3_float4(<4 x float> %v0a, <4 x float> %v0b, + <4 x float> %v1a, <4 x float> * %out0a, <4 x float> * %out1a, + <4 x float> * %out2a) + call void @__aos_to_soa3_float4(<4 x float> %v1b, <4 x float> %v2a, + <4 x float> %v2b, <4 x float> * %out0b, <4 x float> * %out1b, + <4 x float> * %out2b) + ret void +} + + +define internal void +@__soa_to_aos3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> * noalias %out0, <8 x float> * noalias %out1, + <8 x float> * noalias %out2) nounwind 
alwaysinline { + %v0a = shufflevector <8 x float> %v0, <8 x float> undef, + <4 x i32> + %v0b = shufflevector <8 x float> %v0, <8 x float> undef, + <4 x i32> + %v1a = shufflevector <8 x float> %v1, <8 x float> undef, + <4 x i32> + %v1b = shufflevector <8 x float> %v1, <8 x float> undef, + <4 x i32> + %v2a = shufflevector <8 x float> %v2, <8 x float> undef, + <4 x i32> + %v2b = shufflevector <8 x float> %v2, <8 x float> undef, + <4 x i32> + + %out0a = bitcast <8 x float> * %out0 to <4 x float> * + %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out1a = bitcast <8 x float> * %out1 to <4 x float> * + %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out2a = bitcast <8 x float> * %out2 to <4 x float> * + %out2b = getelementptr <4 x float> * %out2a, i32 1 + + call void @__soa_to_aos3_float4(<4 x float> %v0a, <4 x float> %v1a, + <4 x float> %v2a, <4 x float> * %out0a, <4 x float> * %out0b, + <4 x float> * %out1a) + call void @__soa_to_aos3_float4(<4 x float> %v0b, <4 x float> %v1b, + <4 x float> %v2b, <4 x float> * %out1b, <4 x float> * %out2a, + <4 x float> * %out2b) + ret void +} + +;; 16-wide + +define internal void +@__aos_to_soa4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> %v3, <16 x float> * noalias %out0, + <16 x float> * noalias %out1, <16 x float> * noalias %out2, + <16 x float> * noalias %out3) nounwind alwaysinline { + %v0a = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v0b = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v0c = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v0d = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v1a = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v1b = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v1c = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v1d = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v2a = shufflevector <16 x 
float> %v2, <16 x float> undef, + <4 x i32> + %v2b = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v2c = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v2d = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v3a = shufflevector <16 x float> %v3, <16 x float> undef, + <4 x i32> + %v3b = shufflevector <16 x float> %v3, <16 x float> undef, + <4 x i32> + %v3c = shufflevector <16 x float> %v3, <16 x float> undef, + <4 x i32> + %v3d = shufflevector <16 x float> %v3, <16 x float> undef, + <4 x i32> + + %out0a = bitcast <16 x float> * %out0 to <4 x float> * + %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out0c = getelementptr <4 x float> * %out0a, i32 2 + %out0d = getelementptr <4 x float> * %out0a, i32 3 + %out1a = bitcast <16 x float> * %out1 to <4 x float> * + %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out1c = getelementptr <4 x float> * %out1a, i32 2 + %out1d = getelementptr <4 x float> * %out1a, i32 3 + %out2a = bitcast <16 x float> * %out2 to <4 x float> * + %out2b = getelementptr <4 x float> * %out2a, i32 1 + %out2c = getelementptr <4 x float> * %out2a, i32 2 + %out2d = getelementptr <4 x float> * %out2a, i32 3 + %out3a = bitcast <16 x float> * %out3 to <4 x float> * + %out3b = getelementptr <4 x float> * %out3a, i32 1 + %out3c = getelementptr <4 x float> * %out3a, i32 2 + %out3d = getelementptr <4 x float> * %out3a, i32 3 + + call void @__aos_to_soa4_float4(<4 x float> %v0a, <4 x float> %v0b, + <4 x float> %v0c, <4 x float> %v0d, <4 x float> * %out0a, + <4 x float> * %out1a, <4 x float> * %out2a, <4 x float> * %out3a) + call void @__aos_to_soa4_float4(<4 x float> %v1a, <4 x float> %v1b, + <4 x float> %v1c, <4 x float> %v1d, <4 x float> * %out0b, + <4 x float> * %out1b, <4 x float> * %out2b, <4 x float> * %out3b) + call void @__aos_to_soa4_float4(<4 x float> %v2a, <4 x float> %v2b, + <4 x float> %v2c, <4 x float> %v2d, <4 x float> * %out0c, + <4 x float> * %out1c, <4 x float> * %out2c, <4 x 
float> * %out3c) + call void @__aos_to_soa4_float4(<4 x float> %v3a, <4 x float> %v3b, + <4 x float> %v3c, <4 x float> %v3d, <4 x float> * %out0d, + <4 x float> * %out1d, <4 x float> * %out2d, <4 x float> * %out3d) + ret void +} + + +define internal void +@__soa_to_aos4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> %v3, <16 x float> * noalias %out0, + <16 x float> * noalias %out1, <16 x float> * noalias %out2, + <16 x float> * noalias %out3) nounwind alwaysinline { + %v0a = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v0b = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v0c = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v0d = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v1a = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v1b = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v1c = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v1d = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v2a = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v2b = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v2c = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v2d = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v3a = shufflevector <16 x float> %v3, <16 x float> undef, + <4 x i32> + %v3b = shufflevector <16 x float> %v3, <16 x float> undef, + <4 x i32> + %v3c = shufflevector <16 x float> %v3, <16 x float> undef, + <4 x i32> + %v3d = shufflevector <16 x float> %v3, <16 x float> undef, + <4 x i32> + + %out0a = bitcast <16 x float> * %out0 to <4 x float> * + %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out0c = getelementptr <4 x float> * %out0a, i32 2 + %out0d = getelementptr <4 x float> * %out0a, i32 3 + %out1a = bitcast <16 x float> * %out1 to <4 x float> * + %out1b = getelementptr <4 x float> * %out1a, i32 1 + 
%out1c = getelementptr <4 x float> * %out1a, i32 2 + %out1d = getelementptr <4 x float> * %out1a, i32 3 + %out2a = bitcast <16 x float> * %out2 to <4 x float> * + %out2b = getelementptr <4 x float> * %out2a, i32 1 + %out2c = getelementptr <4 x float> * %out2a, i32 2 + %out2d = getelementptr <4 x float> * %out2a, i32 3 + %out3a = bitcast <16 x float> * %out3 to <4 x float> * + %out3b = getelementptr <4 x float> * %out3a, i32 1 + %out3c = getelementptr <4 x float> * %out3a, i32 2 + %out3d = getelementptr <4 x float> * %out3a, i32 3 + + call void @__soa_to_aos4_float4(<4 x float> %v0a, <4 x float> %v1a, + <4 x float> %v2a, <4 x float> %v3a, <4 x float> * %out0a, + <4 x float> * %out0b, <4 x float> * %out0c, <4 x float> * %out0d) + call void @__soa_to_aos4_float4(<4 x float> %v0b, <4 x float> %v1b, + <4 x float> %v2b, <4 x float> %v3b, <4 x float> * %out1a, + <4 x float> * %out1b, <4 x float> * %out1c, <4 x float> * %out1d) + call void @__soa_to_aos4_float4(<4 x float> %v0c, <4 x float> %v1c, + <4 x float> %v2c, <4 x float> %v3c, <4 x float> * %out2a, + <4 x float> * %out2b, <4 x float> * %out2c, <4 x float> * %out2d) + call void @__soa_to_aos4_float4(<4 x float> %v0d, <4 x float> %v1d, + <4 x float> %v2d, <4 x float> %v3d, <4 x float> * %out3a, + <4 x float> * %out3b, <4 x float> * %out3c, <4 x float> * %out3d) + ret void +}
+
+
+;; 16-wide AOS to SOA conversion for 3-element structures.  The three
+;; inputs together hold 16 consecutive <x y z> elements (48 floats).  They
+;; are split into twelve 4-vectors; every run of three consecutive
+;; 4-vectors covers four whole elements and is handed to
+;; __aos_to_soa3_float4, which fills one quarter of each 16-wide output.
+define internal void
+@__aos_to_soa3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
+        <16 x float> * noalias %out0, <16 x float> * noalias %out1,
+        <16 x float> * noalias %out2) nounwind alwaysinline {
+  ;; Quarter each input.  In memory order the pieces are
+  ;; v0a v0b v0c v0d v1a v1b v1c v1d v2a v2b v2c v2d.
+  %v0a = shufflevector <16 x float> %v0, <16 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0b = shufflevector <16 x float> %v0, <16 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v0c = shufflevector <16 x float> %v0, <16 x float> undef,
+          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v0d = shufflevector <16 x float> %v0, <16 x float> undef,
+          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v1a = shufflevector <16 x float> %v1, <16 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1b = shufflevector <16 x float> %v1, <16 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1c = shufflevector <16 x float> %v1, <16 x float> undef,
+          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v1d = shufflevector <16 x float> %v1, <16 x float> undef,
+          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v2a = shufflevector <16 x float> %v2, <16 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2b = shufflevector <16 x float> %v2, <16 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v2c = shufflevector <16 x float> %v2, <16 x float> undef,
+          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v2d = shufflevector <16 x float> %v2, <16 x float> undef,
+          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ;; View each 16-wide output as four consecutive 4-wide slots a..d.
+  %out0a = bitcast <16 x float> * %out0 to <4 x float> *
+  %out0b = getelementptr <4 x float> * %out0a, i32 1
+  %out0c = getelementptr <4 x float> * %out0a, i32 2
+  %out0d = getelementptr <4 x float> * %out0a, i32 3
+  %out1a = bitcast <16 x float> * %out1 to <4 x float> *
+  %out1b = getelementptr <4 x float> * %out1a, i32 1
+  %out1c = getelementptr <4 x float> * %out1a, i32 2
+  %out1d = getelementptr <4 x float> * %out1a, i32 3
+  %out2a = bitcast <16 x float> * %out2 to <4 x float> *
+  %out2b = getelementptr <4 x float> * %out2a, i32 1
+  %out2c = getelementptr <4 x float> * %out2a, i32 2
+  %out2d = getelementptr <4 x float> * %out2a, i32 3
+
+  ;; Elements 0-3 live in v0a v0b v0c.
+  call void @__aos_to_soa3_float4(<4 x float> %v0a, <4 x float> %v0b,
+         <4 x float> %v0c, <4 x float> * %out0a, <4 x float> * %out1a,
+         <4 x float> * %out2a)
+  ;; Elements 4-7 live in v0d v1a v1b.
+  call void @__aos_to_soa3_float4(<4 x float> %v0d, <4 x float> %v1a,
+         <4 x float> %v1b, <4 x float> * %out0b, <4 x float> * %out1b,
+         <4 x float> * %out2b)
+  ;; Elements 8-11 live in v1c v1d v2a.
+  call void @__aos_to_soa3_float4(<4 x float> %v1c, <4 x float> %v1d,
+         <4 x float> %v2a, <4 x float> * %out0c, <4 x float> * %out1c,
+         <4 x float> * %out2c)
+  ;; Elements 12-15 live in v2b v2c v2d.
+  call void @__aos_to_soa3_float4(<4 x float> %v2b, <4 x float> %v2c,
+         <4 x float> %v2d, <4 x float> * %out0d, <4 x float> * %out1d,
+         <4 x float> * %out2d)
+  ret void
+}
+
+
+define internal void +@__soa_to_aos3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> * noalias %out0, <16 x float> * noalias %out1, + <16 x float> * noalias %out2) nounwind alwaysinline { + %v0a = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v0b = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v0c = shufflevector <16 x float> %v0, <16 x
float> undef, + <4 x i32> + %v0d = shufflevector <16 x float> %v0, <16 x float> undef, + <4 x i32> + %v1a = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v1b = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v1c = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v1d = shufflevector <16 x float> %v1, <16 x float> undef, + <4 x i32> + %v2a = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v2b = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v2c = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + %v2d = shufflevector <16 x float> %v2, <16 x float> undef, + <4 x i32> + + %out0a = bitcast <16 x float> * %out0 to <4 x float> * + %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out0c = getelementptr <4 x float> * %out0a, i32 2 + %out0d = getelementptr <4 x float> * %out0a, i32 3 + %out1a = bitcast <16 x float> * %out1 to <4 x float> * + %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out1c = getelementptr <4 x float> * %out1a, i32 2 + %out1d = getelementptr <4 x float> * %out1a, i32 3 + %out2a = bitcast <16 x float> * %out2 to <4 x float> * + %out2b = getelementptr <4 x float> * %out2a, i32 1 + %out2c = getelementptr <4 x float> * %out2a, i32 2 + %out2d = getelementptr <4 x float> * %out2a, i32 3 + + call void @__soa_to_aos3_float4(<4 x float> %v0a, <4 x float> %v1a, + <4 x float> %v2a, <4 x float> * %out0a, <4 x float> * %out0b, + <4 x float> * %out0c) + call void @__soa_to_aos3_float4(<4 x float> %v0b, <4 x float> %v1b, + <4 x float> %v2b, <4 x float> * %out0d, <4 x float> * %out1a, + <4 x float> * %out1b) + call void @__soa_to_aos3_float4(<4 x float> %v0c, <4 x float> %v1c, + <4 x float> %v2c, <4 x float> * %out1c, <4 x float> * %out1d, + <4 x float> * %out2a) + call void @__soa_to_aos3_float4(<4 x float> %v0d, <4 x float> %v1d, + <4 x float> %v2d, <4 x float> * %out2b, <4 x float> * %out2c, + <4 x float> * %out2d) + ret void +} + +;; versions to 
be called from stdlib + +define internal void +@__aos_to_soa4_float([0 x float] * noalias %base, i32 %offset, + <$1 x float> * noalias %out0, <$1 x float> * noalias %out1, + <$1 x float> * noalias %out2, <$1 x float> * noalias %out3) + nounwind alwaysinline { + %pf = bitcast [0 x float] * %base to float * + %p = getelementptr float * %pf, i32 %offset + %p0 = bitcast float * %p to <$1 x float> * + %v0 = load <$1 x float> * %p0, align 4 + %p1 = getelementptr <$1 x float> * %p0, i32 1 + %v1 = load <$1 x float> * %p1, align 4 + %p2 = getelementptr <$1 x float> * %p0, i32 2 + %v2 = load <$1 x float> * %p2, align 4 + %p3 = getelementptr <$1 x float> * %p0, i32 3 + %v3 = load <$1 x float> * %p3, align 4 + call void @__aos_to_soa4_float$1(<$1 x float> %v0, <$1 x float> %v1, + <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, + <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) + ret void +} + + +define internal void +@__aos_to_soa4_int32([0 x i32] * noalias %base, i32 %offset, + <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, + <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3) + nounwind alwaysinline { + %fbase = bitcast [0 x i32] * %base to [0 x float] * + %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * + %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * + %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> * + %fout3 = bitcast <$1 x i32> * %out3 to <$1 x float> * + call void @__aos_to_soa4_float([0 x float] * %fbase, i32 %offset, + <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2, + <$1 x float> * %fout3) + ret void +} + + +define internal void +@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, + <$1 x float> %v3, [0 x float] * noalias %base, + i32 %offset) nounwind alwaysinline { + %pf = bitcast [0 x float] * %base to float * + %p = getelementptr float * %pf, i32 %offset + %out0 = bitcast float * %p to <$1 x float> * + %out1 = getelementptr <$1 x float> * %out0, i32 1 + %out2 = 
getelementptr <$1 x float> * %out0, i32 2 + %out3 = getelementptr <$1 x float> * %out0, i32 3 + call void @__soa_to_aos4_float$1(<$1 x float> %v0, <$1 x float> %v1, + <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, + <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) + ret void +} + + +define internal void +@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, + <$1 x i32> %v3, [0 x i32] * noalias %base, + i32 %offset) nounwind alwaysinline { + %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> + %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> + %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> + %fv3 = bitcast <$1 x i32> %v3 to <$1 x float> + %fbase = bitcast [0 x i32] * %base to [0 x float] * + call void @__soa_to_aos4_float(<$1 x float> %fv0, <$1 x float> %fv1, + <$1 x float> %fv2, <$1 x float> %fv3, [0 x float] * %fbase, + i32 %offset) + ret void +} + + +define internal void +@__aos_to_soa3_float([0 x float] * noalias %base, i32 %offset, + <$1 x float> * %out0, <$1 x float> * %out1, + <$1 x float> * %out2) nounwind alwaysinline { + %pf = bitcast [0 x float] * %base to float * + %p = getelementptr float * %pf, i32 %offset + %p0 = bitcast float * %p to <$1 x float> * + %v0 = load <$1 x float> * %p0, align 4 + %p1 = getelementptr <$1 x float> * %p0, i32 1 + %v1 = load <$1 x float> * %p1, align 4 + %p2 = getelementptr <$1 x float> * %p0, i32 2 + %v2 = load <$1 x float> * %p2, align 4 + call void @__aos_to_soa3_float$1(<$1 x float> %v0, <$1 x float> %v1, + <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, + <$1 x float> * %out2) + ret void +} + + +define internal void +@__aos_to_soa3_int32([0 x i32] * noalias %base, i32 %offset, + <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, + <$1 x i32> * noalias %out2) nounwind alwaysinline { + %fbase = bitcast [0 x i32] * %base to [0 x float] * + %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * + %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * + %fout2 = bitcast <$1 x 
i32> * %out2 to <$1 x float> * + call void @__aos_to_soa3_float([0 x float] * %fbase, i32 %offset, + <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2) + ret void +} + + +define internal void +@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, + [0 x float] * noalias %base, i32 %offset) nounwind alwaysinline { + %pf = bitcast [0 x float] * %base to float * + %p = getelementptr float * %pf, i32 %offset + %out0 = bitcast float * %p to <$1 x float> * + %out1 = getelementptr <$1 x float> * %out0, i32 1 + %out2 = getelementptr <$1 x float> * %out0, i32 2 + call void @__soa_to_aos3_float$1(<$1 x float> %v0, <$1 x float> %v1, + <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, + <$1 x float> * %out2) + ret void +} + + +define internal void +@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, + [0 x i32] * noalias %base, i32 %offset) nounwind alwaysinline { + %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> + %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> + %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> + %fbase = bitcast [0 x i32] * %base to [0 x float] * + call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1, + <$1 x float> %fv2, [0 x float] * %fbase, i32 %offset) + ret void +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; prefetching diff --git a/docs/ispc.txt b/docs/ispc.txt index ad2254d2..43f30cd8 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -89,6 +89,7 @@ Contents: + `Math Functions`_ + `Output Functions`_ + `Cross-Program Instance Operations`_ + + `Converting Between Array-of-Structures and Structure-of-Arrays Layout`_ + `Packed Load and Store Operations`_ + `Conversions To and From Half-Precision Floats`_ + `Atomic Operations and Memory Fences`_ @@ -2022,6 +2023,97 @@ bitwise-or are available: unsigned int64 exclusive_scan_or(unsigned int64 v) +Converting Between Array-of-Structures and Structure-of-Arrays Layout 
+--------------------------------------------------------------------- + +Applications often lay data out in memory in "array of structures" form. +Though convenient in C/C++ code, this layout can make ``ispc`` programs +less efficient than they would be if the data were laid out in "structure of +arrays" form. (See the section `Understanding How to Interoperate With the +Application's Data`_ for extended discussion of this topic.) + +The standard library does provide a few functions that efficiently convert +between these two formats, for cases where it's not possible to change the +application to use "structure of arrays" layout. Consider an array of 3D +(x,y,z) position data laid out in a C array like: + +:: + + // C++ code + float pos[] = { x0, y0, z0, x1, y1, z1, x2, ... }; + + +In an ``ispc`` program, we might want to load a set of (x,y,z) values and +do a computation based on them. The natural expression of this: + +:: + + extern uniform float pos[]; + uniform int base = ...; + float x = pos[base + 3 * programIndex]; // x = { x0 x1 x2 ... } + float y = pos[base + 1 + 3 * programIndex]; // y = { y0 y1 y2 ... } + float z = pos[base + 2 + 3 * programIndex]; // z = { z0 z1 z2 ... } + +leads to irregular memory accesses and reduced performance. Alternatively, +the ``aos_to_soa3`` standard library function could be used: + +:: + + extern uniform float pos[]; + uniform int base = ...; + float x, y, z; + aos_to_soa3(pos, base, x, y, z); + +This routine loads ``3*programCount`` values from the given array starting +at the given offset, returning three ``varying`` results. 
There are both +``int32`` and ``float`` variants of this function: + +:: + + void aos_to_soa3(uniform float a[], uniform int offset, reference float v0, + reference float v1, reference float v2) + void aos_to_soa3(uniform int32 a[], uniform int offset, reference int32 v0, + reference int32 v1, reference int32 v2) + +After computation is done, corresponding functions convert back from the +SoA values in ``ispc`` ``varying`` variables and write the values back to +the given array, starting at the given offset. + +:: + + extern uniform float pos[]; + uniform int base = ...; + float x, y, z; + aos_to_soa3(pos, base, x, y, z); + // do computation with x, y, z + soa_to_aos3(x, y, z, pos, base); + +:: + + void soa_to_aos3(float v0, float v1, float v2, uniform float a[], + uniform int offset) + void soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[], + uniform int offset) + +There are also variants of these functions that convert 4-wide values +between AoS and SoA layouts. In other words, ``aos_to_soa4`` converts AoS +data in memory laid out like ``r0 g0 b0 a0 r1 g1 b1 a1 ...`` to four ``varying`` +variables with values ``r0 r1...``, ``g0 g1...``, ``b0 b1...``, and ``a0 +a1...``, reading a total of ``4*programCount`` values from the given array, +starting at the given offset. 
+ +:: + + void aos_to_soa4(uniform float a[], uniform int offset, reference float v0, + reference float v1, reference float v2, reference float v3) + void aos_to_soa4(uniform int32 a[], uniform int offset, reference int32 v0, + reference int32 v1, reference int32 v2, reference int32 v3) + void soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[], + uniform int offset) + void soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[], + uniform int offset) + + Packed Load and Store Operations -------------------------------- @@ -2653,8 +2745,13 @@ values are loaded into the local ``x``, ``y``, and ``z`` variables, SIMD-efficient computation can proceed; getting to that point is relatively inefficient. -An alternative would be the "structure of arrays" (SoA) layout. In C, the -data would be declared as: +(As described previously in `Converting Between Array-of-Structures and +Structure-of-Arrays Layout`_, this computation could be written more +efficiently using standard library routines to convert from the AoS layout, +if we were given a flat array of ``float`` values.) + +An alternative data layout would be the "structure of arrays" (SoA). 
In C, +the data would be declared as: :: diff --git a/failing_tests/aossoa-7.ispc b/failing_tests/aossoa-7.ispc new file mode 100644 index 00000000..1c682b7d --- /dev/null +++ b/failing_tests/aossoa-7.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_v(uniform float RET[]) { +#define width 4 +//CO const uniform int width = 4; + uniform int a[width*programCount], r[width*programCount]; + for (uniform int i = 0; i < width*programCount; ++i) + a[i] = -1; + + int x = width * programIndex; + int y = 1 + width * programIndex; + int z = 2 + width * programIndex; + int w = 3 + width * programIndex; + + soa_to_aos4(x, y, z, w, a, 0); + uniform int errs = 0; + for (uniform int i = 0; i < width * programCount; ++i) + if (a[i] != i) ++errs; + + RET[programIndex] = errs; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/failing_tests/aossoa-8.ispc b/failing_tests/aossoa-8.ispc new file mode 100644 index 00000000..4c0afea0 --- /dev/null +++ b/failing_tests/aossoa-8.ispc @@ -0,0 +1,26 @@ + +export uniform int width() { return programCount; } + +export void f_v(uniform float RET[]) { +#define width 3 +//CO const uniform int width = 4; + uniform int a[width*programCount], r[width*programCount]; + for (uniform int i = 0; i < width*programCount; ++i) + a[i] = -1; + + int x = width * programIndex; + int y = 1 + width * programIndex; + int z = 2 + width * programIndex; + + soa_to_aos3(x, y, z, a, 0); + uniform int errs = 0; + for (uniform int i = 0; i < width * programCount; ++i) + if (a[i] != i) ++errs; + + RET[programIndex] = errs; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/stdlib.ispc b/stdlib.ispc index fea10fa5..ebe146be 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -315,6 +315,57 @@ static inline uniform int lanemask() { return __movmsk(__mask); } +/////////////////////////////////////////////////////////////////////////// +// AOS/SOA conversion + 
+static inline void +aos_to_soa3(uniform float a[], uniform int offset, reference float v0, + reference float v1, reference float v2) { + __aos_to_soa3_float(a, offset, v0, v1, v2); +} + +static inline void +soa_to_aos3(float v0, float v1, float v2, uniform float a[], + uniform int offset) { + __soa_to_aos3_float(v0, v1, v2, a, offset); +} + +static inline void +aos_to_soa4(uniform float a[], uniform int offset, reference float v0, + reference float v1, reference float v2, reference float v3) { + __aos_to_soa4_float(a, offset, v0, v1, v2, v3); +} + +static inline void +soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[], + uniform int offset) { + __soa_to_aos4_float(v0, v1, v2, v3, a, offset); +} + +static inline void +aos_to_soa3(uniform int32 a[], uniform int offset, reference int32 v0, + reference int32 v1, reference int32 v2) { + __aos_to_soa3_int32(a, offset, v0, v1, v2); +} + +static inline void +soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[], + uniform int offset) { + __soa_to_aos3_int32(v0, v1, v2, a, offset); +} + +static inline void +aos_to_soa4(uniform int32 a[], uniform int offset, reference int32 v0, + reference int32 v1, reference int32 v2, reference int32 v3) { + __aos_to_soa4_int32(a, offset, v0, v1, v2, v3); +} + +static inline void +soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[], + uniform int offset) { + __soa_to_aos4_int32(v0, v1, v2, v3, a, offset); +} + /////////////////////////////////////////////////////////////////////////// // Prefetching diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc new file mode 100644 index 00000000..0500f4d9 --- /dev/null +++ b/tests/aossoa-1.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_v(uniform float RET[]) { +#define width 3 +#define maxProgramCount 16 +//CO const uniform int width = 3; +//CO const uniform int maxProgramCount = 16; + uniform float a[width*maxProgramCount], r[width*maxProgramCount]; + for 
(uniform int i = 0; i < width*maxProgramCount; ++i) + a[i] = i; + + float x=-1, y=-1, z=-1; + aos_to_soa3(a, 0, x, y, z); + + int errs = 0; + if (x != width * programIndex) ++errs; + if (y != 1 + width * programIndex) ++errs; + if (z != 2 + width * programIndex) ++errs; + + RET[programIndex] = errs; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc new file mode 100644 index 00000000..b5bc84ee --- /dev/null +++ b/tests/aossoa-2.ispc @@ -0,0 +1,28 @@ + +export uniform int width() { return programCount; } + +export void f_v(uniform float RET[]) { +#define width 4 +#define maxProgramCount 16 +//CO const uniform int width = 4; +//CO const uniform int maxProgramCount = 16; + uniform float a[width*maxProgramCount], r[width*maxProgramCount]; + for (uniform int i = 0; i < width*maxProgramCount; ++i) + a[i] = i; + + float x=-1, y=-1, z=-1, w=-1; + aos_to_soa4(a, 0, x, y, z, w); + + int errs = 0; + if (x != width * programIndex) ++errs; + if (y != 1 + width * programIndex) ++errs; + if (z != 2 + width * programIndex) ++errs; + if (w != 3 + width * programIndex) ++errs; + + RET[programIndex] = errs; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/tests/aossoa-3.ispc b/tests/aossoa-3.ispc new file mode 100644 index 00000000..416a5a8e --- /dev/null +++ b/tests/aossoa-3.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_v(uniform float RET[]) { +#define width 4 +//CO const uniform int width = 4; + uniform float a[width*programCount], r[width*programCount]; + for (uniform int i = 0; i < width*programCount; ++i) + a[i] = -1; + + float x = width * programIndex; + float y = 1 + width * programIndex; + float z = 2 + width * programIndex; + float w = 3 + width * programIndex; + + soa_to_aos4(x, y, z, w, a, 0); + uniform int errs = 0; + for (uniform int i = 0; i < width * programCount; ++i) + if (a[i] != i) ++errs; + + 
RET[programIndex] = errs; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/tests/aossoa-4.ispc b/tests/aossoa-4.ispc new file mode 100644 index 00000000..4646f1ca --- /dev/null +++ b/tests/aossoa-4.ispc @@ -0,0 +1,26 @@ + +export uniform int width() { return programCount; } + +export void f_v(uniform float RET[]) { +#define width 3 +//CO const uniform int width = 4; + uniform float a[width*programCount], r[width*programCount]; + for (uniform int i = 0; i < width*programCount; ++i) + a[i] = -1; + + float x = width * programIndex; + float y = 1 + width * programIndex; + float z = 2 + width * programIndex; + + soa_to_aos3(x, y, z, a, 0); + uniform int errs = 0; + for (uniform int i = 0; i < width * programCount; ++i) + if (a[i] != i) ++errs; + + RET[programIndex] = errs; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc new file mode 100644 index 00000000..4cadb502 --- /dev/null +++ b/tests/aossoa-5.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_v(uniform float RET[]) { +#define width 3 +#define maxProgramCount 16 +//CO const uniform int width = 3; +//CO const uniform int maxProgramCount = 16; + uniform int a[width*maxProgramCount], r[width*maxProgramCount]; + for (uniform int i = 0; i < width*maxProgramCount; ++i) + a[i] = i; + + int x=-1, y=-1, z=-1; + aos_to_soa3(a, 0, x, y, z); + + int errs = 0; + if (x != width * programIndex) ++errs; + if (y != 1 + width * programIndex) ++errs; + if (z != 2 + width * programIndex) ++errs; + + RET[programIndex] = errs; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc new file mode 100644 index 00000000..52269ba2 --- /dev/null +++ b/tests/aossoa-6.ispc @@ -0,0 +1,28 @@ + +export uniform int width() { return programCount; } + +export void f_v(uniform float RET[]) { +#define width 
4 +#define maxProgramCount 16 +//CO const uniform int width = 4; +//CO const uniform int maxProgramCount = 16; + uniform int a[width*maxProgramCount], r[width*maxProgramCount]; + for (uniform int i = 0; i < width*maxProgramCount; ++i) + a[i] = i; + + int x=-1, y=-1, z=-1, w=-1; + aos_to_soa4(a, 0, x, y, z, w); + + int errs = 0; + if (x != width * programIndex) ++errs; + if (y != 1 + width * programIndex) ++errs; + if (z != 2 + width * programIndex) ++errs; + if (w != 3 + width * programIndex) ++errs; + + RET[programIndex] = errs; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} +