From 3cb0115dce00539b834b6841e301c90c71756db7 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 10 Oct 2011 10:56:06 -0700
Subject: [PATCH] Add routines to standard library to do efficient AOS/SOA
 conversions.

Currently, we just support 3 and 4-wide variants (i.e. xyzxyz.. and xyzwxyzw..),
for int32 and float types.
---
 builtins.m4                 | 675 ++++++++++++++++++++++++++++++++++++
 docs/ispc.txt               | 101 +++++-
 failing_tests/aossoa-7.ispc |  27 ++
 failing_tests/aossoa-8.ispc |  26 ++
 stdlib.ispc                 |  51 +++
 tests/aossoa-1.ispc         |  27 ++
 tests/aossoa-2.ispc         |  28 ++
 tests/aossoa-3.ispc         |  27 ++
 tests/aossoa-4.ispc         |  26 ++
 tests/aossoa-5.ispc         |  27 ++
 tests/aossoa-6.ispc         |  28 ++
 11 files changed, 1041 insertions(+), 2 deletions(-)
 create mode 100644 failing_tests/aossoa-7.ispc
 create mode 100644 failing_tests/aossoa-8.ispc
 create mode 100644 tests/aossoa-1.ispc
 create mode 100644 tests/aossoa-2.ispc
 create mode 100644 tests/aossoa-3.ispc
 create mode 100644 tests/aossoa-4.ispc
 create mode 100644 tests/aossoa-5.ispc
 create mode 100644 tests/aossoa-6.ispc
diff --git a/builtins.m4 b/builtins.m4
index cd704a0b..723dd800 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -1052,6 +1052,681 @@ define internal <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone al
   ret <$1 x i32> %0
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; AOS/SOA conversion primitives
+
+;; take 4 4-wide vectors laid out like <r0 g0 b0 a0> <r1 g1 b1 a1> ...
+;; and reorder them to <r0 r1 r2 r3> <g0 g1 g2 g3> ...
+
+define internal void
+@__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
+        <4 x float> %v3, <4 x float> * noalias %out0, 
+        <4 x float> * noalias %out1, <4 x float> * noalias %out2, 
+        <4 x float> * noalias %out3) nounwind alwaysinline { 
+  %t0 = shufflevector <4 x float> %v2, <4 x float> %v3,  ; r2 r3 g2 g3
+          <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %t1 = shufflevector <4 x float> %v2, <4 x float> %v3,  ; b2 b3 a2 a3
+          <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %t2 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; r0 r1 g0 g1
+          <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %t3 = shufflevector <4 x float> %v0, <4 x float> %v1,  ; b0 b1 a0 a1
+          <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+
+  %r0 = shufflevector <4 x float> %t2, <4 x float> %t0,  ; r0 r1 r2 r3
+          <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  store <4 x float> %r0, <4 x float> * %out0
+  %r1 = shufflevector <4 x float> %t2, <4 x float> %t0,  ; g0 g1 g2 g3
+          <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  store <4 x float> %r1, <4 x float> * %out1
+  %r2 = shufflevector <4 x float> %t3, <4 x float> %t1,  ; b0 b1 b2 b3
+          <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  store <4 x float> %r2, <4 x float> * %out2
+  %r3 = shufflevector <4 x float> %t3, <4 x float> %t1,  ; a0 a1 a2 a3
+          <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  store <4 x float> %r3, <4 x float> * %out3
+  ret void
+}
+
+
+;; Do the reverse of __aos_to_soa4_float4--reorder <r0 r1 r2 r3> <g0 g1 g2 g3> ..
+;; to <r0 g0 b0 a0> <r1 g1 b1 a1> ...
+;; This is the exact same set of operations that __soa_to_soa4_float4 does
+;; (a 4x4 transpose), so just call that...
+
+define internal void
+@__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
+        <4 x float> %v3, <4 x float> * noalias %out0, 
+        <4 x float> * noalias %out1, <4 x float> * noalias %out2, 
+        <4 x float> * noalias %out3) nounwind alwaysinline { 
+  call void @__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, 
+    <4 x float> %v2, <4 x float> %v3, <4 x float> * %out0, 
+    <4 x float> * %out1, <4 x float> * %out2, <4 x float> * %out3)
+  ret void
+}
+
+
+;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors
+;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>, transpose to
+;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3>.
+
+define internal void
+@__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
+        <4 x float> * noalias %out0, <4 x float> * noalias %out1,
+        <4 x float> * noalias %out2) nounwind alwaysinline { 
+  %t0 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x0 x1 y0 y1
+    <4 x i32> <i32 0, i32 3, i32 1, i32 4>
+  %t1 = shufflevector <4 x float> %v1, <4 x float> %v2, ; x2 x3 y2 y3
+    <4 x i32> <i32 2, i32 5, i32 3, i32 6>
+
+  %r0 = shufflevector <4 x float> %t0, <4 x float> %t1, ; x0 x1 x1 x3
+    <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  store <4 x float> %r0, <4 x float> * %out0
+
+  %r1 = shufflevector <4 x float> %t0, <4 x float> %t1, ; y0 y1 y2 y3
+    <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  store <4 x float> %r1, <4 x float> * %out1
+
+  %t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; z0 z1 x x
+    <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
+
+  %r2 = shufflevector <4 x float> %t2, <4 x float> %v2, ; z0 z1 z2 z3
+    <4 x i32> <i32 0, i32 1, i32 4, i32 7>
+  store <4 x float> %r2, <4 x float> * %out2
+  ret void
+}
+
+
+;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors
+;; <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3> to
+;; <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 x3 y3 z3>.
+
+define internal void
+@__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2,
+        <4 x float> * noalias %out0, <4 x float> * noalias %out1,
+        <4 x float> * noalias %out2) nounwind alwaysinline { 
+  %t0 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x0 x1 x2 y0
+    <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  %t1 = shufflevector <4 x float> %v1, <4 x float> %v2, ; y1 y2 z0 z1
+    <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+ 
+  %r0 = shufflevector <4 x float> %t0, <4 x float> %t1, ; x0 y0 z0 x1
+    <4 x i32> <i32 0, i32 3, i32 6, i32 1>
+  store <4 x float> %r0, <4 x float> * %out0
+  %r1 = shufflevector <4 x float> %t0, <4 x float> %t1, ; y1 z1 x2 y2
+    <4 x i32> <i32 4, i32 7, i32 2, i32 5>
+  store <4 x float> %r1, <4 x float> * %out1
+
+  %t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x3 y3 x x
+    <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+
+  %r2 = shufflevector <4 x float> %t2, <4 x float> %v2, ; z2 x3 y3 z3
+    <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  store <4 x float> %r2, <4 x float> * %out2
+  ret void
+}
+
+;; 8-wide
+;; These functions implement the 8-wide variants of the AOS/SOA conversion
+;; routines above.  These implementations are all built on top of the 4-wide
+;; vector versions.
+ 
+define internal void
+@__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
+        <8 x float> %v3, <8 x float> * noalias %out0, 
+        <8 x float> * noalias %out1, <8 x float> * noalias %out2, 
+        <8 x float> * noalias %out3) nounwind alwaysinline {
+  ;; Split each 8-vector into 2 4-vectors
+  %v0a = shufflevector <8 x float> %v0, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0b = shufflevector <8 x float> %v0, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1a = shufflevector <8 x float> %v1, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1b = shufflevector <8 x float> %v1, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v2a = shufflevector <8 x float> %v2, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2b = shufflevector <8 x float> %v2, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v3a = shufflevector <8 x float> %v3, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v3b = shufflevector <8 x float> %v3, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ;; Similarly for the output pointers
+  %out0a = bitcast <8 x float> * %out0 to <4 x float> *
+  %out0b = getelementptr <4 x float> * %out0a, i32 1
+  %out1a = bitcast <8 x float> * %out1 to <4 x float> *
+  %out1b = getelementptr <4 x float> * %out1a, i32 1
+  %out2a = bitcast <8 x float> * %out2 to <4 x float> *
+  %out2b = getelementptr <4 x float> * %out2a, i32 1
+  %out3a = bitcast <8 x float> * %out3 to <4 x float> *
+  %out3b = getelementptr <4 x float> * %out3a, i32 1
+ 
+  ;; Do the first part--given input vectors like
+  ;; <x0 y0 z0 x1 y1 z1 x2 y2> <z2 x3 y3 z3 x4 y4 z4 x5> <y5 z5 x6 y6 z6 x7 y7 z7>,
+  ;; pass 3 4-vectors <x0 y0 z0 x1> <y1 z1 x2 y2> <z2 z3 y3 z3> to the 4-vec
+  ;; version to compute the first 4 SOA values for the three output variables.
+  call void @__aos_to_soa4_float4(<4 x float> %v0a, <4 x float> %v0b,
+         <4 x float> %v1a, <4 x float> %v1b, <4 x float> * %out0a,
+         <4 x float> * %out1a, <4 x float> * %out2a, <4 x float> * %out3a)
+
+  ;; And similarly pass <x4 y4 z4 x5> <y5 z5 x6 y6> <z6 x7 y7 z7> to the 4-wide
+  ;; version to compute the second 4 SOA values for the three outputs
+  call void @__aos_to_soa4_float4(<4 x float> %v2a, <4 x float> %v2b,
+         <4 x float> %v3a, <4 x float> %v3b, <4 x float> * %out0b,
+         <4 x float> * %out1b, <4 x float> * %out2b, <4 x float> * %out3b)
+  ret void
+}
+
+
+define internal void
+@__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
+        <8 x float> %v3, <8 x float> * noalias %out0, 
+        <8 x float> * noalias %out1, <8 x float> * noalias %out2, 
+        <8 x float> * noalias %out3) nounwind alwaysinline {
+  ;; As above, split into 4-vectors and 4-wide outputs...
+  %v0a = shufflevector <8 x float> %v0, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0b = shufflevector <8 x float> %v0, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1a = shufflevector <8 x float> %v1, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1b = shufflevector <8 x float> %v1, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v2a = shufflevector <8 x float> %v2, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2b = shufflevector <8 x float> %v2, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v3a = shufflevector <8 x float> %v3, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v3b = shufflevector <8 x float> %v3, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  %out0a = bitcast <8 x float> * %out0 to <4 x float> *
+  %out0b = getelementptr <4 x float> * %out0a, i32 1
+  %out1a = bitcast <8 x float> * %out1 to <4 x float> *
+  %out1b = getelementptr <4 x float> * %out1a, i32 1
+  %out2a = bitcast <8 x float> * %out2 to <4 x float> *
+  %out2b = getelementptr <4 x float> * %out2a, i32 1
+  %out3a = bitcast <8 x float> * %out3 to <4 x float> *
+  %out3b = getelementptr <4 x float> * %out3a, i32 1
+
+  ;; First part--given input vectors
+  ;; <x0 x1 x2 x3 x4 x5 x6 x7> <y0 y1 y2 y3 y4 y5 y6 y7> <z0 z1 z2 z3 z4 z5 z6 z7> 
+  ;; pass 3 4-vectors <x0 x1 x2 x3> <y0 y1 y2 y3> <z0 z1 z2 z3> to 
+  ;; compute the first 12 AOS output values.
+  call void @__soa_to_aos4_float4(<4 x float> %v0a, <4 x float> %v1a,
+         <4 x float> %v2a, <4 x float> %v3a, <4 x float> * %out0a,
+         <4 x float> * %out0b, <4 x float> * %out1a, <4 x float> * %out1b)
+
+  ;; And then pass the 3 4-vectors <x4 x5 x6 x7> <y4 y5 y6 y7> <z4 z5 z6 z7>
+  ;; To compute the next 12 AOS output values
+  call void @__soa_to_aos4_float4(<4 x float> %v0b, <4 x float> %v1b,
+         <4 x float> %v2b, <4 x float> %v3b, <4 x float> * %out2a,
+         <4 x float> * %out2b, <4 x float> * %out3a, <4 x float> * %out3b)
+  ret void
+}
+
+
+define internal void
+@__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
+        <8 x float> * noalias %out0, <8 x float> * noalias %out1,
+        <8 x float> * noalias %out2) nounwind alwaysinline {
+  %v0a = shufflevector <8 x float> %v0, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0b = shufflevector <8 x float> %v0, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1a = shufflevector <8 x float> %v1, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1b = shufflevector <8 x float> %v1, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v2a = shufflevector <8 x float> %v2, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2b = shufflevector <8 x float> %v2, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  %out0a = bitcast <8 x float> * %out0 to <4 x float> *
+  %out0b = getelementptr <4 x float> * %out0a, i32 1
+  %out1a = bitcast <8 x float> * %out1 to <4 x float> *
+  %out1b = getelementptr <4 x float> * %out1a, i32 1
+  %out2a = bitcast <8 x float> * %out2 to <4 x float> *
+  %out2b = getelementptr <4 x float> * %out2a, i32 1
+
+  call void @__aos_to_soa3_float4(<4 x float> %v0a, <4 x float> %v0b,
+         <4 x float> %v1a, <4 x float> * %out0a, <4 x float> * %out1a,
+         <4 x float> * %out2a)
+  call void @__aos_to_soa3_float4(<4 x float> %v1b, <4 x float> %v2a,
+         <4 x float> %v2b, <4 x float> * %out0b, <4 x float> * %out1b,
+         <4 x float> * %out2b)
+  ret void
+}
+
+
+define internal void
+@__soa_to_aos3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
+        <8 x float> * noalias %out0, <8 x float> * noalias %out1,
+        <8 x float> * noalias %out2) nounwind alwaysinline {
+  %v0a = shufflevector <8 x float> %v0, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0b = shufflevector <8 x float> %v0, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1a = shufflevector <8 x float> %v1, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1b = shufflevector <8 x float> %v1, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v2a = shufflevector <8 x float> %v2, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2b = shufflevector <8 x float> %v2, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  %out0a = bitcast <8 x float> * %out0 to <4 x float> *
+  %out0b = getelementptr <4 x float> * %out0a, i32 1
+  %out1a = bitcast <8 x float> * %out1 to <4 x float> *
+  %out1b = getelementptr <4 x float> * %out1a, i32 1
+  %out2a = bitcast <8 x float> * %out2 to <4 x float> *
+  %out2b = getelementptr <4 x float> * %out2a, i32 1
+
+  call void @__soa_to_aos3_float4(<4 x float> %v0a, <4 x float> %v1a,
+         <4 x float> %v2a, <4 x float> * %out0a, <4 x float> * %out0b,
+         <4 x float> * %out1a)
+  call void @__soa_to_aos3_float4(<4 x float> %v0b, <4 x float> %v1b,
+         <4 x float> %v2b, <4 x float> * %out1b, <4 x float> * %out2a,
+         <4 x float> * %out2b)
+  ret void
+}
+
+;; 16-wide
+
+define internal void
+@__aos_to_soa4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
+        <16 x float> %v3, <16 x float> * noalias %out0, 
+        <16 x float> * noalias %out1, <16 x float> * noalias %out2, 
+        <16 x float> * noalias %out3) nounwind alwaysinline {
+  %v0a = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0b = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v0c = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v0d = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v1a = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1b = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1c = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v1d = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v2a = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2b = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v2c = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v2d = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v3a = shufflevector <16 x float> %v3, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v3b = shufflevector <16 x float> %v3, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v3c = shufflevector <16 x float> %v3, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v3d = shufflevector <16 x float> %v3, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %out0a = bitcast <16 x float> * %out0 to <4 x float> *
+  %out0b = getelementptr <4 x float> * %out0a, i32 1
+  %out0c = getelementptr <4 x float> * %out0a, i32 2
+  %out0d = getelementptr <4 x float> * %out0a, i32 3
+  %out1a = bitcast <16 x float> * %out1 to <4 x float> *
+  %out1b = getelementptr <4 x float> * %out1a, i32 1
+  %out1c = getelementptr <4 x float> * %out1a, i32 2
+  %out1d = getelementptr <4 x float> * %out1a, i32 3
+  %out2a = bitcast <16 x float> * %out2 to <4 x float> *
+  %out2b = getelementptr <4 x float> * %out2a, i32 1
+  %out2c = getelementptr <4 x float> * %out2a, i32 2
+  %out2d = getelementptr <4 x float> * %out2a, i32 3
+  %out3a = bitcast <16 x float> * %out3 to <4 x float> *
+  %out3b = getelementptr <4 x float> * %out3a, i32 1
+  %out3c = getelementptr <4 x float> * %out3a, i32 2
+  %out3d = getelementptr <4 x float> * %out3a, i32 3
+
+  call void @__aos_to_soa4_float4(<4 x float> %v0a, <4 x float> %v0b,
+         <4 x float> %v0c, <4 x float> %v0d, <4 x float> * %out0a,
+         <4 x float> * %out1a, <4 x float> * %out2a, <4 x float> * %out3a)
+  call void @__aos_to_soa4_float4(<4 x float> %v1a, <4 x float> %v1b,
+         <4 x float> %v1c, <4 x float> %v1d, <4 x float> * %out0b,
+         <4 x float> * %out1b, <4 x float> * %out2b, <4 x float> * %out3b)
+  call void @__aos_to_soa4_float4(<4 x float> %v2a, <4 x float> %v2b,
+         <4 x float> %v2c, <4 x float> %v2d, <4 x float> * %out0c,
+         <4 x float> * %out1c, <4 x float> * %out2c, <4 x float> * %out3c)
+  call void @__aos_to_soa4_float4(<4 x float> %v3a, <4 x float> %v3b,
+         <4 x float> %v3c, <4 x float> %v3d, <4 x float> * %out0d,
+         <4 x float> * %out1d, <4 x float> * %out2d, <4 x float> * %out3d)
+  ret void
+}
+
+
+define internal void
+@__soa_to_aos4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
+        <16 x float> %v3, <16 x float> * noalias %out0, 
+        <16 x float> * noalias %out1, <16 x float> * noalias %out2, 
+        <16 x float> * noalias %out3) nounwind alwaysinline {
+  %v0a = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0b = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v0c = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v0d = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v1a = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1b = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1c = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v1d = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v2a = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2b = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v2c = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v2d = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v3a = shufflevector <16 x float> %v3, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v3b = shufflevector <16 x float> %v3, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v3c = shufflevector <16 x float> %v3, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v3d = shufflevector <16 x float> %v3, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %out0a = bitcast <16 x float> * %out0 to <4 x float> *
+  %out0b = getelementptr <4 x float> * %out0a, i32 1
+  %out0c = getelementptr <4 x float> * %out0a, i32 2
+  %out0d = getelementptr <4 x float> * %out0a, i32 3
+  %out1a = bitcast <16 x float> * %out1 to <4 x float> *
+  %out1b = getelementptr <4 x float> * %out1a, i32 1
+  %out1c = getelementptr <4 x float> * %out1a, i32 2
+  %out1d = getelementptr <4 x float> * %out1a, i32 3
+  %out2a = bitcast <16 x float> * %out2 to <4 x float> *
+  %out2b = getelementptr <4 x float> * %out2a, i32 1
+  %out2c = getelementptr <4 x float> * %out2a, i32 2
+  %out2d = getelementptr <4 x float> * %out2a, i32 3
+  %out3a = bitcast <16 x float> * %out3 to <4 x float> *
+  %out3b = getelementptr <4 x float> * %out3a, i32 1
+  %out3c = getelementptr <4 x float> * %out3a, i32 2
+  %out3d = getelementptr <4 x float> * %out3a, i32 3
+
+  call void @__soa_to_aos4_float4(<4 x float> %v0a, <4 x float> %v1a,
+         <4 x float> %v2a, <4 x float> %v3a, <4 x float> * %out0a,
+         <4 x float> * %out0b, <4 x float> * %out0c, <4 x float> * %out0d)
+  call void @__soa_to_aos4_float4(<4 x float> %v0b, <4 x float> %v1b,
+         <4 x float> %v2b, <4 x float> %v3b, <4 x float> * %out1a,
+         <4 x float> * %out1b, <4 x float> * %out1c, <4 x float> * %out1d)
+  call void @__soa_to_aos4_float4(<4 x float> %v0c, <4 x float> %v1c,
+         <4 x float> %v2c, <4 x float> %v3c, <4 x float> * %out2a,
+         <4 x float> * %out2b, <4 x float> * %out2c, <4 x float> * %out2d)
+  call void @__soa_to_aos4_float4(<4 x float> %v0d, <4 x float> %v1d,
+         <4 x float> %v2d, <4 x float> %v3d, <4 x float> * %out3a,
+         <4 x float> * %out3b, <4 x float> * %out3c, <4 x float> * %out3d)
+  ret void
+}
+
+
+define internal void
+@__aos_to_soa3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
+        <16 x float> * noalias %out0, <16 x float> * noalias %out1,
+        <16 x float> * noalias %out2) nounwind alwaysinline {
+  %v0a = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0b = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v0c = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v0d = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v1a = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1b = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1c = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v1d = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v2a = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2b = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v2c = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v2d = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %out0a = bitcast <16 x float> * %out0 to <4 x float> *
+  %out0b = getelementptr <4 x float> * %out0a, i32 1
+  %out0c = getelementptr <4 x float> * %out0a, i32 2
+  %out0d = getelementptr <4 x float> * %out0a, i32 3
+  %out1a = bitcast <16 x float> * %out1 to <4 x float> *
+  %out1b = getelementptr <4 x float> * %out1a, i32 1
+  %out1c = getelementptr <4 x float> * %out1a, i32 2
+  %out1d = getelementptr <4 x float> * %out1a, i32 3
+  %out2a = bitcast <16 x float> * %out2 to <4 x float> *
+  %out2b = getelementptr <4 x float> * %out2a, i32 1
+  %out2c = getelementptr <4 x float> * %out2a, i32 2
+  %out2d = getelementptr <4 x float> * %out2a, i32 3
+
+  call void @__aos_to_soa3_float4(<4 x float> %v0a, <4 x float> %v0b,
+         <4 x float> %v0c, <4 x float> * %out0a, <4 x float> * %out1a,
+         <4 x float> * %out2a)
+  call void @__aos_to_soa3_float4(<4 x float> %v1a, <4 x float> %v1b,
+         <4 x float> %v1c, <4 x float> * %out0b, <4 x float> * %out1b,
+         <4 x float> * %out2b)
+  call void @__aos_to_soa3_float4(<4 x float> %v2a, <4 x float> %v2b,
+         <4 x float> %v2c, <4 x float> * %out0c, <4 x float> * %out1c,
+         <4 x float> * %out2c)
+  ret void
+}
+
+
+define internal void
+@__soa_to_aos3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2,
+        <16 x float> * noalias %out0, <16 x float> * noalias %out1,
+        <16 x float> * noalias %out2) nounwind alwaysinline {
+  %v0a = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0b = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v0c = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v0d = shufflevector <16 x float> %v0, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v1a = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1b = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1c = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v1d = shufflevector <16 x float> %v1, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v2a = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2b = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v2c = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v2d = shufflevector <16 x float> %v2, <16 x float> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %out0a = bitcast <16 x float> * %out0 to <4 x float> *
+  %out0b = getelementptr <4 x float> * %out0a, i32 1
+  %out0c = getelementptr <4 x float> * %out0a, i32 2
+  %out0d = getelementptr <4 x float> * %out0a, i32 3
+  %out1a = bitcast <16 x float> * %out1 to <4 x float> *
+  %out1b = getelementptr <4 x float> * %out1a, i32 1
+  %out1c = getelementptr <4 x float> * %out1a, i32 2
+  %out1d = getelementptr <4 x float> * %out1a, i32 3
+  %out2a = bitcast <16 x float> * %out2 to <4 x float> *
+  %out2b = getelementptr <4 x float> * %out2a, i32 1
+  %out2c = getelementptr <4 x float> * %out2a, i32 2
+  %out2d = getelementptr <4 x float> * %out2a, i32 3
+
+  call void @__soa_to_aos3_float4(<4 x float> %v0a, <4 x float> %v1a,
+         <4 x float> %v2a, <4 x float> * %out0a, <4 x float> * %out0b,
+         <4 x float> * %out0c)
+  call void @__soa_to_aos3_float4(<4 x float> %v0b, <4 x float> %v1b,
+         <4 x float> %v2b, <4 x float> * %out0d, <4 x float> * %out1a,
+         <4 x float> * %out1b)
+  call void @__soa_to_aos3_float4(<4 x float> %v0c, <4 x float> %v1c,
+         <4 x float> %v2c, <4 x float> * %out1c, <4 x float> * %out1d,
+         <4 x float> * %out2a)
+  call void @__soa_to_aos3_float4(<4 x float> %v0d, <4 x float> %v1d,
+         <4 x float> %v2d, <4 x float> * %out2b, <4 x float> * %out2c,
+         <4 x float> * %out2d)
+  ret void
+}
+
+;; versions to be called from stdlib
+
+define internal void
+@__aos_to_soa4_float([0 x float] * noalias %base, i32 %offset,
+        <$1 x float> * noalias %out0, <$1 x float> * noalias %out1,
+        <$1 x float> * noalias %out2, <$1 x float> * noalias %out3)
+        nounwind alwaysinline { 
+  %pf = bitcast [0 x float] * %base to float *
+  %p = getelementptr float * %pf, i32 %offset
+  %p0 = bitcast float * %p to <$1 x float> *
+  %v0 = load <$1 x float> * %p0, align 4
+  %p1 = getelementptr <$1 x float> * %p0, i32 1
+  %v1 = load <$1 x float> * %p1, align 4
+  %p2 = getelementptr <$1 x float> * %p0, i32 2
+  %v2 = load <$1 x float> * %p2, align 4
+  %p3 = getelementptr <$1 x float> * %p0, i32 3
+  %v3 = load <$1 x float> * %p3, align 4
+  call void @__aos_to_soa4_float$1(<$1 x float> %v0, <$1 x float> %v1, 
+         <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, 
+         <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3)
+  ret void
+}
+
+
+define internal void
+@__aos_to_soa4_int32([0 x i32] * noalias %base, i32 %offset,
+        <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
+        <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3)
+        nounwind alwaysinline { 
+  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> *
+  %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> *
+  %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> *
+  %fout3 = bitcast <$1 x i32> * %out3 to <$1 x float> *
+  call void @__aos_to_soa4_float([0 x float] * %fbase, i32 %offset,
+      <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2, 
+      <$1 x float> * %fout3)
+  ret void
+}
+
+
+define internal void
+@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
+             <$1 x float> %v3, [0 x float] * noalias %base,
+             i32 %offset) nounwind alwaysinline { 
+  %pf = bitcast [0 x float] * %base to float *
+  %p = getelementptr float * %pf, i32 %offset
+  %out0 = bitcast float * %p to <$1 x float> *
+  %out1 = getelementptr <$1 x float> * %out0, i32 1
+  %out2 = getelementptr <$1 x float> * %out0, i32 2
+  %out3 = getelementptr <$1 x float> * %out0, i32 3
+  call void @__soa_to_aos4_float$1(<$1 x float> %v0, <$1 x float> %v1, 
+         <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, 
+         <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3)
+  ret void
+}
+
+
+define internal void
+@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
+             <$1 x i32> %v3, [0 x i32] * noalias %base,
+             i32 %offset) nounwind alwaysinline { 
+  %fv0 = bitcast <$1 x i32> %v0 to <$1 x float>
+  %fv1 = bitcast <$1 x i32> %v1 to <$1 x float>
+  %fv2 = bitcast <$1 x i32> %v2 to <$1 x float>
+  %fv3 = bitcast <$1 x i32> %v3 to <$1 x float>
+  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  call void @__soa_to_aos4_float(<$1 x float> %fv0, <$1 x float> %fv1, 
+      <$1 x float> %fv2, <$1 x float> %fv3, [0 x float] * %fbase,
+      i32 %offset)
+  ret void
+}
+
+
+define internal void
+@__aos_to_soa3_float([0 x float] * noalias %base, i32 %offset,
+        <$1 x float> * %out0, <$1 x float> * %out1,
+        <$1 x float> * %out2) nounwind alwaysinline { 
+  %pf = bitcast [0 x float] * %base to float *
+  %p = getelementptr float * %pf, i32 %offset
+  %p0 = bitcast float * %p to <$1 x float> *
+  %v0 = load <$1 x float> * %p0, align 4
+  %p1 = getelementptr <$1 x float> * %p0, i32 1
+  %v1 = load <$1 x float> * %p1, align 4
+  %p2 = getelementptr <$1 x float> * %p0, i32 2
+  %v2 = load <$1 x float> * %p2, align 4
+  call void @__aos_to_soa3_float$1(<$1 x float> %v0, <$1 x float> %v1, 
+         <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1,
+         <$1 x float> * %out2)
+  ret void
+}
+
+
+define internal void
+@__aos_to_soa3_int32([0 x i32] * noalias %base, i32 %offset,
+        <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
+        <$1 x i32> * noalias %out2) nounwind alwaysinline { 
+  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> *
+  %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> *
+  %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> *
+  call void @__aos_to_soa3_float([0 x float] * %fbase, i32 %offset,
+      <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2)
+  ret void
+}
+
+
+define internal void
+@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
+             [0 x float] * noalias %base, i32 %offset) nounwind alwaysinline { 
+  %pf = bitcast [0 x float] * %base to float *
+  %p = getelementptr float * %pf, i32 %offset
+  %out0 = bitcast float * %p to <$1 x float> *
+  %out1 = getelementptr <$1 x float> * %out0, i32 1
+  %out2 = getelementptr <$1 x float> * %out0, i32 2
+  call void @__soa_to_aos3_float$1(<$1 x float> %v0, <$1 x float> %v1, 
+         <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1,
+         <$1 x float> * %out2)
+  ret void
+}
+
+
+define internal void
+@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
+             [0 x i32] * noalias %base, i32 %offset) nounwind alwaysinline { 
+  %fv0 = bitcast <$1 x i32> %v0 to <$1 x float>
+  %fv1 = bitcast <$1 x i32> %v1 to <$1 x float>
+  %fv2 = bitcast <$1 x i32> %v2 to <$1 x float>
+  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1, 
+      <$1 x float> %fv2, [0 x float] * %fbase, i32 %offset)
+  ret void
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; prefetching
 
diff --git a/docs/ispc.txt b/docs/ispc.txt
index ad2254d2..43f30cd8 100644
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -89,6 +89,7 @@ Contents:
   + `Math Functions`_
   + `Output Functions`_
   + `Cross-Program Instance Operations`_
+  + `Converting Between Array-of-Structures and Structure-of-Arrays Layout`_
   + `Packed Load and Store Operations`_
   + `Conversions To and From Half-Precision Floats`_
   + `Atomic Operations and Memory Fences`_
@@ -2022,6 +2023,97 @@ bitwise-or are available:
     unsigned int64 exclusive_scan_or(unsigned int64 v) 
 
 
+Converting Between Array-of-Structures and Structure-of-Arrays Layout
+---------------------------------------------------------------------
+
+Applications often lay data out in memory in "array of structures" form.
+Though convenient in C/C++ code, this layout can make ``ispc`` programs
+less efficient than they would be if the data was laid out in "structure of
+arrays" form.  (See the section `Understanding How to Interoperate With the
+Application's Data`_ for extended discussion of this topic.)
+
+The standard library does provide a few functions that efficiently convert
+between these two formats, for cases where it's not possible to change the
+application to use "structure of arrays layout".  Consider an array of 3D
+(x,y,z) position data laid out in a C array like:
+
+::
+
+    // C++ code
+    float pos[] = { x0, y0, z0, x1, y1, z1, x2, ... };
+
+
+In an ``ispc`` program, we might want to load a set of (x,y,z) values and
+do a computation based on them.  The natural expression of this:
+
+::
+
+    extern uniform float pos[];
+    uniform int base = ...;
+    float x = pos[base + 3 * programIndex];     // x = { x0 x1 x2 ... }
+    float y = pos[base + 1 + 3 * programIndex]; // y = { y0 y1 y2 ... }
+    float z = pos[base + 2 + 3 * programIndex]; // z = { z0 z1 z2 ... }
+
+leads to irregular memory accesses and reduced performance.  Alternatively,
+the aos_to_soa3 standard library function could be used:
+
+::
+
+    extern uniform float pos[];
+    uniform int base = ...;
+    float x, y, z;
+    aos_to_soa3(pos, base, x, y, z);
+
+This routine loads ``3*programCount`` values from the given array starting
+at the given offset, returning three ``varying`` results.  There are both
+``int32`` and ``float`` variants of this function:
+
+::
+
+    void aos_to_soa3(uniform float a[], uniform int offset, reference float v0,
+                     reference float v1, reference float v2)
+    void aos_to_soa3(uniform int32 a[], uniform int offset, reference int32 v0,
+                     reference int32 v1, reference int32 v2)
+
+After computation is done, corresponding functions convert back from the
+SoA values in ``ispc`` ``varying`` variables and write the values back to
+the given array, starting at the given offset.
+
+::
+
+    extern uniform float pos[];
+    uniform int base = ...;
+    float x, y, z;
+    aos_to_soa3(pos, base, x, y, z);
+    // do computation with x, y, z
+    soa_to_aos3(x, y, z, pos, base);
+
+::
+
+    void soa_to_aos3(float v0, float v1, float v2, uniform float a[], 
+                     uniform int offset)
+    void soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[], 
+                     uniform int offset)
+
+There are also variants of these functions that convert 4-wide values
+between AoS and SoA layouts.  In other words, ``aos_to_soa4`` converts AoS
+data in memory laid out like ``r0 g0 b0 a0 r1 g1 b1 a1 ...`` to four ``varying``
+variables with values ``r0 r1...``, ``g0 g1...``, ``b0 b1...``, and ``a0
+a1...`, reading a total of ``4*programCount`` values from the given array,
+starting at the given offset.
+
+::
+
+    void aos_to_soa4(uniform float a[], uniform int offset, reference float v0,
+                     reference float v1, reference float v2, reference float v3)
+    void aos_to_soa4(uniform int32 a[], uniform int offset, reference int32 v0,
+                     reference int32 v1, reference int32 v2, reference int32 v3)
+    void soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[], 
+                     uniform int offset)
+    void soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[], 
+                     uniform int offset)
+
+
 Packed Load and Store Operations
 --------------------------------
 
@@ -2653,8 +2745,13 @@ values are loaded into the local ``x``, ``y``, and ``z`` variables,
 SIMD-efficient computation can proceed; getting to that point is
 relatively inefficient.
 
-An alternative would be the "structure of arrays" (SoA) layout.  In C, the
-data would be declared as:
+(As described previously in `Converting Between Array-of-Structures and
+Structure-of-Arrays Layout`_, this computation could be written more
+efficiently using standard library routines to convert from the AoS layout,
+if we were given a flat array of ``float`` values.) 
+
+An alternative data layout would be the "structure of arrays" (SoA).  In C,
+the data would be declared as:
 
 ::
 
diff --git a/failing_tests/aossoa-7.ispc b/failing_tests/aossoa-7.ispc
new file mode 100644
index 00000000..1c682b7d
--- /dev/null
+++ b/failing_tests/aossoa-7.ispc
@@ -0,0 +1,27 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+#define width 4
+//CO    const uniform int width = 4;
+    uniform int a[width*programCount], r[width*programCount];
+    for (uniform int i = 0; i < width*programCount; ++i)
+        a[i] = -1;
+
+    int x = width * programIndex;
+    int y = 1 + width * programIndex;
+    int z = 2 + width * programIndex;
+    int w = 3 + width * programIndex;
+
+    soa_to_aos4(x, y, z, w, a, 0);
+    uniform int errs = 0;
+    for (uniform int i = 0; i < width * programCount; ++i)
+        if (a[i] != i) ++errs;
+
+    RET[programIndex] = errs; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
+
diff --git a/failing_tests/aossoa-8.ispc b/failing_tests/aossoa-8.ispc
new file mode 100644
index 00000000..4c0afea0
--- /dev/null
+++ b/failing_tests/aossoa-8.ispc
@@ -0,0 +1,26 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+#define width 3
+//CO    const uniform int width = 4;
+    uniform int a[width*programCount], r[width*programCount];
+    for (uniform int i = 0; i < width*programCount; ++i)
+        a[i] = -1;
+
+    int x = width * programIndex;
+    int y = 1 + width * programIndex;
+    int z = 2 + width * programIndex;
+
+    soa_to_aos3(x, y, z, a, 0);
+    uniform int errs = 0;
+    for (uniform int i = 0; i < width * programCount; ++i)
+        if (a[i] != i) ++errs;
+
+    RET[programIndex] = errs; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
+
diff --git a/stdlib.ispc b/stdlib.ispc
index fea10fa5..ebe146be 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -315,6 +315,57 @@ static inline uniform int lanemask() {
     return __movmsk(__mask);
 }
 
+///////////////////////////////////////////////////////////////////////////
+// AOS/SOA conversion
+
+static inline void
+aos_to_soa3(uniform float a[], uniform int offset, reference float v0,
+            reference float v1, reference float v2) {
+    __aos_to_soa3_float(a, offset, v0, v1, v2);
+}
+
+static inline void
+soa_to_aos3(float v0, float v1, float v2, uniform float a[], 
+            uniform int offset) {
+    __soa_to_aos3_float(v0, v1, v2, a, offset);
+}
+
+static inline void
+aos_to_soa4(uniform float a[], uniform int offset, reference float v0,
+            reference float v1, reference float v2, reference float v3) {
+    __aos_to_soa4_float(a, offset, v0, v1, v2, v3);
+}
+
+static inline void
+soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[], 
+            uniform int offset) {
+    __soa_to_aos4_float(v0, v1, v2, v3, a, offset);
+}
+
+static inline void
+aos_to_soa3(uniform int32 a[], uniform int offset, reference int32 v0,
+            reference int32 v1, reference int32 v2) {
+    __aos_to_soa3_int32(a, offset, v0, v1, v2);
+}
+
+static inline void
+soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[], 
+            uniform int offset) {
+    __soa_to_aos3_int32(v0, v1, v2, a, offset);
+}
+
+static inline void
+aos_to_soa4(uniform int32 a[], uniform int offset, reference int32 v0,
+            reference int32 v1, reference int32 v2, reference int32 v3) {
+    __aos_to_soa4_int32(a, offset, v0, v1, v2, v3);
+}
+
+static inline void
+soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[], 
+            uniform int offset) {
+    __soa_to_aos4_int32(v0, v1, v2, v3, a, offset);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // Prefetching
 
diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc
new file mode 100644
index 00000000..0500f4d9
--- /dev/null
+++ b/tests/aossoa-1.ispc
@@ -0,0 +1,27 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+#define width 3
+#define maxProgramCount 16
+//CO    const uniform int width = 3;
+//CO    const uniform int maxProgramCount = 16;
+    uniform float a[width*maxProgramCount], r[width*maxProgramCount];
+    for (uniform int i = 0; i < width*maxProgramCount; ++i)
+        a[i] = i;
+
+    float x=-1, y=-1, z=-1;
+    aos_to_soa3(a, 0, x, y, z);
+
+    int errs = 0;
+    if (x != width * programIndex) ++errs;
+    if (y != 1 + width * programIndex) ++errs;
+    if (z != 2 + width * programIndex) ++errs;
+
+    RET[programIndex] = errs; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
+
diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc
new file mode 100644
index 00000000..b5bc84ee
--- /dev/null
+++ b/tests/aossoa-2.ispc
@@ -0,0 +1,28 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+#define width 4
+#define maxProgramCount 16
+//CO    const uniform int width = 4;
+//CO    const uniform int maxProgramCount = 16;
+    uniform float a[width*maxProgramCount], r[width*maxProgramCount];
+    for (uniform int i = 0; i < width*maxProgramCount; ++i)
+        a[i] = i;
+
+    float x=-1, y=-1, z=-1, w=-1;
+    aos_to_soa4(a, 0, x, y, z, w);
+
+    int errs = 0;
+    if (x != width * programIndex) ++errs;
+    if (y != 1 + width * programIndex) ++errs;
+    if (z != 2 + width * programIndex) ++errs;
+    if (w != 3 + width * programIndex) ++errs;
+
+    RET[programIndex] = errs; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
+
diff --git a/tests/aossoa-3.ispc b/tests/aossoa-3.ispc
new file mode 100644
index 00000000..416a5a8e
--- /dev/null
+++ b/tests/aossoa-3.ispc
@@ -0,0 +1,27 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+#define width 4
+//CO    const uniform int width = 4;
+    uniform float a[width*programCount], r[width*programCount];
+    for (uniform int i = 0; i < width*programCount; ++i)
+        a[i] = -1;
+
+    float x = width * programIndex;
+    float y = 1 + width * programIndex;
+    float z = 2 + width * programIndex;
+    float w = 3 + width * programIndex;
+
+    soa_to_aos4(x, y, z, w, a, 0);
+    uniform int errs = 0;
+    for (uniform int i = 0; i < width * programCount; ++i)
+        if (a[i] != i) ++errs;
+
+    RET[programIndex] = errs; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
+
diff --git a/tests/aossoa-4.ispc b/tests/aossoa-4.ispc
new file mode 100644
index 00000000..4646f1ca
--- /dev/null
+++ b/tests/aossoa-4.ispc
@@ -0,0 +1,26 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+#define width 3
+//CO    const uniform int width = 4;
+    uniform float a[width*programCount], r[width*programCount];
+    for (uniform int i = 0; i < width*programCount; ++i)
+        a[i] = -1;
+
+    float x = width * programIndex;
+    float y = 1 + width * programIndex;
+    float z = 2 + width * programIndex;
+
+    soa_to_aos3(x, y, z, a, 0);
+    uniform int errs = 0;
+    for (uniform int i = 0; i < width * programCount; ++i)
+        if (a[i] != i) ++errs;
+
+    RET[programIndex] = errs; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
+
diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc
new file mode 100644
index 00000000..4cadb502
--- /dev/null
+++ b/tests/aossoa-5.ispc
@@ -0,0 +1,27 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+#define width 3
+#define maxProgramCount 16
+//CO    const uniform int width = 3;
+//CO    const uniform int maxProgramCount = 16;
+    uniform int a[width*maxProgramCount], r[width*maxProgramCount];
+    for (uniform int i = 0; i < width*maxProgramCount; ++i)
+        a[i] = i;
+
+    int x=-1, y=-1, z=-1;
+    aos_to_soa3(a, 0, x, y, z);
+
+    int errs = 0;
+    if (x != width * programIndex) ++errs;
+    if (y != 1 + width * programIndex) ++errs;
+    if (z != 2 + width * programIndex) ++errs;
+
+    RET[programIndex] = errs; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
+
diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc
new file mode 100644
index 00000000..52269ba2
--- /dev/null
+++ b/tests/aossoa-6.ispc
@@ -0,0 +1,28 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+#define width 4
+#define maxProgramCount 16
+//CO    const uniform int width = 4;
+//CO    const uniform int maxProgramCount = 16;
+    uniform int a[width*maxProgramCount], r[width*maxProgramCount];
+    for (uniform int i = 0; i < width*maxProgramCount; ++i)
+        a[i] = i;
+
+    int x=-1, y=-1, z=-1, w=-1;
+    aos_to_soa4(a, 0, x, y, z, w);
+
+    int errs = 0;
+    if (x != width * programIndex) ++errs;
+    if (y != 1 + width * programIndex) ++errs;
+    if (z != 2 + width * programIndex) ++errs;
+    if (w != 3 + width * programIndex) ++errs;
+
+    RET[programIndex] = errs; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
+