;; Copyright (c) 2010-2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are ;; met: ;; ;; * Redistributions of source code must retain the above copyright ;; notice, this list of conditions and the following disclaimer. ;; ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; ;; * Neither the name of Intel Corporation nor the names of its ;; contributors may be used to endorse or promote products derived from ;; this software without specific prior written permission. ;; ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS ;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED ;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A ;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER ;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; This file provides a variety of macros used to generate LLVM bitcode ;; parametrized in various ways. Implementations of the standard library ;; builtins for various targets can use macros from this file to simplify ;; generating code for their implementations of those builtins. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; It is a bit of a pain to compute this in m4 for 32 and 64-wide targets...
;; NOTE(review): in this copy the text between the shift expression of the
;; all-on mask definition and the body of the 1-to-8 conversion is missing, and
;; so are the index-mask constants that should follow each <N x i32> below.
;; Restore them from the upstream file before building -- TODO confirm.
define(`ALL_ON_MASK', `ifelse(WIDTH, `64', `-1', WIDTH, `32', `4294967295', `eval((1< $2, <1 x $1> undef, <8 x i32> ')

;; Vector width conversion helpers. Each one expands to a single shufflevector
;; that turns an N-wide vector into an M-wide vector.
;; $1: vector element type
;; $2: N-wide source vector value
;; $3: name that receives the M-wide result
;; The operand type written in each shufflevector must be the real width of
;; the source value, i.e. the N in the macro name.

define(`convert1to16', `
  $3 = shufflevector <1 x $1> $2, <1 x $1> undef, <16 x i32>
')

define(`convert4to8', `
  $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <8 x i32>
')

define(`convert4to16', `
  $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <16 x i32>
')

define(`convert8to16', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <16 x i32>
')

define(`convert4to32', `
  $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <32 x i32>
')

;; 8-wide source: operands are typed <8 x $1> (previously mistyped as 4-wide).
define(`convert8to32', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <32 x i32>
')

;; 16-wide source: operands are typed <16 x $1> (previously mistyped as 4-wide).
define(`convert16to32', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <32 x i32>
')

define(`convert8to1', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <1 x i32>
')

define(`convert16to1', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <1 x i32>
')

define(`convert8to4', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <4 x i32>
')

define(`convert16to4', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32>
')

define(`convert16to8', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <8 x i32>
')

define(`convert32to4', `
  $3 = shufflevector <32 x $1> $2, <32 x $1> undef, <4 x i32>
')

define(`convert32to8', `
  $3 = shufflevector <32 x $1> $2, <32 x $1> undef, <8 x i32>
')

define(`convert32to16', `
  $3 = shufflevector <32 x $1> $2, <32 x $1> undef, <16 x i32>
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;saturation arithmetic
;; Selects the 4-, 8- or 16-wide implementation according to the target
;; width; for any other width it expands to nothing.

define(`saturation_arithmetic', `ifelse(WIDTH, `4', `saturation_arithmetic_vec4()', WIDTH, `8', `saturation_arithmetic_vec8()', WIDTH, `16', `saturation_arithmetic_vec16()')')

;; create vector constant. Used by saturation_arithmetic_novec_universal below.
define(`const_vector', ` ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>', WIDTH, `8', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', WIDTH, `16', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', WIDTH, `32', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', WIDTH, `64', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', `<$1 $2>')') ;; utility function used by saturation_arithmetic_novec below. This shouldn't be called by ;; target .ll files directly. 
;; $1: {add,sub} (used in constructing function names) define(`saturation_arithmetic_novec_universal', ` define @__p$1s_vi8(, ) { %v0_i16 = sext %0 to %v1_i16 = sext %1 to %res = $1 %v0_i16, %v1_i16 %over_mask = icmp sgt %res, const_vector(i16, 127) %over_res = select %over_mask, const_vector(i16, 127), %res %under_mask = icmp slt %res, const_vector(i16, -128) %ret_i16 = select %under_mask, const_vector(i16, -128), %over_res %ret = trunc %ret_i16 to ret %ret } define @__p$1s_vi16(, ) { %v0_i32 = sext %0 to %v1_i32 = sext %1 to %res = $1 %v0_i32, %v1_i32 %over_mask = icmp sgt %res, const_vector(i32, 32767) %over_res = select %over_mask, const_vector(i32, 32767), %res %under_mask = icmp slt %res, const_vector(i32, -32768) %ret_i32 = select %under_mask, const_vector(i32, -32768), %over_res %ret = trunc %ret_i32 to ret %ret } define @__p$1us_vi8(, ) { %v0_i16 = zext %0 to %v1_i16 = zext %1 to %res = $1 %v0_i16, %v1_i16 %over_mask = icmp ugt %res, const_vector(i16, 255) %over_res = select %over_mask, const_vector(i16, 255), %res %under_mask = icmp slt %res, const_vector(i16, 0) %ret_i16 = select %under_mask, const_vector(i16, 0), %over_res %ret = trunc %ret_i16 to ret %ret } define @__p$1us_vi16(, ) { %v0_i32 = zext %0 to %v1_i32 = zext %1 to %res = $1 %v0_i32, %v1_i32 %over_mask = icmp ugt %res, const_vector(i32, 65535) %over_res = select %over_mask, const_vector(i32, 65535), %res %under_mask = icmp slt %res, const_vector(i32, 0) %ret_i32 = select %under_mask, const_vector(i32, 0), %over_res %ret = trunc %ret_i32 to ret %ret } ') ;; implementation for targets which doesn't have h/w instructions define(`saturation_arithmetic_novec', ` saturation_arithmetic_novec_universal(sub) saturation_arithmetic_novec_universal(add) ') ;;4-wide vector saturation arithmetic define(`saturation_arithmetic_vec4', ` declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, 
%1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) ret <4 x i8> %r } declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) ret <4 x i16> %r } declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) ret <4 x i8> %r } declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) ret <4 x i16> %r } declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) ret <4 x i8> %r } declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) ret <4 x i16> %r } declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) ret <4 x i8> %r } declare <8 x i16> 
@llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) ret <4 x i16> %r } ') ;;8-wide vector saturation arithmetic define(`saturation_arithmetic_vec8', ` declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) ret <8 x i8> %r } declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res } declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) ret <8 x i8> %r } declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res } declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) ret <8 x i8> %r } declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x 
i16> %res } declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) ret <8 x i8> %r } declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res } ') ;;16-wide vector saturation arithmetic define(`saturation_arithmetic_vec16', ` declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret <16 x i16> %ret } declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret <16 x i16> %ret } declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> 
@__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret <16 x i16> %ret } declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret <16 x i16> %ret } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector deconstruction utilities ;; split 8-wide vector into 2 4-wide vectors ;; ;; $1: vector element type ;; $2: 8-wide vector ;; $3: first 4-wide vector ;; $4: second 4-wide vector define(`v8tov4', ` $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <4 x i32> $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <4 x i32> ') define(`v16tov8', ` $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <8 x i32> $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <8 x i32> ') define(`v4tov2', ` $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> ') define(`v8tov2', ` $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> ') define(`v16tov4', ` $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector assembly: wider vector from two narrower vectors ;; ;; $1: vector element type ;; $2: first 
n-wide vector
;; $3: second n-wide vector
;; $4: result 2*n-wide vector
;; NOTE(review): no index-mask constant follows the <N x i32> of any
;; shufflevector in this part of the file as seen here -- TODO confirm
;; against the upstream source.

define(`v8tov16', `
  $4 = shufflevector <8 x $1> $2, <8 x $1> $3, <16 x i32>
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Helper macro for calling various SSE instructions for scalar values
;; but where the instruction takes a vector parameter.
;; $1 : name of variable to put the final value in
;; $2 : vector width of the target
;; $3 : scalar type of the operand
;; $4 : SSE intrinsic name
;; $5 : variable name that has the scalar value
;; For example, the following call causes the variable %ret to have
;; the result of a call to sqrtss with the scalar value in %0
;;  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
;; The scalar is inserted into element 0 of an otherwise undef vector and the
;; result is read back from element 0; two temporaries named after $1 with the
;; suffixes _vec and _val are also defined.

define(`sse_unary_scalar', `
  %$1_vec = insertelement <$2 x $3> undef, $3 $5, i32 0
  %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_vec)
  %$1 = extractelement <$2 x $3> %$1_val, i32 0
')

;; Similar to `sse_unary_scalar', this helper macro is for calling binary
;; SSE instructions with scalar values,
;; $1: name of variable to put the result in
;; $2: vector width of the target
;; $3: scalar type of the operand
;; $4 : SSE intrinsic name
;; $5 : variable name that has the first scalar operand
;; $6 : variable name that has the second scalar operand
;; Temporaries named after $1 with the suffixes _veca, _vecb and _val are
;; also defined.

define(`sse_binary_scalar', `
  %$1_veca = insertelement <$2 x $3> undef, $3 $5, i32 0
  %$1_vecb = insertelement <$2 x $3> undef, $3 $6, i32 0
  %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_veca, <$2 x $3> %$1_vecb)
  %$1 = extractelement <$2 x $3> %$1_val, i32 0
')

;; Do a reduction over a 4-wide vector
;; $1: type of final scalar result
;; $2: 4-wide function that takes 2 4-wide operands and returns the
;;     element-wise reduction
;; $3: scalar function that takes two scalar operands and returns
;;     the final reduction
;; The expansion reads the input vector from %0 and ends with a ret, so it
;; forms the tail of the function it is placed in.

define(`reduce4', `
  %v1 = shufflevector <4 x $1> %0, <4 x $1> undef, <4 x i32>
  %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %0)
  %m1a = extractelement <4 x $1> %m1, i32 0
  %m1b = extractelement <4 x $1> %m1, i32 1
  %m = call $1 $3($1 %m1a, $1 %m1b)
  ret $1 %m
'
)

;; Similar to `reduce4', do a reduction over an 8-wide vector
;; $1: type of final scalar result
;; $2: 8-wide function that takes 2 8-wide operands and returns the
;;     element-wise reduction
;; $3: scalar function that takes two scalar operands and returns
;;     the final reduction
;; Two shuffle-and-combine rounds are emitted before the final scalar call;
;; like the 4-wide version it reads %0 and ends with a ret.

define(`reduce8', `
  %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, <8 x i32>
  %m1 = call <8 x $1> $2(<8 x $1> %v1, <8 x $1> %0)
  %v2 = shufflevector <8 x $1> %m1, <8 x $1> undef, <8 x i32>
  %m2 = call <8 x $1> $2(<8 x $1> %v2, <8 x $1> %m1)
  %m2a = extractelement <8 x $1> %m2, i32 0
  %m2b = extractelement <8 x $1> %m2, i32 1
  %m = call $1 $3($1 %m2a, $1 %m2b)
  ret $1 %m
'
)

;; Similar to `reduce8', do a reduction over a 16-wide vector
;; $1: type of final scalar result
;; $2: 16-wide function that takes 2 16-wide operands and returns the
;;     element-wise reduction
;; $3: scalar function that takes two scalar operands and returns
;;     the final reduction
;; Three shuffle-and-combine rounds are emitted, then $3 is applied to
;; elements 0 and 1 of the last round; reads %0 and ends with a ret.

define(`reduce16', `
  %v1 = shufflevector <16 x $1> %0, <16 x $1> undef, <16 x i32>
  %m1 = call <16 x $1> $2(<16 x $1> %v1, <16 x $1> %0)
  %v2 = shufflevector <16 x $1> %m1, <16 x $1> undef, <16 x i32>
  %m2 = call <16 x $1> $2(<16 x $1> %v2, <16 x $1> %m1)
  %v3 = shufflevector <16 x $1> %m2, <16 x $1> undef, <16 x i32>
  %m3 = call <16 x $1> $2(<16 x $1> %v3, <16 x $1> %m2)
  %m3a = extractelement <16 x $1> %m3, i32 0
  %m3b = extractelement <16 x $1> %m3, i32 1
  %m = call $1 $3($1 %m3a, $1 %m3b)
  ret $1 %m
'
)

;; Do a reduction over an 8-wide vector, using a vector reduction function
;; that only takes 4-wide vectors
;; $1: type of final scalar result
;; $2: 4-wide function that takes 2 4-wide operands and returns the
;;     element-wise reduction
;; $3: scalar function that takes two scalar operands and returns
;;     the final reduction
;; The 8-wide input in %0 is first split into two 4-wide vectors, which are
;; combined with $2; the remainder follows the 4-wide scheme and ends with a
;; ret.

define(`reduce8by4', `
  v8tov4($1, %0, %v1, %v2)
  %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2)
  %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32>
  %m2 = call <4 x $1> $2(<4 x $1> %v3, <4 x $1> %m1)
  %m2a = extractelement <4 x $1> %m2, i32 0
  %m2b = extractelement <4 x $1> %m2, i32 1
  %m = call $1 $3($1 %m2a, $1 %m2b)
  ret $1 %m
'
)

;; Apply a unary function to the 4-vector in %0, return the vector result.
;; $1: scalar type of result
;; $2: name of scalar function to call
;; Each element of %0 is extracted, passed to $2, and inserted at the same
;; position of the result vector; the expansion ends with a ret of that
;; vector, so it forms the tail of the function it is placed in.

define(`unary1to4', `
  %v_0 = extractelement <4 x $1> %0, i32 0
  %r_0 = call $1 $2($1 %v_0)
  %ret_0 = insertelement <4 x $1> undef, $1 %r_0, i32 0
  %v_1 = extractelement <4 x $1> %0, i32 1
  %r_1 = call $1 $2($1 %v_1)
  %ret_1 = insertelement <4 x $1> %ret_0, $1 %r_1, i32 1
  %v_2 = extractelement <4 x $1> %0, i32 2
  %r_2 = call $1 $2($1 %v_2)
  %ret_2 = insertelement <4 x $1> %ret_1, $1 %r_2, i32 2
  %v_3 = extractelement <4 x $1> %0, i32 3
  %r_3 = call $1 $2($1 %v_3)
  %ret_3 = insertelement <4 x $1> %ret_2, $1 %r_3, i32 3
  ret <4 x $1> %ret_3
')

;; 8-wide counterpart of `unary1to4': apply a unary scalar function to each
;; element of the 8-vector in %0 and return the 8-wide vector result.
;; $1: scalar type of result
;; $2: name of scalar function to call

define(`unary1to8', `
  %v_0 = extractelement <8 x $1> %0, i32 0
  %r_0 = call $1 $2($1 %v_0)
  %ret_0 = insertelement <8 x $1> undef, $1 %r_0, i32 0
  %v_1 = extractelement <8 x $1> %0, i32 1
  %r_1 = call $1 $2($1 %v_1)
  %ret_1 = insertelement <8 x $1> %ret_0, $1 %r_1, i32 1
  %v_2 = extractelement <8 x $1> %0, i32 2
  %r_2 = call $1 $2($1 %v_2)
  %ret_2 = insertelement <8 x $1> %ret_1, $1 %r_2, i32 2
  %v_3 = extractelement <8 x $1> %0, i32 3
  %r_3 = call $1 $2($1 %v_3)
  %ret_3 = insertelement <8 x $1> %ret_2, $1 %r_3, i32 3
  %v_4 = extractelement <8 x $1> %0, i32 4
  %r_4 = call $1 $2($1 %v_4)
  %ret_4 = insertelement <8 x $1> %ret_3, $1 %r_4, i32 4
  %v_5 = extractelement <8 x $1> %0, i32 5
  %r_5 = call $1 $2($1 %v_5)
  %ret_5 = insertelement <8 x $1> %ret_4, $1 %r_5, i32 5
  %v_6 = extractelement <8 x $1> %0, i32 6
  %r_6 = call $1 $2($1 %v_6)
  %ret_6 = insertelement <8 x $1> %ret_5, $1 %r_6, i32 6
  %v_7 = extractelement <8 x $1> %0, i32 7
  %r_7 = call $1 $2($1 %v_7)
  %ret_7 = insertelement <8 x $1> %ret_6, $1 %r_7, i32 7
  ret <8 x $1> %ret_7
')

;; Given a unary function that takes a 2-wide vector and a 4-wide vector
;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide
;; vector, apply it, and return the corresponding 4-wide vector result
;; $1: name of variable into which the final result should go
;; $2: scalar type of the vector elements
;; $3: 2-wide unary vector function to apply
;; $4: 4-wide operand value
;; Unlike the reduce and per-element macros, this family emits no ret: the
;; result is left in the variable named by $1, and the temporaries are named
;; after $1 as well.
;; NOTE(review): no index-mask constant follows the <N x i32> of any
;; shufflevector in this part of the file as seen here -- TODO confirm
;; against the upstream source.

define(`unary2to4', `
  %$1_0 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32>
  %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0)
  %$1_1 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32>
  %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1)
  %$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32>
'
)

;; Similar to `unary2to4', this applies a 2-wide binary function to two 4-wide
;; vector operands
;; $1: name of variable into which the final result should go
;; $2: scalar type of the vector elements
;; $3: 2-wide binary vector function to apply
;; $4: First 4-wide operand value
;; $5: Second 4-wide operand value

define(`binary2to4', `
  %$1_0a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32>
  %$1_0b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32>
  %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b)
  %$1_1a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32>
  %$1_1b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32>
  %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b)
  %$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32>
'
)

;; Similar to `unary2to4', this maps a 4-wide unary function to an 8-wide
;; vector operand
;; $1: name of variable into which the final result should go
;; $2: scalar type of the vector elements
;; $3: 4-wide unary vector function to apply
;; $4: 8-wide operand value

define(`unary4to8', `
  %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32>
  %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0)
  %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32>
  %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1)
  %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32>
'
)

;; Variant of `unary4to8' for a function whose result element type differs
;; from its operand element type: the call takes <4 x $2> and returns <4 x $3>.
;; $1: name of variable into which the final result should go
;; $2: scalar type of the input vector elements
;; $3: scalar type of the result vector elements
;; $4: 4-wide unary vector function to apply
;; $5: 8-wide operand value

define(`unary4to8conv', `
  %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32>
  %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0)
  %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32>
  %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1)
  %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32>
'
)

;; Maps a 4-wide unary function to a 16-wide vector operand: four 4-wide
;; vectors are shuffled out of the operand, the function is called on each,
;; and the four results are joined pairwise into 8-wide and then 16-wide.
;; $1: name of variable into which the final result should go
;; $2: scalar type of the vector elements
;; $3: 4-wide unary vector function to apply
;; $4: 16-wide operand value

define(`unary4to16', `
  %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32>
  %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0)
  %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32>
  %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1)
  %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32>
  %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2)
  %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32>
  %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3)

  %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32>
  %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32>
  %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, <16 x i32>
'
)

;; Variant of `unary4to16' for a function whose result element type differs
;; from its operand element type: each call takes <4 x $2> and returns <4 x $3>.
;; $1: name of variable into which the final result should go
;; $2: scalar type of the input vector elements
;; $3: scalar type of the result vector elements
;; $4: 4-wide unary vector function to apply
;; $5: 16-wide operand value

define(`unary4to16conv', `
  %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32>
  %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0)
  %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32>
  %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1)
  %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32>
  %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2)
  %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32>
  %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3)

  %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32>
  %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, <8 x i32>
  %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32>
'
)

;; And so forth...
;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 8-wide unary vector function to apply ;; $4: 16-wide operand value define(`unary8to16', ` %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0) %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1) %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, <16 x i32> ' ) ;; And along the lines of `binary2to4', this maps a 4-wide binary function to ;; two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 4-wide unary vector function to apply ;; $4: First 8-wide operand value ;; $5: Second 8-wide operand value define(`binary4to8', ` %$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> %$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) %$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> %$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, <8 x i32> ' ) define(`binary8to16', ` %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <8 x i32> %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b) %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <8 x i32> %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b) %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, <16 x i32> ' ) define(`binary4to16', ` %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %r$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, 
<4 x i32> %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %r$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %r$1_2 = call <4 x $2> $3(<4 x $2> %$1_2a, <4 x $2> %$1_2b) %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %r$1_3 = call <4 x $2> $3(<4 x $2> %$1_3a, <4 x $2> %$1_3b) %r$1_01 = shufflevector <4 x $2> %r$1_0, <4 x $2> %r$1_1, <8 x i32> %r$1_23 = shufflevector <4 x $2> %r$1_2, <4 x $2> %r$1_3, <8 x i32> %$1 = shufflevector <8 x $2> %r$1_01, <8 x $2> %r$1_23, <16 x i32> ') ;; Maps a 2-wide unary function to an 8-wide vector operand, returning an ;; 8-wide vector result ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 2-wide unary vector function to apply ;; $4: 8-wide operand value define(`unary2to8', ` %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) %$1_2 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) %$1_3 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> ' ) define(`unary2to16', ` %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) %$1_3 = shufflevector <16 x $2> $4, <16 x 
$2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, <4 x i32> %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, <4 x i32> %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, <8 x i32> %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, <16 x i32> ' ) ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 2-wide unary vector function to apply ;; $4: First 8-wide operand value ;; $5: Second 8-wide operand value define(`binary2to8', ` %$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) %$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) %$1_2a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %$1_2b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) %$1_3a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) %$1a = shufflevector <2 
x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> ' ) define(`binary2to16', ` %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, <4 x i32> %$1d = shufflevector <2 x $2> %v$1_6, <2 x 
$2> %v$1_7, <4 x i32> %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, <8 x i32> %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, <16 x i32> ' ) ;; The unary SSE round intrinsic takes a second argument that encodes the ;; rounding mode. This macro makes it easier to apply the 4-wide roundps ;; to 8-wide vector operands ;; $1: value to be rounded ;; $2: integer encoding of rounding mode ;; FIXME: this just has a ret statement at the end to return the result, ;; which is inconsistent with the macros above define(`round4to8', ` %v0 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> %v1 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> %r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) %r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) %ret = shufflevector <4 x float> %r0, <4 x float> %r1, <8 x i32> ret <8 x float> %ret ' ) define(`round4to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> %v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> %v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> %v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> %r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) %r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) %r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) %r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) %ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, <8 x i32> %ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, <8 x i32> %ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, <16 x i32> ret <16 x float> %ret ' ) define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> %v1 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> %r0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v0, i32 $2) %r1 = call <8 x float> 
@llvm.x86.avx.round.ps.256(<8 x float> %v1, i32 $2) %ret = shufflevector <8 x float> %r0, <8 x float> %r1, <16 x i32> ret <16 x float> %ret ' ) define(`round4to8double', ` %v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> %v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> %r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) %r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) %ret = shufflevector <4 x double> %r0, <4 x double> %r1, <8 x i32> ret <8 x double> %ret ' ) ; and similarly for doubles... define(`round2to4double', ` %v0 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> %v1 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> %r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) %r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) %ret = shufflevector <2 x double> %r0, <2 x double> %r1, <4 x i32> ret <4 x double> %ret ' ) define(`round2to8double', ` %v0 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> %v1 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> %v2 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> %v3 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> %r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) %r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) %r2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v2, i32 $2) %r3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v3, i32 $2) %ret0 = shufflevector <2 x double> %r0, <2 x double> %r1, <4 x i32> %ret1 = shufflevector <2 x double> %r2, <2 x double> %r3, <4 x i32> %ret = shufflevector <4 x double> %ret0, <4 x double> %ret1, <8 x i32> ret <8 x double> %ret ' ) define(`round4to16double', ` %v0 = shufflevector <16 x double> $1, <16 x double> undef, <4 x i32> %v1 = shufflevector <16 x double> $1, <16 x double> undef, <4 x i32> %v2 = 
shufflevector <16 x double> $1, <16 x double> undef, <4 x i32> %v3 = shufflevector <16 x double> $1, <16 x double> undef, <4 x i32> %r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) %r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) %r2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v2, i32 $2) %r3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v3, i32 $2) %ret0 = shufflevector <4 x double> %r0, <4 x double> %r1, <8 x i32> %ret1 = shufflevector <4 x double> %r2, <4 x double> %r3, <8 x i32> %ret = shufflevector <8 x double> %ret0, <8 x double> %ret1, <16 x i32> ret <16 x double> %ret ' ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; forloop macro divert(`-1') # forloop(var, from, to, stmt) - improved version: # works even if VAR is not a strict macro name # performs sanity check that FROM is not larger than TO # allows complex numerical expressions in TO and FROM define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1', `pushdef(`$1', eval(`$2'))_$0(`$1', eval(`$3'), `$4')popdef(`$1')')') define(`_forloop', `$3`'ifelse(indir(`$1'), `$2', `', `define(`$1', incr(indir(`$1')))$0($@)')') divert`'dnl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stdlib_core ;; ;; This macro defines a bunch of helper routines that depend on the ;; target's vector width ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`shuffles', ` define @__broadcast_$1(, i32) nounwind readnone alwaysinline { %v = extractelement %0, i32 %1 %broadcast_init = insertelement undef, $1 %v, i32 0 %broadcast = shufflevector %broadcast_init, undef, zeroinitializer ret %broadcast } define @__rotate_$1(, i32) nounwind readnone alwaysinline { %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1) br i1 %isc, label %is_const, label %not_const is_const: ; though verbose, this turns into tight code if %1 is a constant forloop(i, 0,
eval(WIDTH-1), ` %delta_`'i = add i32 %1, i %delta_clamped_`'i = and i32 %delta_`'i, eval(WIDTH-1) %v_`'i = extractelement %0, i32 %delta_clamped_`'i') %ret_0 = insertelement undef, $1 %v_0, i32 0 forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') ret %ret_`'eval(WIDTH-1) not_const: ; store two instances of the vector into memory %ptr = alloca , i32 2 %ptr0 = getelementptr * %ptr, i32 0 store %0, * %ptr0 %ptr1 = getelementptr * %ptr, i32 1 store %0, * %ptr1 ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector %offset = and i32 %1, eval(WIDTH-1) %ptr_as_elt_array = bitcast * %ptr to [eval(2*WIDTH) x $1] * %load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset %load_ptr_vec = bitcast $1 * %load_ptr to * %result = load * %load_ptr_vec, align $2 ret %result } define @__shift_$1(, i32) nounwind readnone alwaysinline { %ptr = alloca , i32 3 %ptr0 = getelementptr * %ptr, i32 0 store zeroinitializer, * %ptr0 %ptr1 = getelementptr * %ptr, i32 1 store %0, * %ptr1 %ptr2 = getelementptr * %ptr, i32 2 store zeroinitializer, * %ptr2 %offset = add i32 %1, WIDTH %ptr_as_elt_array = bitcast * %ptr to [eval(3*WIDTH) x $1] * %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset %load_ptr_vec = bitcast $1 * %load_ptr to * %result = load * %load_ptr_vec, align $2 ret %result } define @__shuffle_$1(, ) nounwind readnone alwaysinline { forloop(i, 0, eval(WIDTH-1), ` %index_`'i = extractelement %1, i32 i') forloop(i, 0, eval(WIDTH-1), ` %v_`'i = extractelement %0, i32 %index_`'i') %ret_0 = insertelement undef, $1 %v_0, i32 0 forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') ret %ret_`'eval(WIDTH-1) } define @__shuffle2_$1(, , ) nounwind readnone alwaysinline { %v2 = shufflevector %0, %1, < forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1) > forloop(i, 0, eval(WIDTH-1), ` %index_`'i = extractelement %2, 
i32 i') %isc = call i1 @__is_compile_time_constant_varying_int32( %2) br i1 %isc, label %is_const, label %not_const is_const: ; extract from the requested lanes and insert into the result; LLVM turns ; this into good code in the end forloop(i, 0, eval(WIDTH-1), ` %v_`'i = extractelement %v2, i32 %index_`'i') %ret_0 = insertelement undef, $1 %v_0, i32 0 forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') ret %ret_`'eval(WIDTH-1) not_const: ; otherwise store the two vectors onto the stack and then use the given ; permutation vector to get indices into that array... %ptr = alloca store %v2, * %ptr %baseptr = bitcast * %ptr to $1 * %ptr_0 = getelementptr $1 * %baseptr, i32 %index_0 %val_0 = load $1 * %ptr_0 %result_0 = insertelement undef, $1 %val_0, i32 0 forloop(i, 1, eval(WIDTH-1), ` %ptr_`'i = getelementptr $1 * %baseptr, i32 %index_`'i %val_`'i = load $1 * %ptr_`'i %result_`'i = insertelement %result_`'eval(i-1), $1 %val_`'i, i32 i ') ret %result_`'eval(WIDTH-1) } ') define(`define_shuffles',` shuffles(i8, 1) shuffles(i16, 2) shuffles(float, 4) shuffles(i32, 4) shuffles(double, 8) shuffles(i64, 8) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; global_atomic_associative ;; More efficient implementation for atomics that are associative (e.g., ;; add, and, ...). If a basic implementation would do something like: ;; result0 = atomic_op(ptr, val0) ;; result1 = atomic_op(ptr, val1) ;; .. ;; Then instead we can do: ;; tmp = (val0 op val1 op ...) ;; result0 = atomic_op(ptr, tmp) ;; result1 = (result0 op val0) ;; .. ;; And more efficiently compute the same result ;; ;; Takes five parameters: ;; $1: vector width of the target ;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names) ;; (add, sub...) ;; $3: return type of the LLVM atomic (e.g. i32) ;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32) ;; $5: identity value for the operator (e.g.
;; 0 for add, -1 for AND, ...)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; mask conversion helpers
;;
;; Defines internal functions named @convertmask_<src>_<dst>_<width>, one
;; for every source element type in {i1, i8, i16, i32, i64} and every
;; destination element type in {i8, i16, i32, i64}.  Each one converts a
;; mask vector from the source element type to the destination element
;; type: sext when widening, trunc when narrowing, and a plain return when
;; the two types are the same.  The associative vector atomic further down
;; calls these so that the mask has the same element size as the data.
;;
;; Takes one parameter:
;; $1: vector width of the target

define(`mask_converts', `
define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
  %r = sext <$1 x i1> %0 to <$1 x i8>
  ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) {
  %r = sext <$1 x i1> %0 to <$1 x i16>
  ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) {
  %r = sext <$1 x i1> %0 to <$1 x i32>
  ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) {
  %r = sext <$1 x i1> %0 to <$1 x i64>
  ret <$1 x i64> %r
}

define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) {
  ret <$1 x i8> %0
}
define internal <$1 x i16> @convertmask_i8_i16_$1(<$1 x i8>) {
  %r = sext <$1 x i8> %0 to <$1 x i16>
  ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) {
  %r = sext <$1 x i8> %0 to <$1 x i32>
  ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) {
  %r = sext <$1 x i8> %0 to <$1 x i64>
  ret <$1 x i64> %r
}

define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) {
  %r = trunc <$1 x i16> %0 to <$1 x i8>
  ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) {
  ret <$1 x i16> %0
}
define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) {
  %r = sext <$1 x i16> %0 to <$1 x i32>
  ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) {
  %r = sext <$1 x i16> %0 to <$1 x i64>
  ret <$1 x i64> %r
}

define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) {
  %r = trunc <$1 x i32> %0 to <$1 x i8>
  ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) {
  %r = trunc <$1 x i32> %0 to <$1 x i16>
  ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) {
  ret <$1 x i32> %0
}
define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
  %r = sext <$1 x i32> %0 to <$1 x i64>
  ret <$1 x i64> %r
}

define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) {
  %r = trunc <$1 x i64> %0 to <$1 x i8>
  ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) {
  %r = trunc <$1 x i64> %0 to <$1 x i16>
  ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) {
  %r = trunc <$1 x i64> %0 to <$1 x i32>
  ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
  ret <$1 x i64> %0
}
')

mask_converts(WIDTH)

;; The associative vector atomic itself.  Its five parameters are described
;; in the comment block that precedes the mask conversion helpers above.

define(`global_atomic_associative', `

define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
                                        <$1 x MASK> %m) nounwind alwaysinline {
  ; first, for any lanes where the mask is off, compute a vector where those lanes
  ; hold the identity value..

  ; for the bit tricks below, we need the mask to have the
  ; same element size as the element type.
  %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m)

  ; zero out any lanes that are off
  %valoff = and <$1 x $3> %val, %mask

  ; compute an identity vector that is zero in on lanes and has the identity value
  ; in the off lanes
  %idv1 = bitcast $3 $5 to <1 x $3>
  %idvec = shufflevector <1 x $3> %idv1, <1 x $3> undef,
     <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
  %notmask = xor <$1 x $3> %mask, < forloop(i, 1, eval($1-1), `$3 -1, ') $3 -1 >
  %idoff = and <$1 x $3> %idvec, %notmask

  ; and compute the merged vector that holds the identity in the off lanes
  %valp = or <$1 x $3> %valoff, %idoff

  ; now compute the local reduction (val0 op val1 op ... )--initialize
  ; %eltvec so that the 0th element is the identity, the first is val0,
  ; the second is (val0 op val1), ..
  %red0 = extractelement <$1 x $3> %valp, i32 0
  %eltvec0 = insertelement <$1 x $3> undef, $3 $5, i32 0

  forloop(i, 1, eval($1-1), `
  %elt`'i = extractelement <$1 x $3> %valp, i32 i
  %red`'i = $2 $3 %red`'eval(i-1), %elt`'i
  %eltvec`'i = insertelement <$1 x $3> %eltvec`'eval(i-1), $3 %red`'eval(i-1), i32 i')

  ; make the atomic call, passing it the final reduced value
  %final0 = atomicrmw $2 $3 * %ptr, $3 %red`'eval($1-1) seq_cst

  ; now go back and compute the values to be returned for each program
  ; instance--this just involves smearing the old value returned from the
  ; actual atomic call across the vector and applying the vector op to the
  ; %eltvec vector computed above..
  %finalv1 = bitcast $3 %final0 to <1 x $3>
  %final_base = shufflevector <1 x $3> %finalv1, <1 x $3> undef,
     <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
  %r = $2 <$1 x $3> %final_base, %eltvec`'eval($1-1)

  ret <$1 x $3> %r
}
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic_uniform
;; Defines the implementation of a function that handles the mapping from
;; an ispc atomic function to the underlying LLVM intrinsics.  This variant
;; just calls the atomic once, for the given uniform value
;;
;; Takes four parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;;     (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)

define(`global_atomic_uniform', `
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
  %r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst
  ret $3 %r
}
')

;; Macro to declare the function that implements the swap atomic.
;; Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g.
;; int32)

define(`global_swap', `
define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline {
  ; one atomic xchg; the value that was in memory beforehand is returned
  %old_val = atomicrmw xchg $2 * %ptr, $2 %val seq_cst
  ret $2 %old_val
}
')

;; Compare/exchange atomics.  Emits two functions: a varying one that issues
;; one cmpxchg per lane and gathers the per-lane results into a vector, and
;; a uniform one that issues a single cmpxchg.
;;
;; Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)

define(`global_atomic_exchange', `

define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
                               <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline {
  ; stack slot for the result vector, also viewed as a pointer to its elements
  %rptr = alloca <$1 x $2>
  %rptr32 = bitcast <$1 x $2> * %rptr to $2 *

  per_lane($1, <$1 x MASK> %mask, `
   %cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE
   %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
   %r_LANE_ID = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst
   %rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
   store $2 %r_LANE_ID, $2 * %rp_LANE_ID')

  ; read the assembled results back as one vector
  %r = load <$1 x $2> * %rptr
  ret <$1 x $2> %r
}

define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
                                                       $2 %val) nounwind alwaysinline {
  %old_val = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst
  ret $2 %old_val
}
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; count trailing zeros
;;
;; Thin wrappers over the llvm.cttz and llvm.ctlz intrinsics (trailing and
;; leading zero counts) for i32 and i64 operands.

define(`ctlztz', `
declare_count_zeros()

define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline {
  %num_zeros = call i32 @llvm.cttz.i32(i32 %0)
  ret i32 %num_zeros
}

define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline {
  %num_zeros = call i64 @llvm.cttz.i64(i64 %0)
  ret i64 %num_zeros
}

define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline {
  %num_zeros = call i32 @llvm.ctlz.i32(i32 %0)
  ret i32 %num_zeros
}

define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline {
  %num_zeros = call i64 @llvm.ctlz.i64(i64 %0)
  ret i64 %num_zeros
}
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetching

define(`define_prefetches', `
declare void @llvm.prefetch(i8*
nocapture %ptr, i32 %readwrite, i32 %locality, i32 %cachetype) ; cachetype == 1 is dcache define void @__prefetch_read_uniform_1(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) ret void } define void @__prefetch_read_uniform_2(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) ret void } define void @__prefetch_read_uniform_3(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) ret void } define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) ret void } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; AOS/SOA conversion primitives ;; take 4 4-wide vectors laid out like ... ;; and reorder them to ... define(`aossoa', `define void @__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> * noalias %out0, <4 x float> * noalias %out1, <4 x float> * noalias %out2, <4 x float> * noalias %out3) nounwind alwaysinline { %t0 = shufflevector <4 x float> %v2, <4 x float> %v3, ; r2 r3 g2 g3 <4 x i32> %t1 = shufflevector <4 x float> %v2, <4 x float> %v3, ; b2 b3 a2 a3 <4 x i32> %t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; r0 r1 g0 g1 <4 x i32> %t3 = shufflevector <4 x float> %v0, <4 x float> %v1, ; b0 b1 a0 a1 <4 x i32> %r0 = shufflevector <4 x float> %t2, <4 x float> %t0, ; r0 r1 r2 r3 <4 x i32> store <4 x float> %r0, <4 x float> * %out0 %r1 = shufflevector <4 x float> %t2, <4 x float> %t0, ; g0 g1 g2 g3 <4 x i32> store <4 x float> %r1, <4 x float> * %out1 %r2 = shufflevector <4 x float> %t3, <4 x float> %t1, ; b0 b1 b2 b3 <4 x i32> store <4 x float> %r2, <4 x float> * %out2 %r3 = shufflevector <4 x float> %t3, <4 x float> %t1, ; a0 a1 a2 a3 <4 x i32> store <4 x float> %r3, <4 x float> * %out3 ret void } ;; Do the reverse of __aos_to_soa4_float4--reorder .. ;; to ... 
;; This is the exact same set of operations that __aos_to_soa4_float4 does ;; (a 4x4 transpose), so just call that... define void @__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> * noalias %out0, <4 x float> * noalias %out1, <4 x float> * noalias %out2, <4 x float> * noalias %out3) nounwind alwaysinline { call void @__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> * %out0, <4 x float> * %out1, <4 x float> * %out2, <4 x float> * %out3) ret void } ;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors ;; , transpose to ;; . define void @__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> * noalias %out0, <4 x float> * noalias %out1, <4 x float> * noalias %out2) nounwind alwaysinline { %t0 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x0 x1 y0 y1 <4 x i32> %t1 = shufflevector <4 x float> %v1, <4 x float> %v2, ; x2 x3 y2 y3 <4 x i32> %r0 = shufflevector <4 x float> %t0, <4 x float> %t1, ; x0 x1 x2 x3 <4 x i32> store <4 x float> %r0, <4 x float> * %out0 %r1 = shufflevector <4 x float> %t0, <4 x float> %t1, ; y0 y1 y2 y3 <4 x i32> store <4 x float> %r1, <4 x float> * %out1 %t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; z0 z1 x x <4 x i32> %r2 = shufflevector <4 x float> %t2, <4 x float> %v2, ; z0 z1 z2 z3 <4 x i32> store <4 x float> %r2, <4 x float> * %out2 ret void } ;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors ;; to ;; .
define void @__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> * noalias %out0, <4 x float> * noalias %out1, <4 x float> * noalias %out2) nounwind alwaysinline { %t0 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x0 x1 x2 y0 <4 x i32> %t1 = shufflevector <4 x float> %v1, <4 x float> %v2, ; y1 y2 z0 z1 <4 x i32> %r0 = shufflevector <4 x float> %t0, <4 x float> %t1, ; x0 y0 z0 x1 <4 x i32> store <4 x float> %r0, <4 x float> * %out0 %r1 = shufflevector <4 x float> %t0, <4 x float> %t1, ; y1 z1 x2 y2 <4 x i32> store <4 x float> %r1, <4 x float> * %out1 %t2 = shufflevector <4 x float> %v0, <4 x float> %v1, ; x3 y3 x x <4 x i32> %r2 = shufflevector <4 x float> %t2, <4 x float> %v2, ; z2 x3 y3 z3 <4 x i32> store <4 x float> %r2, <4 x float> * %out2 ret void } ;; 8-wide ;; These functions implement the 8-wide variants of the AOS/SOA conversion ;; routines above. These implementations are all built on top of the 4-wide ;; vector versions. define void @__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, <8 x float> %v3, <8 x float> * noalias %out0, <8 x float> * noalias %out1, <8 x float> * noalias %out2, <8 x float> * noalias %out3) nounwind alwaysinline { ;; Split each 8-vector into 2 4-vectors %v0a = shufflevector <8 x float> %v0, <8 x float> undef, <4 x i32> %v0b = shufflevector <8 x float> %v0, <8 x float> undef, <4 x i32> %v1a = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> %v1b = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> %v2a = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> %v2b = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> %v3a = shufflevector <8 x float> %v3, <8 x float> undef, <4 x i32> %v3b = shufflevector <8 x float> %v3, <8 x float> undef, <4 x i32> ;; Similarly for the output pointers %out0a = bitcast <8 x float> * %out0 to <4 x float> * %out0b = getelementptr <4 x float> * %out0a, i32 1 %out1a = bitcast <8 x float> * %out1 to <4 x float> * 
%out1b = getelementptr <4 x float> * %out1a, i32 1 %out2a = bitcast <8 x float> * %out2 to <4 x float> * %out2b = getelementptr <4 x float> * %out2a, i32 1 %out3a = bitcast <8 x float> * %out3 to <4 x float> * %out3b = getelementptr <4 x float> * %out3a, i32 1 ;; Do the first part--given input vectors like ;; , ;; pass 4 4-vectors to the 4-vec ;; version to compute the first 4 SOA values for the four output variables. call void @__aos_to_soa4_float4(<4 x float> %v0a, <4 x float> %v0b, <4 x float> %v1a, <4 x float> %v1b, <4 x float> * %out0a, <4 x float> * %out1a, <4 x float> * %out2a, <4 x float> * %out3a) ;; And similarly pass to the 4-wide ;; version to compute the second 4 SOA values for the four outputs call void @__aos_to_soa4_float4(<4 x float> %v2a, <4 x float> %v2b, <4 x float> %v3a, <4 x float> %v3b, <4 x float> * %out0b, <4 x float> * %out1b, <4 x float> * %out2b, <4 x float> * %out3b) ret void } define void @__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, <8 x float> %v3, <8 x float> * noalias %out0, <8 x float> * noalias %out1, <8 x float> * noalias %out2, <8 x float> * noalias %out3) nounwind alwaysinline { ;; As above, split into 4-vectors and 4-wide outputs...
%v0a = shufflevector <8 x float> %v0, <8 x float> undef, <4 x i32> %v0b = shufflevector <8 x float> %v0, <8 x float> undef, <4 x i32> %v1a = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> %v1b = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> %v2a = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> %v2b = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> %v3a = shufflevector <8 x float> %v3, <8 x float> undef, <4 x i32> %v3b = shufflevector <8 x float> %v3, <8 x float> undef, <4 x i32> %out0a = bitcast <8 x float> * %out0 to <4 x float> * %out0b = getelementptr <4 x float> * %out0a, i32 1 %out1a = bitcast <8 x float> * %out1 to <4 x float> * %out1b = getelementptr <4 x float> * %out1a, i32 1 %out2a = bitcast <8 x float> * %out2 to <4 x float> * %out2b = getelementptr <4 x float> * %out2a, i32 1 %out3a = bitcast <8 x float> * %out3 to <4 x float> * %out3b = getelementptr <4 x float> * %out3a, i32 1 ;; First part--given input vectors ;; ;; pass 4 4-vectors to ;; compute the first 16 AOS output values. call void @__soa_to_aos4_float4(<4 x float> %v0a, <4 x float> %v1a, <4 x float> %v2a, <4 x float> %v3a, <4 x float> * %out0a, <4 x float> * %out0b, <4 x float> * %out1a, <4 x float> * %out1b) ;; And then pass the 4 4-vectors ;; To compute the next 16 AOS output values call void @__soa_to_aos4_float4(<4 x float> %v0b, <4 x float> %v1b, <4 x float> %v2b, <4 x float> %v3b, <4 x float> * %out2a, <4 x float> * %out2b, <4 x float> * %out3a, <4 x float> * %out3b) ret void } define void @__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, <8 x float> * noalias %out0, <8 x float> * noalias %out1, <8 x float> * noalias %out2) nounwind alwaysinline { %v0a = shufflevector <8 x float> %v0, <8 x float> undef, <4 x i32> %v0b = shufflevector <8 x float> %v0, <8 x float> undef, <4 x i32>