;; Copyright (c) 2010-2016, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;; This file provides a variety of macros used to generate LLVM bitcode
;; parametrized in various ways.  Implementations of the standard library
;; builtins for various targets can use macros from this file to simplify
;; generating code for their implementations of those builtins.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; It is a bit of a pain to compute this in m4 for 32 and 64-wide targets...
define(`ALL_ON_MASK',
`ifelse(WIDTH, `64', `-1',
        WIDTH, `32', `4294967295',
                     `eval((1<<WIDTH)-1)')')

define(`v8tov4', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
')

define(`v16tov8', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  $4 = shufflevector <16 x $1> $2, <16 x $1> undef,
       <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')

define(`v4tov2', `
  $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 0, i32 1>
  $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 2, i32 3>
')

define(`v8tov2', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 0, i32 1>
  $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 2, i32 3>
  $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 4, i32 5>
  $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 6, i32 7>
')

define(`v16tov4', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector assembly: wider vector from two narrower vectors
;;
;; $1: vector element type
;; $2: first n-wide vector
;; $3: second n-wide vector
;; $4: result 2*n-wide vector

define(`v8tov16', `
  $4 = shufflevector <8 x $1> $2, <8 x $1> $3,
       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Helper macro for calling various SSE instructions for scalar values
;; but where the instruction takes a vector parameter.
;; $1: name of variable to put the final value in
;; $2: vector width of the target
;; $3: scalar type of the operand
;; $4: SSE intrinsic name
;; $5: variable name that has the scalar value
;; For example, the following call causes the variable %ret to have
;; the result of a call to sqrtss with the scalar value in %0
;;
;;   sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)

define(`sse_unary_scalar', `
  %$1_vec = insertelement <$2 x $3> undef, $3 $5, i32 0
  %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_vec)
  %$1 = extractelement <$2 x $3> %$1_val, i32 0
')

;; Similar to `sse_unary_scalar', this helper macro is for calling binary
;; SSE instructions with scalar values.
;; $1: name of variable to put the result in
;; $2: vector width of the target
;; $3: scalar type of the operand
;; $4: SSE intrinsic name
;; $5: variable name that has the first scalar operand
;; $6: variable name that has the second scalar operand

define(`sse_binary_scalar', `
  %$1_veca = insertelement <$2 x $3> undef, $3 $5, i32 0
  %$1_vecb = insertelement <$2 x $3> undef, $3 $6, i32 0
  %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_veca, <$2 x $3> %$1_vecb)
  %$1 = extractelement <$2 x $3> %$1_val, i32 0
')

;; Do a reduction over a 4-wide vector
;; $1: type of final scalar result
;; $2: 4-wide function that takes 2 4-wide operands and returns the
;;     element-wise reduction
;; $3: scalar function that takes two scalar operands and returns
;;     the final reduction

define(`reduce4', `
  %v1 = shufflevector <4 x $1> %0, <4 x $1> undef,
                      <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %0)
  %m1a = extractelement <4 x $1> %m1, i32 0
  %m1b = extractelement <4 x $1> %m1, i32 1
  %m = call $1 $3($1 %m1a, $1 %m1b)
  ret $1 %m
'
)

;; Similar to `reduce4', do a reduction over an 8-wide vector
;; $1: type of final scalar result
;; $2: 8-wide function that takes 2 8-wide operands and returns the
;;     element-wise reduction
;; $3: scalar function that takes two scalar operands and returns
;;     the final reduction

define(`reduce8', `
  %v1 = shufflevector <8 x $1> %0, <8 x $1> undef,
        <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %m1 = call <8 x $1> $2(<8 x $1> %v1, <8 x $1> %0)
  %v2 = shufflevector <8 x $1> %m1, <8 x $1> undef,
        <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %m2 = call <8 x $1> $2(<8 x $1> %v2, <8 x $1> %m1)
  %m2a = extractelement <8 x $1> %m2, i32 0
  %m2b = extractelement <8 x $1> %m2, i32 1
  %m = call $1 $3($1 %m2a, $1 %m2b)
  ret $1 %m
'
)

define(`reduce16', `
  %v1 = shufflevector <16 x $1> %0, <16 x $1> undef,
        <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
                    i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %m1 = call <16 x $1> $2(<16 x $1> %v1, <16 x $1> %0)
  %v2 = shufflevector <16 x $1> %m1, <16 x $1> undef,
        <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef,
                    i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %m2 = call <16 x $1> $2(<16 x $1> %v2, <16 x $1> %m1)
  %v3 = shufflevector <16 x $1> %m2, <16 x $1> undef,
        <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
                    i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %m3 = call <16 x $1> $2(<16 x $1> %v3, <16 x $1> %m2)
  %m3a = extractelement <16 x $1> %m3, i32 0
  %m3b = extractelement <16 x $1> %m3, i32 1
  %m = call $1 $3($1 %m3a, $1 %m3b)
  ret $1 %m
'
)

;; Do a reduction over an 8-wide vector, using a vector reduction function
;; that only takes 4-wide vectors
;; $1: type of final scalar result
;; $2: 4-wide function that takes 2 4-wide operands and returns the
;;     element-wise reduction
;; $3: scalar function that takes two scalar operands and returns
;;     the final reduction

define(`reduce8by4', `
  v8tov4($1, %0, %v1, %v2)
  %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2)
  %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef,
        <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %m2 = call <4 x $1> $2(<4 x $1> %v3, <4 x $1> %m1)
  %m2a = extractelement <4 x $1> %m2, i32 0
  %m2b = extractelement <4 x $1> %m2, i32 1
  %m = call $1 $3($1 %m2a, $1 %m2b)
  ret $1 %m
'
)

;; Apply a unary
function to the 4-vector in %0, return the vector result. ;; $1: scalar type of result ;; $2: name of scalar function to call define(`unary1to4', ` %v_0 = extractelement <4 x $1> %0, i32 0 %r_0 = call $1 $2($1 %v_0) %ret_0 = insertelement <4 x $1> undef, $1 %r_0, i32 0 %v_1 = extractelement <4 x $1> %0, i32 1 %r_1 = call $1 $2($1 %v_1) %ret_1 = insertelement <4 x $1> %ret_0, $1 %r_1, i32 1 %v_2 = extractelement <4 x $1> %0, i32 2 %r_2 = call $1 $2($1 %v_2) %ret_2 = insertelement <4 x $1> %ret_1, $1 %r_2, i32 2 %v_3 = extractelement <4 x $1> %0, i32 3 %r_3 = call $1 $2($1 %v_3) %ret_3 = insertelement <4 x $1> %ret_2, $1 %r_3, i32 3 ret <4 x $1> %ret_3 ') define(`unary1to8', ` %v_0 = extractelement <8 x $1> %0, i32 0 %r_0 = call $1 $2($1 %v_0) %ret_0 = insertelement <8 x $1> undef, $1 %r_0, i32 0 %v_1 = extractelement <8 x $1> %0, i32 1 %r_1 = call $1 $2($1 %v_1) %ret_1 = insertelement <8 x $1> %ret_0, $1 %r_1, i32 1 %v_2 = extractelement <8 x $1> %0, i32 2 %r_2 = call $1 $2($1 %v_2) %ret_2 = insertelement <8 x $1> %ret_1, $1 %r_2, i32 2 %v_3 = extractelement <8 x $1> %0, i32 3 %r_3 = call $1 $2($1 %v_3) %ret_3 = insertelement <8 x $1> %ret_2, $1 %r_3, i32 3 %v_4 = extractelement <8 x $1> %0, i32 4 %r_4 = call $1 $2($1 %v_4) %ret_4 = insertelement <8 x $1> %ret_3, $1 %r_4, i32 4 %v_5 = extractelement <8 x $1> %0, i32 5 %r_5 = call $1 $2($1 %v_5) %ret_5 = insertelement <8 x $1> %ret_4, $1 %r_5, i32 5 %v_6 = extractelement <8 x $1> %0, i32 6 %r_6 = call $1 $2($1 %v_6) %ret_6 = insertelement <8 x $1> %ret_5, $1 %r_6, i32 6 %v_7 = extractelement <8 x $1> %0, i32 7 %r_7 = call $1 $2($1 %v_7) %ret_7 = insertelement <8 x $1> %ret_6, $1 %r_7, i32 7 ret <8 x $1> %ret_7 ') ;; Given a unary function that takes a 2-wide vector and a 4-wide vector ;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide ;; vector, apply it, and return the corresponding 4-wide vector result ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 2-wide unary vector function to apply ;; $4: 4-wide operand value define(`unary2to4', ` %$1_0 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) %$1_1 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) %$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> ' ) ;; Similar to `unary2to4', this applies a 2-wide binary function to two 4-wide ;; vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 2-wide binary vector function to apply ;; $4: First 4-wide operand value ;; $5: Second 4-wide operand value define(`binary2to4', ` %$1_0a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> %$1_0b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) %$1_1a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> %$1_1b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) %$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> ' ) ;; Similar to `unary2to4', this maps a 4-wide unary function to an 8-wide ;; vector operand ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 4-wide unary vector function to apply ;; $4: 8-wide operand value define(`unary4to8', ` %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> %__v$1_0 = call <4 x $2> $3(<4 x $2> 
%__$1_0) %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> ' ) ;; $1: name of variable into which the final result should go ;; $2: scalar type of the input vector elements ;; $3: scalar type of the result vector elements ;; $4: 4-wide unary vector function to apply ;; $5: 8-wide operand value define(`unary4to8conv', ` %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, <16 x i32> ' ) define(`unary4to16conv', ` %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, <8 x i32> %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' ) ;; And so forth... 
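;; As an illustration of how these mapping macros are used (this sketch is
;; not part of the library itself): assuming a hypothetical 4-wide helper
;; @__sqrt4f and an 8-wide float operand %v, the call
;; unary4to8(r, float, @__sqrt4f, %v) expands to roughly:
;;
;;   %__r_0 = shufflevector <8 x float> %v, <8 x float> undef,
;;            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;;   %__vr_0 = call <4 x float> @__sqrt4f(<4 x float> %__r_0)
;;   %__r_1 = shufflevector <8 x float> %v, <8 x float> undef,
;;            <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;;   %__vr_1 = call <4 x float> @__sqrt4f(<4 x float> %__r_1)
;;   %r = shufflevector <4 x float> %__vr_0, <4 x float> %__vr_1,
;;        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>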
;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 8-wide unary vector function to apply ;; $4: 16-wide operand value define(`unary8to16', ` %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0) %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1) %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, <16 x i32> ' ) ;; And along the lines of `binary2to4', this maps a 4-wide binary function to ;; two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 4-wide unary vector function to apply ;; $4: First 8-wide operand value ;; $5: Second 8-wide operand value define(`binary4to8', ` %$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> %$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) %$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> %$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, <8 x i32> ' ) define(`binary8to16', ` %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <8 x i32> %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b) %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <8 x i32> %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b) %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, <16 x i32> ' ) define(`binary4to16', ` %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %r$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %r$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %r$1_2 = call <4 x $2> $3(<4 x $2> %$1_2a, <4 x $2> %$1_2b) %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> %r$1_3 = call <4 x $2> $3(<4 x $2> %$1_3a, <4 x $2> %$1_3b) %r$1_01 = shufflevector <4 x $2> %r$1_0, <4 x $2> %r$1_1, <8 x i32> %r$1_23 = shufflevector <4 x $2> %r$1_2, <4 x $2> %r$1_3, <8 x i32> %$1 = shufflevector <8 x $2> %r$1_01, <8 x $2> %r$1_23, <16 x i32> ') ;; Maps a 2-wide unary function to an 8-wide vector operand, returning an ;; 8-wide vector result ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 2-wide unary vector function to apply ;; $4: 8-wide operand value define(`unary2to8', ` %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) %$1_2 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) %$1_3 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> 
%$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> ' ) define(`unary2to16', ` %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, <4 x i32> %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, <4 x i32> %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, <8 x i32> %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, <16 x i32> ' ) ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements ;; $3: 2-wide unary vector function to apply ;; $4: First 8-wide operand value ;; $5: Second 8-wide operand value define(`binary2to8', ` %$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) %$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) %$1_2a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %$1_2b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) %$1_3a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> ' ) define(`binary2to16', ` %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_4 = call <2 x $2> $3(<2 x $2> 
%$1_4a, <2 x $2> %$1_4b) %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, <4 x i32> %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, <4 x i32> %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, <8 x i32> %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, <16 x i32> ' ) ;; The unary SSE round intrinsic takes a second argument that encodes the ;; rounding mode. This macro makes it easier to apply the 4-wide roundps ;; to 8-wide vector operands ;; $1: value to be rounded ;; $2: integer encoding of rounding mode ;; FIXME: this just has a ret statement at the end to return the result, ;; which is inconsistent with the macros above define(`round4to8', ` %v0 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> %v1 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> %r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) %r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) %ret = shufflevector <4 x float> %r0, <4 x float> %r1, <8 x i32> ret <8 x float> %ret ' ) define(`round4to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> %v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> %v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> %v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> %r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) %r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) %r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) %r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) %ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, <8 x i32> %ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, <8 x i32> %ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, <16 x i32> ret <16 x float> %ret ' ) define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> %v1 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> %r0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v0, i32 $2) %r1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v1, i32 $2) %ret = shufflevector <8 x float> %r0, <8 x float> %r1, <16 x i32> ret <16 x float> %ret ' ) define(`round4to8double', ` %v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> %v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> %r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) %r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) %ret = shufflevector <4 x double> %r0, <4 x double> %r1, <8 x i32> ret <8 x double> %ret ' ) ; and similarly for doubles... 
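;; A note on the rounding-mode immediate passed as $2 to the round4to*/round8to*
;; macros (and to the double variants below): per the SSE4.1/AVX round
;; instruction encoding, bits 1:0 select the mode (0 = nearest, 1 = floor,
;; 2 = ceil, 3 = truncate) and bit 3 suppresses precision exceptions, so
;; callers typically pass 8, 9, or 10.  An 8-wide round-to-nearest builtin
;; might therefore look roughly like this (illustrative sketch only; the
;; function name is just an example):
;;
;;   define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
;;     round4to8(%0, 8)
;;   }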
define(`round2to4double', ` %v0 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> %v1 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> %r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) %r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) %ret = shufflevector <2 x double> %r0, <2 x double> %r1, <4 x i32> ret <4 x double> %ret ' ) define(`round2to8double', ` %v0 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> %v1 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> %v2 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> %v3 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> %r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) %r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) %r2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v2, i32 $2) %r3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v3, i32 $2) %ret0 = shufflevector <2 x double> %r0, <2 x double> %r1, <4 x i32> %ret1 = shufflevector <2 x double> %r2, <2 x double> %r3, <4 x i32> %ret = shufflevector <4 x double> %ret0, <4 x double> %ret1, <8 x i32> ret <8 x double> %ret ' ) define(`round4to16double', ` %v0 = shufflevector <16 x double> $1, <16 x double> undef, <4 x i32> %v1 = shufflevector <16 x double> $1, <16 x double> undef, <4 x i32> %v2 = shufflevector <16 x double> $1, <16 x double> undef, <4 x i32> %v3 = shufflevector <16 x double> $1, <16 x double> undef, <4 x i32> %r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) %r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) %r2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v2, i32 $2) %r3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v3, i32 $2) %ret0 = shufflevector <4 x double> %r0, <4 x double> %r1, <8 x i32> %ret1 = shufflevector <4 x double> %r2, <4 x double> %r3, <8 x i32> %ret = shufflevector <8 x double> %ret0, <8 x double> %ret1, <16 x i32> ret <16 x double> %ret ' ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; forloop macro divert(`-1') # forloop(var, from, to, stmt) - improved version: # works even if VAR is not a strict macro name # performs sanity check that FROM is larger than TO # allows complex numerical expressions in TO and FROM define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1', `pushdef(`$1', eval(`$2'))_$0(`$1', eval(`$3'), `$4')popdef(`$1')')') define(`_forloop', `$3`'ifelse(indir(`$1'), `$2', `', `define(`$1', incr(indir(`$1')))$0($@)')') divert`'dnl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stdlib_core ;; ;; This macro defines a bunch of helper routines that depend on the ;; target's vector width ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`shuffles', ` ') define(`define_shuffles',` shuffles(i8, 1) shuffles(i16, 2) shuffles(float, 4) shuffles(i32, 4) shuffles(double, 8) shuffles(i64, 8) ') define(`mask_converts', ` define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { %r = sext <$1 x i1> %0 to <$1 x i8> ret <$1 x i8> %r } define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { %r = sext <$1 x i1> %0 to <$1 x i16> ret <$1 x i16> %r } define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { %r = sext <$1 x i1> %0 to <$1 x i32> ret <$1 x i32> %r } define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { %r = sext <$1 x i1> %0 to <$1 x i64> ret <$1 x i64> %r } 
define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { ret <$1 x i8> %0 } define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { %r = sext <$1 x i8> %0 to <$1 x i16> ret <$1 x i16> %r } define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { %r = sext <$1 x i8> %0 to <$1 x i32> ret <$1 x i32> %r } define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { %r = sext <$1 x i8> %0 to <$1 x i64> ret <$1 x i64> %r } define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { %r = trunc <$1 x i16> %0 to <$1 x i8> ret <$1 x i8> %r } define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { ret <$1 x i16> %0 } define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { %r = sext <$1 x i16> %0 to <$1 x i32> ret <$1 x i32> %r } define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { %r = sext <$1 x i16> %0 to <$1 x i64> ret <$1 x i64> %r } define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { %r = trunc <$1 x i32> %0 to <$1 x i8> ret <$1 x i8> %r } define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { %r = trunc <$1 x i32> %0 to <$1 x i16> ret <$1 x i16> %r } define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { ret <$1 x i32> %0 } define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { %r = sext <$1 x i32> %0 to <$1 x i64> ret <$1 x i64> %r } define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) { %r = trunc <$1 x i64> %0 to <$1 x i8> ret <$1 x i8> %r } define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) { %r = trunc <$1 x i64> %0 to <$1 x i16> ret <$1 x i16> %r } define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) { %r = trunc <$1 x i64> %0 to <$1 x i32> ret <$1 x i32> %r } define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) { ret <$1 x i64> %0 } ') mask_converts(WIDTH) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; count trailing zeros define(`ctlztz', ` declare_count_zeros() define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { %c = call i32 @llvm.cttz.i32(i32 %0) ret i32 %c } define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { %c = call i64 @llvm.cttz.i64(i64 %0) ret i64 %c } define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { %c = call i32 @llvm.ctlz.i32(i32 %0) ret i32 %c } define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { %c = call i64 @llvm.ctlz.i64(i64 %0) ret i64 %c } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; prefetching define(`define_prefetches', ` declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, i32 %cachetype) ; cachetype == 1 is dcache define void @__prefetch_read_uniform_1(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) ret void } define void @__prefetch_read_uniform_2(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) ret void } define void @__prefetch_read_uniform_3(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) ret void } define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) ret void } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; AOS/SOA conversion primitives ;; take 4 4-wide vectors laid out like ... ;; and reorder them to ... 
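;; (i.e., given the four AOS input vectors <x0 y0 z0 w0> <x1 y1 z1 w1>
;; <x2 y2 z2 w2> <x3 y3 z3 w3>, produce the SOA vectors <x0 x1 x2 x3>
;; <y0 y1 y2 y3> <z0 z1 z2 z3> <w0 w1 w2 w3> -- a 4x4 transpose.)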
define(`aossoa', ` declare void @__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> * noalias %out0, <4 x float> * noalias %out1, <4 x float> * noalias %out2, <4 x float> * noalias %out3) nounwind alwaysinline ; ;; Do the reverse of __aos_to_soa4_float4--reorder .. ;; to ... ;; This is the exact same set of operations that __soa_to_soa4_float4 does ;; (a 4x4 transpose), so just call that... declare void @__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> * noalias %out0, <4 x float> * noalias %out1, <4 x float> * noalias %out2, <4 x float> * noalias %out3) nounwind alwaysinline; ;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors ;; , transpose to ;; . declare void @__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> * noalias %out0, <4 x float> * noalias %out1, <4 x float> * noalias %out2) nounwind alwaysinline ;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors ;; to ;; . declare void @__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> * noalias %out0, <4 x float> * noalias %out1, <4 x float> * noalias %out2) nounwind alwaysinline ;; 8-wide ;; These functions implement the 8-wide variants of the AOS/SOA conversion ;; routines above. These implementations are all built on top of the 4-wide ;; vector versions. declare void @__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, <8 x float> %v3, <8 x float> * noalias %out0, <8 x float> * noalias %out1, <8 x float> * noalias %out2, <8 x float> * noalias %out3) nounwind alwaysinline declare void @__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, <8 x float> %v3, <8 x float> * noalias %out0, <8 x float> * noalias %out1, <8 x float> * noalias %out2, <8 x float> * noalias %out3) nounwind alwaysinline declare void @__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, <8 x float> * noalias %out0, <8 x float> * noalias %out1, <8 x float> * noalias %out2) nounwind alwaysinline ; declare void @__soa_to_aos3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, <8 x float> * noalias %out0, <8 x float> * noalias %out1, <8 x float> * noalias %out2) nounwind alwaysinline ; ;; 16-wide declare void @__aos_to_soa4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, <16 x float> %v3, <16 x float> * noalias %out0, <16 x float> * noalias %out1, <16 x float> * noalias %out2, <16 x float> * noalias %out3) nounwind alwaysinline ; declare void @__soa_to_aos4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, <16 x float> %v3, <16 x float> * noalias %out0, <16 x float> * noalias %out1, <16 x float> * noalias %out2, <16 x float> * noalias %out3) nounwind alwaysinline ; declare void @__aos_to_soa3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, <16 x float> * noalias %out0, <16 x float> * noalias %out1, <16 x float> * noalias %out2) nounwind alwaysinline ; declare void @__soa_to_aos3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, <16 x float> * noalias %out0, <16 x float> * noalias %out1, <16 x float> * noalias %out2) nounwind alwaysinline ; ;; versions to be called from stdlib declare void @__aos_to_soa4_float(float * noalias %p, * noalias %out0, * noalias %out1, * noalias %out2, * noalias %out3) nounwind alwaysinline ; declare void @__soa_to_aos4_float( %v0, %v1, %v2, %v3, float * noalias %p) nounwind alwaysinline ; declare void @__aos_to_soa3_float(float * noalias %p, 
* %out0, * %out1, * %out2) nounwind alwaysinline ; declare void @__soa_to_aos3_float( %v0, %v1, %v2, float * noalias %p) nounwind alwaysinline ; ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`masked_load_float_double', ` define @__masked_load_float(i8 * %ptr, %mask) readonly alwaysinline { %v32 = call @__masked_load_i32(i8 * %ptr, %mask) %vf = bitcast %v32 to ret %vf } define @__masked_load_double(i8 * %ptr, %mask) readonly alwaysinline { %v64 = call @__masked_load_i64(i8 * %ptr, %mask) %vd = bitcast %v64 to ret %vd } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`masked_store_float_double', ` define void @__masked_store_float( * nocapture, , ) nounwind alwaysinline { %ptr = bitcast * %0 to * %val = bitcast %1 to call void @__masked_store_i32( * %ptr, %val, %2) ret void } define void @__masked_store_double( * nocapture, , ) nounwind alwaysinline { %ptr = bitcast * %0 to * %val = bitcast %1 to call void @__masked_store_i64( * %ptr, %val, %2) ret void } define void @__masked_store_blend_float( * nocapture, , ) nounwind alwaysinline { %ptr = bitcast * %0 to * %val = bitcast %1 to call void @__masked_store_blend_i32( * %ptr, %val, %2) ret void } define void @__masked_store_blend_double( * nocapture, , ) nounwind alwaysinline { %ptr = bitcast * %0 to * %val = bitcast %1 to call void @__masked_store_blend_i64( * %ptr, %val, %2) ret void } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`stdlib_core', ` declare i32 @__fast_masked_vload() declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() ; This function declares placeholder masked store functions for the ; front-end to use. ; ; void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask) ; void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask) ; void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask) ; void __pseudo_masked_store_float(uniform float *ptr, varying float values, mask) ; void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask) ; void __pseudo_masked_store_double(uniform double *ptr, varying double values, mask) ; ; These in turn are converted to native masked stores or to regular ; stores (if the mask is all on) by the MaskedStoreOptPass optimization ; pass. declare void @__pseudo_masked_store_i8( * nocapture, , ) declare void @__pseudo_masked_store_i16( * nocapture, , ) declare void @__pseudo_masked_store_i32( * nocapture, , ) declare void @__pseudo_masked_store_float( * nocapture, , ) declare void @__pseudo_masked_store_i64( * nocapture, , ) declare void @__pseudo_masked_store_double( * nocapture, , ) ; Declare the pseudo-gather functions. 
; When the ispc front-end needs to perform a gather, it generates a call to
; one of these functions, which ideally have these signatures:
;
;   varying int8   __pseudo_gather_i8(varying int8 *, mask)
;   varying int16  __pseudo_gather_i16(varying int16 *, mask)
;   varying int32  __pseudo_gather_i32(varying int32 *, mask)
;   varying float  __pseudo_gather_float(varying float *, mask)
;   varying int64  __pseudo_gather_i64(varying int64 *, mask)
;   varying double __pseudo_gather_double(varying double *, mask)
;
; However, vectors of pointers were not legal in LLVM until recently, so
; instead it emits calls to functions that take vectors of either int32s
; or int64s, depending on the compilation target.

declare <WIDTH x i8> @__pseudo_gather32_i8(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather32_i16(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather32_i32(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather32_float(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather32_i64(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather32_double(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8> @__pseudo_gather64_i8(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather64_i16(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather64_i32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather64_float(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather64_i64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather64_double(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly

; The ImproveMemoryOps optimization pass finds these calls and then
; tries to convert them into calls to gather functions that take a uniform
; base pointer and then a varying integer offset, when possible.
;
; For targets without a native gather instruction, it is best to factor the
; integer offsets like "{1/2/4/8} * varying_offset + constant_offset", where
; varying_offset includes non-compile-time-constant values and constant_offset
; includes compile-time constant values.  (The scalar loads generated in turn
; can then take advantage of the free offsetting and scaling by 1/2/4/8 that
; is offered by the x86 addressing modes.)
;
;   varying int{8,16,32,float,64,double}
;   __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
;                    int{32,64} offsets, uniform int32 offset_scale,
;                    int{32,64} offset_delta, mask)
;
; For targets with a gather instruction, it is better to just factor them into
; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where
; the offsets are int32/64 vectors.
; ; varying int{8,16,32,float,64,double} ; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, ; uniform int32 offset_scale, int{32,64} offsets, mask) declare @__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets32_i8(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets32_i16(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets32_i32(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets32_float(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets32_i64(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets32_double(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets64_i8(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets64_i16(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets64_i32(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets64_float(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets64_i64(i8 *, i32, , ) nounwind readonly declare @__pseudo_gather_base_offsets64_double(i8 *, i32, , ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: ; ; void __pseudo_scatter_i8 (varying int8 *, varying int8 values, mask) ; void __pseudo_scatter_i16(varying int16 *, varying int16 values, mask) ; void __pseudo_scatter_i32(varying int32 *, varying int32 values, mask) ; void __pseudo_scatter_float(varying float *, varying float values, mask) ; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask) ; void __pseudo_scatter_double(varying double *, varying double values, mask) ; declare void @__pseudo_scatter32_i8(, , ) nounwind declare void @__pseudo_scatter32_i16(, , ) nounwind declare void @__pseudo_scatter32_i32(, , ) nounwind declare void @__pseudo_scatter32_float(, , ) nounwind declare void @__pseudo_scatter32_i64(, , ) nounwind declare void @__pseudo_scatter32_double(, , ) nounwind declare void @__pseudo_scatter64_i8(, , ) nounwind declare void @__pseudo_scatter64_i16(, , ) nounwind declare void @__pseudo_scatter64_i32(, , ) nounwind declare void @__pseudo_scatter64_float(, , ) nounwind declare void @__pseudo_scatter64_i64(, , ) nounwind declare void @__pseudo_scatter64_double(, , ) nounwind ; And the ImproveMemoryOps optimization pass also finds these and ; either transforms them to scatters like: ; ; void 
__pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, ; varying int32 offsets, uniform int32 offset_scale, ; varying int{32,64} offset_delta, varying int8 values, mask) ; (and similarly for 16/32/64 bit values) ; ; Or, if the target has a native scatter instruction: ; ; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base, ; uniform int32 offset_scale, varying int{32,64} offsets, ; varying int8 values, mask) ; (and similarly for 16/32/64 bit values) declare void @__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, , , ) nounwind declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , , ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; declare void @__use8() declare void @__use16() declare void @__use32() declare void @__usefloat() declare void @__use64() declare void @__usedouble() ;; This is a temporary function that will be removed at the end of ;; compilation--the idea is that it calls out to all of the various ;; functions / pseudo-function declarations that we need to keep around ;; so that they are available to the various optimization passes. This ;; then prevents those functions from being removed as dead code when ;; we do early DCE... 
define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %v32, %v64, %mask) { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; loads %ml8 = call @__masked_load_i8(i8 * %ptr, %mask) call void @__use8( %ml8) %ml16 = call @__masked_load_i16(i8 * %ptr, %mask) call void @__use16( %ml16) %ml32 = call @__masked_load_i32(i8 * %ptr, %mask) call void @__use32( %ml32) %mlf = call @__masked_load_float(i8 * %ptr, %mask) call void @__usefloat( %mlf) %ml64 = call @__masked_load_i64(i8 * %ptr, %mask) call void @__use64( %ml64) %mld = call @__masked_load_double(i8 * %ptr, %mask) call void @__usedouble( %mld) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stores %pv8 = bitcast i8 * %ptr to * call void @__pseudo_masked_store_i8( * %pv8, %v8, %mask) %pv16 = bitcast i8 * %ptr to * call void @__pseudo_masked_store_i16( * %pv16, %v16, %mask) %pv32 = bitcast i8 * %ptr to * call void @__pseudo_masked_store_i32( * %pv32, %v32, %mask) %vf = bitcast %v32 to %pvf = bitcast i8 * %ptr to * call void @__pseudo_masked_store_float( * %pvf, %vf, %mask) %pv64 = bitcast i8 * %ptr to * call void @__pseudo_masked_store_i64( * %pv64, %v64, %mask) %vd = bitcast %v64 to %pvd = bitcast i8 * %ptr to * call void @__pseudo_masked_store_double( * %pvd, %vd, %mask) call void @__masked_store_i8( * %pv8, %v8, %mask) call void @__masked_store_i16( * %pv16, %v16, %mask) call void @__masked_store_i32( * %pv32, %v32, %mask) call void @__masked_store_float( * %pvf, %vf, %mask) call void @__masked_store_i64( * %pv64, %v64, %mask) call void @__masked_store_double( * %pvd, %vd, %mask) call void @__masked_store_blend_i8( * %pv8, %v8, %mask) call void @__masked_store_blend_i16( * %pv16, %v16, %mask) call void @__masked_store_blend_i32( * %pv32, %v32, %mask) call void @__masked_store_blend_float( * %pvf, %vf, %mask) call void @__masked_store_blend_i64( * %pv64, %v64, %mask) call void @__masked_store_blend_double( * %pvd, %vd, %mask) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gathers %pg32_8 = call @__pseudo_gather32_i8( %v32, %mask) call void @__use8( %pg32_8) %pg32_16 = call @__pseudo_gather32_i16( %v32, %mask) call void @__use16( %pg32_16) %pg32_32 = call @__pseudo_gather32_i32( %v32, %mask) call void @__use32( %pg32_32) %pg32_f = call @__pseudo_gather32_float( %v32, %mask) call void @__usefloat( %pg32_f) %pg32_64 = call @__pseudo_gather32_i64( %v32, %mask) call void @__use64( %pg32_64) %pg32_d = call @__pseudo_gather32_double( %v32, %mask) call void @__usedouble( %pg32_d) %pg64_8 = call @__pseudo_gather64_i8( %v64, %mask) call void @__use8( %pg64_8) %pg64_16 = call @__pseudo_gather64_i16( %v64, %mask) call void @__use16( %pg64_16) %pg64_32 = call @__pseudo_gather64_i32( %v64, %mask) call void @__use32( %pg64_32) %pg64_f = call @__pseudo_gather64_float( %v64, %mask) call void @__usefloat( %pg64_f) %pg64_64 = call @__pseudo_gather64_i64( %v64, %mask) call void @__use64( %pg64_64) %pg64_d = call @__pseudo_gather64_double( %v64, %mask) call void @__usedouble( %pg64_d) %g32_8 = call @__gather32_i8( %v32, %mask) call void @__use8( %g32_8) %g32_16 = call @__gather32_i16( %v32, %mask) call void @__use16( %g32_16) %g32_32 = call @__gather32_i32( %v32, %mask) call void @__use32( %g32_32) %g32_f = call @__gather32_float( %v32, %mask) call void @__usefloat( %g32_f) %g32_64 = call @__gather32_i64( %v32, %mask) call void @__use64( %g32_64) %g32_d = call @__gather32_double( %v32, %mask) call void @__usedouble( %g32_d) %g64_8 = call @__gather64_i8( %v64, %mask) call 
void @__use8( %g64_8) %g64_16 = call @__gather64_i16( %v64, %mask) call void @__use16( %g64_16) %g64_32 = call @__gather64_i32( %v64, %mask) call void @__use32( %g64_32) %g64_f = call @__gather64_float( %v64, %mask) call void @__usefloat( %g64_f) %g64_64 = call @__gather64_i64( %v64, %mask) call void @__use64( %g64_64) %g64_d = call @__gather64_double( %v64, %mask) call void @__usedouble( %g64_d) ifelse(HAVE_GATHER, `1', ` %nfpgbo32_8 = call @__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0, %v32, %mask) call void @__use8( %nfpgbo32_8) %nfpgbo32_16 = call @__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0, %v32, %mask) call void @__use16( %nfpgbo32_16) %nfpgbo32_32 = call @__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0, %v32, %mask) call void @__use32( %nfpgbo32_32) %nfpgbo32_f = call @__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0, %v32, %mask) call void @__usefloat( %nfpgbo32_f) %nfpgbo32_64 = call @__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0, %v32, %mask) call void @__use64( %nfpgbo32_64) %nfpgbo32_d = call @__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0, %v32, %mask) call void @__usedouble( %nfpgbo32_d) %nfpgbo64_8 = call @__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0, %v64, %mask) call void @__use8( %nfpgbo64_8) %nfpgbo64_16 = call @__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0, %v64, %mask) call void @__use16( %nfpgbo64_16) %nfpgbo64_32 = call @__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0, %v64, %mask) call void @__use32( %nfpgbo64_32) %nfpgbo64_f = call @__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0, %v64, %mask) call void @__usefloat( %nfpgbo64_f) %nfpgbo64_64 = call @__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0, %v64, %mask) call void @__use64( %nfpgbo64_64) %nfpgbo64_d = call @__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0, %v64, %mask) call void @__usedouble( %nfpgbo64_d) %nfgbo32_8 = call @__gather_base_offsets32_i8(i8 * %ptr, i32 0, %v32, %mask) call void @__use8( %nfgbo32_8) %nfgbo32_16 = call @__gather_base_offsets32_i16(i8 * %ptr, i32 0, %v32, %mask) call void @__use16( %nfgbo32_16) %nfgbo32_32 = call @__gather_base_offsets32_i32(i8 * %ptr, i32 0, %v32, %mask) call void @__use32( %nfgbo32_32) %nfgbo32_f = call @__gather_base_offsets32_float(i8 * %ptr, i32 0, %v32, %mask) call void @__usefloat( %nfgbo32_f) %nfgbo32_64 = call @__gather_base_offsets32_i64(i8 * %ptr, i32 0, %v32, %mask) call void @__use64( %nfgbo32_64) %nfgbo32_d = call @__gather_base_offsets32_double(i8 * %ptr, i32 0, %v32, %mask) call void @__usedouble( %nfgbo32_d) %nfgbo64_8 = call @__gather_base_offsets64_i8(i8 * %ptr, i32 0, %v64, %mask) call void @__use8( %nfgbo64_8) %nfgbo64_16 = call @__gather_base_offsets64_i16(i8 * %ptr, i32 0, %v64, %mask) call void @__use16( %nfgbo64_16) %nfgbo64_32 = call @__gather_base_offsets64_i32(i8 * %ptr, i32 0, %v64, %mask) call void @__use32( %nfgbo64_32) %nfgbo64_f = call @__gather_base_offsets64_float(i8 * %ptr, i32 0, %v64, %mask) call void @__usefloat( %nfgbo64_f) %nfgbo64_64 = call @__gather_base_offsets64_i64(i8 * %ptr, i32 0, %v64, %mask) call void @__use64( %nfgbo64_64) %nfgbo64_d = call @__gather_base_offsets64_double(i8 * %ptr, i32 0, %v64, %mask) call void @__usedouble( %nfgbo64_d) ', ` %pgbo32_8 = call @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use8( %pgbo32_8) %pgbo32_16 = call @__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use16( %pgbo32_16) %pgbo32_32 = call 
@__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use32( %pgbo32_32) %pgbo32_f = call @__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usefloat( %pgbo32_f) %pgbo32_64 = call @__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use64( %pgbo32_64) %pgbo32_d = call @__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usedouble( %pgbo32_d) %pgbo64_8 = call @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use8( %pgbo64_8) %pgbo64_16 = call @__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use16( %pgbo64_16) %pgbo64_32 = call @__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use32( %pgbo64_32) %pgbo64_f = call @__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usefloat( %pgbo64_f) %pgbo64_64 = call @__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use64( %pgbo64_64) %pgbo64_d = call @__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usedouble( %pgbo64_d) %gbo32_8 = call @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use8( %gbo32_8) %gbo32_16 = call @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use16( %gbo32_16) %gbo32_32 = call @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use32( %gbo32_32) %gbo32_f = call @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usefloat( %gbo32_f) %gbo32_64 = call @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use64( %gbo32_64) %gbo32_d = call @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usedouble( %gbo32_d) %gbo64_8 = call @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use8( %gbo64_8) %gbo64_16 = call @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use16( %gbo64_16) %gbo64_32 = call @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use32( %gbo64_32) %gbo64_f = call @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usefloat( %gbo64_f) %gbo64_64 = call @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use64( %gbo64_64) %gbo64_d = call @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usedouble( %pgbo64_d) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; scatters call void @__pseudo_scatter32_i8( %v32, %v8, %mask) call void @__pseudo_scatter32_i16( %v32, %v16, %mask) call void @__pseudo_scatter32_i32( %v32, %v32, %mask) call void @__pseudo_scatter32_float( %v32, %vf, %mask) call void @__pseudo_scatter32_i64( %v32, %v64, %mask) call void @__pseudo_scatter32_double( %v32, %vd, %mask) call void @__pseudo_scatter64_i8( %v64, %v8, %mask) call void @__pseudo_scatter64_i16( %v64, %v16, %mask) call void @__pseudo_scatter64_i32( %v64, %v32, %mask) call void @__pseudo_scatter64_float( %v64, %vf, %mask) call void @__pseudo_scatter64_i64( %v64, %v64, %mask) call void @__pseudo_scatter64_double( %v64, %vd, %mask) call void 
@__scatter32_i8( %v32, %v8, %mask) call void @__scatter32_i16( %v32, %v16, %mask) call void @__scatter32_i32( %v32, %v32, %mask) call void @__scatter32_float( %v32, %vf, %mask) call void @__scatter32_i64( %v32, %v64, %mask) call void @__scatter32_double( %v32, %vd, %mask) call void @__scatter64_i8( %v64, %v8, %mask) call void @__scatter64_i16( %v64, %v16, %mask) call void @__scatter64_i32( %v64, %v32, %mask) call void @__scatter64_float( %v64, %vf, %mask) call void @__scatter64_i64( %v64, %v64, %mask) call void @__scatter64_double( %v64, %vd, %mask) ifelse(HAVE_SCATTER, `1', ` call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, %v8, %mask) call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, %v16, %mask) call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, %v32, %mask) call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, %vf, %mask) call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, %v64, %mask) call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, %vd, %mask) call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, %v8, %mask) call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, %v16, %mask) call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, %v32, %mask) call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, %vf, %mask) call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, %v64, %mask) call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, %vd, %mask) call void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, %v8, %mask) call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, %v16, %mask) call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, %v32, %mask) call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, %vf, %mask) call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, %v64, %mask) call void @__scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, %vd, %mask) call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, %v8, %mask) call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, %v16, %mask) call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, %v32, %mask) call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, %vf, %mask) call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, %v64, %mask) call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, %vd, %mask) ', ` call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %v16, %mask) call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %v32, %mask) call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %vf, %mask) call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %v64, %mask) call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %vd, %mask) call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %v8, %mask) call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %v16, %mask) call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %v32, %mask) call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %vf, %mask) call void 
@__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %v64, %mask) call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %vd, %mask) call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %v16, %mask) call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %v32, %mask) call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %vf, %mask) call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %v64, %mask) call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %vd, %mask) call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %v8, %mask) call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %v16, %mask) call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %v32, %mask) call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %vf, %mask) call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %v64, %mask) call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %vd, %mask) ') ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; various bitcasts from one type to another define @__intbits_varying_float() nounwind readnone alwaysinline { %float_to_int_bitcast = bitcast %0 to ret %float_to_int_bitcast } define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { %float_to_int_bitcast = bitcast float %0 to i32 ret i32 %float_to_int_bitcast } define @__intbits_varying_double() nounwind readnone alwaysinline { %double_to_int_bitcast = bitcast %0 to ret %double_to_int_bitcast } define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { %double_to_int_bitcast = bitcast double %0 to i64 ret i64 %double_to_int_bitcast } define @__floatbits_varying_int32() nounwind readnone alwaysinline { %int_to_float_bitcast = bitcast %0 to ret %int_to_float_bitcast } define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { %int_to_float_bitcast = bitcast i32 %0 to float ret float %int_to_float_bitcast } define @__doublebits_varying_int64() nounwind readnone alwaysinline { %int_to_double_bitcast = bitcast %0 to ret %int_to_double_bitcast } define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { %int_to_double_bitcast = bitcast i64 %0 to double ret double %int_to_double_bitcast } define @__undef_varying() nounwind readnone alwaysinline { ret undef } define float @__undef_uniform() nounwind readnone alwaysinline { ret float undef } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; sign extension define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { %r = sext i1 %0 to i32 ret i32 %r } define @__sext_varying_bool() nounwind readnone alwaysinline { ;; ifelse(MASK,i32, `ret %0', ;; `%se = sext %0 to ;; ret %se') ifelse(MASK,i32, `%se = bitcast %0 to ', MASK,i64, `%se = trunc %0 to ', `%se = sext %0 to ') ret %se } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i32 %align, i1 %isvolatile) declare void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 %len, i32 %align, i1 %isvolatile) declare void @__memcpy32(i8 * %dst, i8 * %src, i32 
%len) alwaysinline;
declare void @__memcpy64(i8 * %dst, i8 * %src, i64 %len) alwaysinline;
declare void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i32 %align, i1 %isvolatile)
declare void @llvm.memmove.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 %len, i32 %align, i1 %isvolatile)
declare void @__memmove32(i8 * %dst, i8 * %src, i32 %len) alwaysinline;
declare void @__memmove64(i8 * %dst, i8 * %src, i64 %len) alwaysinline;
declare void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 %len, i32 %align, i1 %isvolatile)
declare void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %len, i32 %align, i1 %isvolatile)
declare void @__memset32(i8 * %dst, i8 %val, i32 %len) alwaysinline;
declare void @__memset64(i8 * %dst, i8 %val, i64 %len) alwaysinline;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; new/delete

;; Set of functions for the 32- and 64-bit runtimes.
;; They are different for Windows and Unix (Linux/MacOS):
;; on Windows we have to use _aligned_malloc/_aligned_free,
;; while on Unix we use posix_memalign/free.
;;
;; Note that these should really be two separate libraries, one for the
;; 32-bit environment and one for the 64-bit environment; that split should
;; happen sooner or later.

ifelse(WIDTH, 1, `define(`ALIGNMENT', `16')', `define(`ALIGNMENT', `eval(WIDTH*4)')')
@memory_alignment = internal constant i32 ALIGNMENT

ifelse(BUILD_OS, `UNIX', `
ifelse(RUNTIME, `32', `

;; Unix 32 bit environment.
;; Use: posix_memalign and free
;; Define:
;; - __new_uniform_32rt
;; - __new_varying32_32rt
;; - __delete_uniform_32rt
;; - __delete_varying_32rt

declare i8* @malloc(i32)
declare i32 @posix_memalign(i8**, i32, i32)
declare void @free(i8 *)
declare noalias i8 * @__new_uniform_32rt(i64 %size);
declare @__new_varying32_32rt( %size, %mask);
declare void @__delete_uniform_32rt(i8 * %ptr);
declare void @__delete_varying_32rt( %ptr, %mask);

', RUNTIME, `64', `

;; Unix 64 bit environment.
;; Use: posix_memalign and free ;; Define: ;; - __new_uniform_64rt ;; - __new_varying32_64rt ;; - __new_varying64_64rt ;; - __delete_uniform_64rt ;; - __delete_varying_64rt declare i8* @malloc(i64) declare void @free(i8 *) define noalias i8 * @__new_uniform_64rt(i64 %size) { entry: ;; compute laneIdx = __tid_x() & (__warpsize() - 1) %and = call i32 @__program_index() ;; if (laneIdx == 0) %cmp = icmp eq i32 %and, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry %call2 = tail call noalias i8* @malloc(i64 %size) %phitmp = ptrtoint i8* %call2 to i64 br label %if.end if.end: ; preds = %if.then, %entry %ptr.0 = phi i64 [ %phitmp, %if.then ], [ undef, %entry ] %val.sroa.0.0.extract.trunc = trunc i64 %ptr.0 to i32 %call3 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.0.extract.trunc, i32 0) %val.sroa.0.0.insert.ext = zext i32 %call3 to i64 %val.sroa.0.4.extract.shift = lshr i64 %ptr.0, 32 %val.sroa.0.4.extract.trunc = trunc i64 %val.sroa.0.4.extract.shift to i32 %call8 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.4.extract.trunc, i32 0) %val.sroa.0.4.insert.ext = zext i32 %call8 to i64 %val.sroa.0.4.insert.shift = shl nuw i64 %val.sroa.0.4.insert.ext, 32 %val.sroa.0.4.insert.insert = or i64 %val.sroa.0.4.insert.shift, %val.sroa.0.0.insert.ext %0 = inttoptr i64 %val.sroa.0.4.insert.insert to i8* ret i8* %0 } define void @__delete_uniform_64rt(i8 * %ptr) { entry: %and = call i32 @__program_index() %cmp = icmp eq i32 %and, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry tail call void @free(i8* %ptr) br label %if.end if.end: ; preds = %if.then, %entry ret void } define <1 x i64> @__new_varying32_64rt(<1 x i32> %sizev, <1 x i1> %maskv) { entry: %size32 = extractelement <1 x i32> %sizev, i32 0 %mask = extractelement <1 x i1> %maskv, i32 0 %size64 = zext i32 %size32 to i64 br i1 %mask, label %alloc, label %skip alloc: %ptr = tail call noalias i8* @malloc(i64 %size64) %addr1 = ptrtoint i8* %ptr to i64 br label %skip skip: %addr64 = phi i64 [ %addr1, %alloc], [ 0, %entry ] %addr = insertelement <1 x i64> undef, i64 %addr64, i32 0 ret <1 x i64> %addr } define <1 x i64> @__new_varying64_64rt(<1 x i64> %sizev, <1 x i1> %maskv) { entry: %size64 = extractelement <1 x i64> %sizev, i32 0 %mask = extractelement <1 x i1> %maskv, i32 0 br i1 %mask, label %alloc, label %skip alloc: %ptr = tail call noalias i8* @malloc(i64 %size64) %addr1 = ptrtoint i8* %ptr to i64 br label %skip skip: %addr64 = phi i64 [ %addr1, %alloc], [ 0, %entry ] %addr = insertelement <1 x i64> undef, i64 %addr64, i32 0 ret <1 x i64> %addr } define void @__delete_varying_64rt(<1 x i64> %ptrv, <1 x i1> %maskv) { entry: %addr64 = extractelement <1 x i64> %ptrv, i32 0 %mask = extractelement <1 x i1> %maskv, i32 0 br i1 %mask, label %free, label %skip free: %ptr = inttoptr i64 %addr64 to i8* tail call void @free(i8* %ptr) br label %skip skip: ret void } ', ` errprint(`RUNTIME should be defined to either 32 or 64 ') m4exit(`1') ') ', BUILD_OS, `WINDOWS', ` ifelse(RUNTIME, `32', ` ;; Windows 32 bit environment. 
;; Use: _aligned_malloc and _aligned_free ;; Define: ;; - __new_uniform_32rt ;; - __new_varying32_32rt ;; - __delete_uniform_32rt ;; - __delete_varying_32rt declare i8* @_aligned_malloc(i32, i32) declare void @_aligned_free(i8 *) define noalias i8 * @__new_uniform_32rt(i64 %size) { %conv = trunc i64 %size to i32 %alignment = load PTR_OP_ARGS(`i32') @memory_alignment %ptr = tail call i8* @_aligned_malloc(i32 %conv, i32 %alignment) ret i8* %ptr } define @__new_varying32_32rt( %size, %mask) { %ret = alloca store zeroinitializer, * %ret %ret64 = bitcast * %ret to i64 * %alignment = load PTR_OP_ARGS(`i32') @memory_alignment per_lane(WIDTH, %mask, ` %sz_LANE_ID = extractelement %size, i32 LANE %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i32 %sz_LANE_ID, i32 %alignment) %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64') %ret64, i32 LANE store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') %r = load PTR_OP_ARGS(` ') %ret ret %r } define void @__delete_uniform_32rt(i8 * %ptr) { call void @_aligned_free(i8 * %ptr) ret void } define void @__delete_varying_32rt( %ptr, %mask) { per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptr, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 * call void @_aligned_free(i8 * %ptr_LANE_ID) ') ret void } ', RUNTIME, `64', ` ;; Windows 64 bit environment. ;; Use: _aligned_malloc and _aligned_free ;; Define: ;; - __new_uniform_64rt ;; - __new_varying32_64rt ;; - __new_varying64_64rt ;; - __delete_uniform_64rt ;; - __delete_varying_64rt declare i8* @_aligned_malloc(i64, i64) declare void @_aligned_free(i8 *) define noalias i8 * @__new_uniform_64rt(i64 %size) { %alignment = load PTR_OP_ARGS(`i32') @memory_alignment %alignment64 = sext i32 %alignment to i64 %ptr = tail call i8* @_aligned_malloc(i64 %size, i64 %alignment64) ret i8* %ptr } define @__new_varying32_64rt( %size, %mask) { %ret = alloca store zeroinitializer, * %ret %ret64 = bitcast * %ret to i64 * %alignment = load PTR_OP_ARGS(`i32') @memory_alignment %alignment64 = sext i32 %alignment to i64 per_lane(WIDTH, %mask, ` %sz_LANE_ID = extractelement %size, i32 LANE %sz64_LANE_ID = zext i32 %sz_LANE_ID to i64 %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64') %ret64, i32 LANE store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') %r = load PTR_OP_ARGS(` ') %ret ret %r } define @__new_varying64_64rt( %size, %mask) { %ret = alloca store zeroinitializer, * %ret %ret64 = bitcast * %ret to i64 * %alignment = load PTR_OP_ARGS(`i32') @memory_alignment %alignment64 = sext i32 %alignment to i64 per_lane(WIDTH, %mask, ` %sz64_LANE_ID = extractelement %size, i32 LANE %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64') %ret64, i32 LANE store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') %r = load PTR_OP_ARGS(` ') %ret ret %r } define void @__delete_uniform_64rt(i8 * %ptr) { call void @_aligned_free(i8 * %ptr) ret void } define void @__delete_varying_64rt( %ptr, %mask) { per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptr, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 * call void @_aligned_free(i8 * %ptr_LANE_ID) ') ret void } ', ` errprint(`RUNTIME should be defined to either 32 or 64 ') m4exit(`1') ') ', ` errprint(`BUILD_OS should be defined to either UNIX or WINDOWS ') 
m4exit(`1') ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stdlib transcendentals ;; ;; These functions provide entrypoints that call out to the libm ;; implementations of the transcendental functions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; declare float @sinf(float) nounwind readnone declare float @cosf(float) nounwind readnone declare void @sincosf(float, float *, float *) nounwind declare float @asinf(float) nounwind readnone declare float @acosf(float) nounwind readnone declare float @tanf(float) nounwind readnone declare float @atanf(float) nounwind readnone declare float @atan2f(float, float) nounwind readnone declare float @expf(float) nounwind readnone declare float @logf(float) nounwind readnone declare float @powf(float, float) nounwind readnone define float @__stdlib_sinf(float) nounwind readnone alwaysinline { %r = call float @sinf(float %0) ret float %r } define float @__stdlib_cosf(float) nounwind readnone alwaysinline { %r = call float @cosf(float %0) ret float %r } define void @__stdlib_sincosf(float, float *, float *) nounwind alwaysinline { call void @sincosf(float %0, float *%1, float *%2) ret void } define float @__stdlib_asinf(float) nounwind readnone alwaysinline { %r = call float @asinf(float %0) ret float %r } define float @__stdlib_acosf(float) nounwind readnone alwaysinline { %r = call float @acosf(float %0) ret float %r } define float @__stdlib_tanf(float) nounwind readnone alwaysinline { %r = call float @tanf(float %0) ret float %r } define float @__stdlib_atanf(float) nounwind readnone alwaysinline { %r = call float @atanf(float %0) ret float %r } define float @__stdlib_atan2f(float, float) nounwind readnone alwaysinline { %r = call float @atan2f(float %0, float %1) ret float %r } define float @__stdlib_logf(float) nounwind readnone alwaysinline { %r = call float @logf(float %0) ret float %r } define float @__stdlib_expf(float) nounwind readnone alwaysinline { %r = call float @expf(float %0) ret float %r } define float @__stdlib_powf(float, float) nounwind readnone alwaysinline { %r = call float @powf(float %0, float %1) ret float %r } declare double @sin(double) nounwind readnone declare double @asin(double) nounwind readnone declare double @cos(double) nounwind readnone declare void @sincos(double, double *, double *) nounwind declare double @tan(double) nounwind readnone declare double @atan(double) nounwind readnone declare double @atan2(double, double) nounwind readnone declare double @exp(double) nounwind readnone declare double @log(double) nounwind readnone declare double @pow(double, double) nounwind readnone define double @__stdlib_sin(double) nounwind readnone alwaysinline { %r = call double @sin(double %0) ret double %r } define double @__stdlib_asin(double) nounwind readnone alwaysinline { %r = call double @asin(double %0) ret double %r } define double @__stdlib_cos(double) nounwind readnone alwaysinline { %r = call double @cos(double %0) ret double %r } define void @__stdlib_sincos(double, double *, double *) nounwind alwaysinline { call void @sincos(double %0, double *%1, double *%2) ret void } define double @__stdlib_tan(double) nounwind readnone alwaysinline { %r = call double @tan(double %0) ret double %r } define double @__stdlib_atan(double) nounwind readnone alwaysinline { %r = call double @atan(double %0) ret double %r } define double @__stdlib_atan2(double, double) nounwind 
readnone alwaysinline { %r = call double @atan2(double %0, double %1) ret double %r } define double @__stdlib_log(double) nounwind readnone alwaysinline { %r = call double @log(double %0) ret double %r } define double @__stdlib_exp(double) nounwind readnone alwaysinline { %r = call double @exp(double %0) ret double %r } define double @__stdlib_pow(double, double) nounwind readnone alwaysinline { %r = call double @pow(double %0, double %1) ret double %r } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; 64-bit integer min and max functions ;; utility function used by int64minmax below. This shouldn't be called by ;; target .ll files directly. ;; $1: target vector width ;; $2: {min,max} (used in constructing function names) ;; $3: {int64,uint64} (used in constructing function names) ;; $4: {slt,sgt} comparison operator to used define(`i64minmax', ` define i64 @__$2_uniform_$3(i64, i64) nounwind alwaysinline readnone { %c = icmp $4 i64 %0, %1 %r = select i1 %c, i64 %0, i64 %1 ret i64 %r } define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline readnone { %rptr = alloca <$1 x i64> %r64ptr = bitcast <$1 x i64> * %rptr to i64 * forloop(i, 0, eval($1-1), ` %v0_`'i = extractelement <$1 x i64> %0, i32 i %v1_`'i = extractelement <$1 x i64> %1, i32 i %c_`'i = icmp $4 i64 %v0_`'i, %v1_`'i %v_`'i = select i1 %c_`'i, i64 %v0_`'i, i64 %v1_`'i %ptr_`'i = getelementptr PTR_OP_ARGS(`i64') %r64ptr, i32 i store i64 %v_`'i, i64 * %ptr_`'i ') %ret = load PTR_OP_ARGS(`<$1 x i64> ') %rptr ret <$1 x i64> %ret } ') ;; this is the function that target .ll files should call; it just takes the target ;; vector width as a parameter define(`int64minmax', ` i64minmax(WIDTH,min,int64,slt) i64minmax(WIDTH,max,int64,sgt) i64minmax(WIDTH,min,uint64,ult) i64minmax(WIDTH,max,uint64,ugt) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Emit general-purpose code to do a masked load for targets that dont have ;; an instruction to do that. Parameters: ;; $1: element type for which to emit the function (i32, i64, ...) (and suffix for function name) ;; $2: alignment for elements of type $1 (4, 8, ...) define(`masked_load', ` define @__masked_load_$1(i8 *, %mask) nounwind alwaysinline { entry: %mm = call i64 @__movmsk( %mask) ; if the first lane and the last lane are on, then it is safe to do a vector load ; of the whole thing--what the lanes in the middle want turns out to not matter... %mm_and_low = and i64 %mm, 1 %mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON %mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1) %mm_and_low_i1 = trunc i64 %mm_and_low to i1 %mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1 %can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1 %fast32 = call i32 @__fast_masked_vload() %fast_i1 = trunc i32 %fast32 to i1 %can_vload_maybe_fast = or i1 %fast_i1, %can_vload ; if we are not able to do a singe vload, we will accumulate lanes in this memory.. %retptr = alloca %retptr32 = bitcast * %retptr to $1 * br i1 %can_vload_maybe_fast, label %load, label %loop load: %ptr = bitcast i8 * %0 to * %valall = load PTR_OP_ARGS(` ') %ptr, align $2 ret %valall loop: ; loop over the lanes and see if each one is on... %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ] %lane64 = zext i32 %lane to i64 %lanemask = shl i64 1, %lane64 %mask_and = and i64 %mm, %lanemask %do_lane = icmp ne i64 %mask_and, 0 br i1 %do_lane, label %load_lane, label %lane_done load_lane: ; yes! 
do the load and store the result into the appropriate place in the ; allocaed memory above %ptr32 = bitcast i8 * %0 to $1 * %lane_ptr = getelementptr PTR_OP_ARGS(`$1') %ptr32, i32 %lane %val = load PTR_OP_ARGS(`$1 ') %lane_ptr %store_ptr = getelementptr PTR_OP_ARGS(`$1') %retptr32, i32 %lane store $1 %val, $1 * %store_ptr br label %lane_done lane_done: %next_lane = add i32 %lane, 1 %done = icmp eq i32 %lane, eval(WIDTH-1) br i1 %done, label %return, label %loop return: %r = load PTR_OP_ARGS(` ') %retptr ret %r } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store ;; emit code to do masked store as a set of per-lane scalar stores ;; parameters: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr PTR_OP_ARGS(`') %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') ret void } ') define(`masked_store_blend_8_16_by_4', ` define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, <4 x i32>) nounwind alwaysinline { %old = load PTR_OP_ARGS(`<4 x i8> ') %0, align 1 ifelse(LLVM_VERSION,LLVM_3_0,` %old32 = bitcast <4 x i8> %old to i32 %new32 = bitcast <4 x i8> %1 to i32 %mask8 = trunc <4 x i32> %2 to <4 x i8> %mask32 = bitcast <4 x i8> %mask8 to i32 %notmask32 = xor i32 %mask32, -1 %newmasked = and i32 %new32, %mask32 %oldmasked = and i32 %old32, %notmask32 %result = or i32 %newmasked, %oldmasked %resultvec = bitcast i32 %result to <4 x i8> ',` %m = trunc <4 x i32> %2 to <4 x i1> %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old ') store <4 x i8> %resultvec, <4 x i8> * %0, align 1 ret void } define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, <4 x i32>) nounwind alwaysinline { %old = load PTR_OP_ARGS(`<4 x i16> ') %0, align 2 ifelse(LLVM_VERSION,LLVM_3_0,` %old64 = bitcast <4 x i16> %old to i64 %new64 = bitcast <4 x i16> %1 to i64 %mask16 = trunc <4 x i32> %2 to <4 x i16> %mask64 = bitcast <4 x i16> %mask16 to i64 %notmask64 = xor i64 %mask64, -1 %newmasked = and i64 %new64, %mask64 %oldmasked = and i64 %old64, %notmask64 %result = or i64 %newmasked, %oldmasked %resultvec = bitcast i64 %result to <4 x i16> ',` %m = trunc <4 x i32> %2 to <4 x i1> %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old ') store <4 x i16> %resultvec, <4 x i16> * %0, align 2 ret void } ') define(`masked_store_blend_8_16_by_4_mask64', ` define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, <4 x i64>) nounwind alwaysinline { %old = load PTR_OP_ARGS(`<4 x i8> ') %0, align 1 ifelse(LLVM_VERSION,LLVM_3_0,` %old32 = bitcast <4 x i8> %old to i32 %new32 = bitcast <4 x i8> %1 to i32 %mask8 = trunc <4 x i64> %2 to <4 x i8> %mask32 = bitcast <4 x i8> %mask8 to i32 %notmask32 = xor i32 %mask32, -1 %newmasked = and i32 %new32, %mask32 %oldmasked = and i32 %old32, %notmask32 %result = or i32 %newmasked, %oldmasked %resultvec = bitcast i32 %result to <4 x i8> ',` %m = trunc <4 x i64> %2 to <4 x i1> %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old ') store <4 x i8> %resultvec, <4 x i8> * %0, align 1 ret void } define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, <4 x i64>) nounwind alwaysinline { %old = load PTR_OP_ARGS(`<4 x i16> ') %0, align 2 ifelse(LLVM_VERSION,LLVM_3_0,` %old64 = bitcast <4 x i16> %old to i64 %new64 = bitcast <4 x i16> %1 to i64 %mask16 = trunc <4 x i64> %2 to <4 x 
i16> %mask64 = bitcast <4 x i16> %mask16 to i64 %notmask64 = xor i64 %mask64, -1 %newmasked = and i64 %new64, %mask64 %oldmasked = and i64 %old64, %notmask64 %result = or i64 %newmasked, %oldmasked %resultvec = bitcast i64 %result to <4 x i16> ',` %m = trunc <4 x i64> %2 to <4 x i1> %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old ') store <4 x i16> %resultvec, <4 x i16> * %0, align 2 ret void } ') define(`masked_store_blend_8_16_by_8', ` define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, <8 x i32>) nounwind alwaysinline { %old = load PTR_OP_ARGS(`<8 x i8> ') %0, align 1 ifelse(LLVM_VERSION,LLVM_3_0,` %old64 = bitcast <8 x i8> %old to i64 %new64 = bitcast <8 x i8> %1 to i64 %mask8 = trunc <8 x i32> %2 to <8 x i8> %mask64 = bitcast <8 x i8> %mask8 to i64 %notmask64 = xor i64 %mask64, -1 %newmasked = and i64 %new64, %mask64 %oldmasked = and i64 %old64, %notmask64 %result = or i64 %newmasked, %oldmasked %resultvec = bitcast i64 %result to <8 x i8> ',` %m = trunc <8 x i32> %2 to <8 x i1> %resultvec = select <8 x i1> %m, <8 x i8> %1, <8 x i8> %old ') store <8 x i8> %resultvec, <8 x i8> * %0, align 1 ret void } define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, <8 x i32>) nounwind alwaysinline { %old = load PTR_OP_ARGS(`<8 x i16> ') %0, align 2 ifelse(LLVM_VERSION,LLVM_3_0,` %old128 = bitcast <8 x i16> %old to i128 %new128 = bitcast <8 x i16> %1 to i128 %mask16 = trunc <8 x i32> %2 to <8 x i16> %mask128 = bitcast <8 x i16> %mask16 to i128 %notmask128 = xor i128 %mask128, -1 %newmasked = and i128 %new128, %mask128 %oldmasked = and i128 %old128, %notmask128 %result = or i128 %newmasked, %oldmasked %resultvec = bitcast i128 %result to <8 x i16> ',` %m = trunc <8 x i32> %2 to <8 x i1> %resultvec = select <8 x i1> %m, <8 x i16> %1, <8 x i16> %old ') store <8 x i16> %resultvec, <8 x i16> * %0, align 2 ret void } ') define(`masked_store_blend_8_16_by_16', ` define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, <16 x i32>) nounwind alwaysinline { %old = load PTR_OP_ARGS(`<16 x i8> ') %0, align 1 ifelse(LLVM_VERSION,LLVM_3_0,` %old128 = bitcast <16 x i8> %old to i128 %new128 = bitcast <16 x i8> %1 to i128 %mask8 = trunc <16 x i32> %2 to <16 x i8> %mask128 = bitcast <16 x i8> %mask8 to i128 %notmask128 = xor i128 %mask128, -1 %newmasked = and i128 %new128, %mask128 %oldmasked = and i128 %old128, %notmask128 %result = or i128 %newmasked, %oldmasked %resultvec = bitcast i128 %result to <16 x i8> ',` %m = trunc <16 x i32> %2 to <16 x i1> %resultvec = select <16 x i1> %m, <16 x i8> %1, <16 x i8> %old ') store <16 x i8> %resultvec, <16 x i8> * %0, align 1 ret void } define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, <16 x i32>) nounwind alwaysinline { %old = load PTR_OP_ARGS(`<16 x i16> ') %0, align 2 ifelse(LLVM_VERSION,LLVM_3_0,` %old256 = bitcast <16 x i16> %old to i256 %new256 = bitcast <16 x i16> %1 to i256 %mask16 = trunc <16 x i32> %2 to <16 x i16> %mask256 = bitcast <16 x i16> %mask16 to i256 %notmask256 = xor i256 %mask256, -1 %newmasked = and i256 %new256, %mask256 %oldmasked = and i256 %old256, %notmask256 %result = or i256 %newmasked, %oldmasked %resultvec = bitcast i256 %result to <16 x i16> ',` %m = trunc <16 x i32> %2 to <16 x i1> %resultvec = select <16 x i1> %m, <16 x i16> %1, <16 x i16> %old ') store <16 x i16> %resultvec, <16 x i16> * %0, align 2 ret void } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; packed load and store functions ;; ;; These define functions to 
emulate those nice packed load and packed store
;; instructions.  For packed store, given a pointer to the destination array
;; and an offset into the array, for each lane where the mask is on, the
;; corresponding value for that lane is stored into packed locations in the
;; destination array.  For packed load, each lane that has an active mask
;; loads a sequential value from the array.
;;
;; $1: vector width of the target
;;
;; FIXME: use the per_lane macro, defined below, to implement these!

define(`packed_load_and_store', `

define i32 @__packed_load_active(i32 * %startptr, <1 x i32> * %val_ptr, <1 x i1> %full_mask) nounwind alwaysinline {
entry:
  %active = extractelement <1 x i1> %full_mask, i32 0
  %call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active)
  %res.sroa.0.0.extract.trunc = trunc i64 %call to i32
  br i1 %active, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %idxprom = ashr i64 %call, 32
  %arrayidx = getelementptr inbounds PTR_OP_ARGS(`i32') %startptr, i64 %idxprom
  %val = load PTR_OP_ARGS(`i32') %arrayidx, align 4
  %valvec = insertelement <1 x i32> undef, i32 %val, i32 0
  store <1 x i32> %valvec, <1 x i32>* %val_ptr, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret i32 %res.sroa.0.0.extract.trunc
}

define i32 @__packed_store_active(i32 * %startptr, <1 x i32> %vals, <1 x i1> %full_mask) nounwind alwaysinline {
entry:
  %active = extractelement <1 x i1> %full_mask, i32 0
  %call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active)
  %res.sroa.0.0.extract.trunc = trunc i64 %call to i32
  br i1 %active, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %idxprom = ashr i64 %call, 32
  %arrayidx = getelementptr inbounds PTR_OP_ARGS(`i32') %startptr, i64 %idxprom
  %val = extractelement <1 x i32> %vals, i32 0
  store i32 %val, i32* %arrayidx, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret i32 %res.sroa.0.0.extract.trunc
}

define i32 @__packed_store_active2(i32 * %startptr, <1 x i32> %vals, <1 x i1> %full_mask) nounwind alwaysinline {
  %ret = call i32 @__packed_store_active(i32* %startptr, <1 x i32> %vals, <1 x i1> %full_mask);
  ret i32 %ret
}
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reduce_equal

;; count leading/trailing zeros
;; This macro declares a set of count-trailing-zeros and count-leading-zeros
;; intrinsics.  It behaves like a static function: only its first invocation
;; emits the declarations, so that they are not redefined.
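;; For illustration only (not something the library itself expands): because
;; the macro below defines `count_zeros_are_defined' the first time it runs,
;; a target file can safely expand it more than once, e.g.
;;   declare_count_zeros()
;;   declare_count_zeros()
;; and the llvm.ctlz/llvm.cttz declarations are still emitted exactly once.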
define(`declare_count_zeros', ` ifelse(count_zeros_are_defined, true, `', ` declare i32 @llvm.ctlz.i32(i32) declare i64 @llvm.ctlz.i64(i64) declare i32 @llvm.cttz.i32(i32) declare i64 @llvm.cttz.i64(i64) define(`count_zeros_are_defined', true) ') ') define(`reduce_equal_aux', ` declare_count_zeros() define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue, <$1 x MASK> %mask) nounwind alwaysinline { entry: %mm = call i64 @__movmsk(<$1 x MASK> %mask) %allon = icmp eq i64 %mm, ALL_ON_MASK br i1 %allon, label %check_neighbors, label %domixed domixed: ; First, figure out which lane is the first active one %first = call i64 @llvm.cttz.i64(i64 %mm) %first32 = trunc i64 %first to i32 %baseval = extractelement <$1 x $2> %v, i32 %first32 %basev1 = insertelement <$1 x $2> undef, $2 %baseval, i32 0 ; get a vector that is that value smeared across all elements %basesmear = shufflevector <$1 x $2> %basev1, <$1 x $2> undef, <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 > ; now to a blend of that vector with the original vector, such that the ; result will be the original value for the active lanes, and the value ; from the first active lane for the inactive lanes. Given that, we can ; just unconditionally check if the lanes are all equal in check_neighbors ; below without worrying about inactive lanes... %ptr = alloca <$1 x $2> store <$1 x $2> %basesmear, <$1 x $2> * %ptr %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> * %castv = bitcast <$1 x $2> %v to <$1 x $4> call void @__masked_store_blend_i$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask) %blendvec = load PTR_OP_ARGS(`<$1 x $2> ') %ptr br label %check_neighbors check_neighbors: %vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ] ifelse($6, `32', ` ; For 32-bit elements, we rotate once and compare with the vector, which ends ; up comparing each element to its neighbor on the right. Then see if ; all of those values are true; if so, then all of the elements are equal.. %castvec = bitcast <$1 x $2> %vec to <$1 x $4> %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr ifelse(MASK,i1, ` %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', `%eqm = sext <$1 x i1> %eq to <$1 x MASK> %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` ; But for 64-bit elements, it turns out to be more efficient to just ; scalarize and do a individual pairwise comparisons and AND those ; all together.. forloop(i, 0, eval($1-1), ` %v`'i = extractelement <$1 x $2> %vec, i32 i') forloop(i, 0, eval($1-2), ` %eq`'i = $5 $7 $2 %v`'i, %v`'eval(i+1)') %and0 = and i1 %eq0, %eq1 forloop(i, 1, eval($1-3), ` %and`'i = and i1 %and`'eval(i-1), %eq`'eval(i+1)') br i1 %and`'eval($1-3), label %all_equal, label %not_all_equal ') all_equal: %the_value = extractelement <$1 x $2> %vec, i32 0 store $2 %the_value, $2 * %samevalue ret i1 true not_all_equal: ret i1 false } ') define(`reduce_equal', ` reduce_equal_aux($1, i32, int32, i32, icmp, 32, eq) reduce_equal_aux($1, float, float, i32, fcmp, 32, oeq) reduce_equal_aux($1, i64, int64, i64, icmp, 64, eq) reduce_equal_aux($1, double, double, i64, fcmp, 64, oeq) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; per_lane ;; ;; The scary macro below encapsulates the 'scalarization' idiom--i.e. 
we have ;; some operation that we'd like to perform only for the lanes where the ;; mask is on ;; $1: vector width of the target ;; $2: variable that holds the mask ;; $3: block of code to run for each lane that is on ;; Inside this code, any instances of the text "LANE" are replaced ;; with an i32 value that represents the current lane number ; num lanes, mask, code block to do per lane define(`per_lane', ` br label %pl_entry pl_entry: %pl_mask = call i64 @__movmsk($2) %pl_mask_known = call i1 @__is_compile_time_constant_mask($2) br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask pl_known_mask: ;; the mask is known at compile time; see if it is something we can ;; handle more efficiently %pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask pl_all_on: ;; the mask is all on--just expand the code for each lane sequentially forloop(i, 0, eval($1-1), `patsubst(`$3', `LANE', i)') br label %pl_done pl_unknown_mask: ;; we just run the general case, though we could ;; try to be smart and just emit the code based on what it actually is, ;; for example by emitting the code straight-line without a loop and doing ;; the lane tests explicitly, leaving later optimization passes to eliminate ;; the stuff that is definitely not needed. Not clear if we will frequently ;; encounter a mask that is known at compile-time but is not either all on or ;; all off... br label %pl_loop pl_loop: ;; Loop over each lane and see if we want to do the work for this lane %pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ] %pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ] ; is the current lane on? if so, goto do work, otherwise to end of loop %pl_and = and i64 %pl_mask, %pl_lanemask %pl_doit = icmp eq i64 %pl_and, %pl_lanemask br i1 %pl_doit, label %pl_dolane, label %pl_loopend pl_dolane: ;; If so, substitute in the code from the caller and replace the LANE ;; stuff with the current lane number patsubst(`patsubst(`$3', `LANE_ID', `_id')', `LANE', `%pl_lane') br label %pl_loopend pl_loopend: %pl_nextlane = add i32 %pl_lane, 1 %pl_nextlanemask = mul i64 %pl_lanemask, 2 ; are we done yet? 
%pl_test = icmp ne i32 %pl_nextlane, $1 br i1 %pl_test, label %pl_loop, label %pl_done pl_done: ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather ;; ;; $1: scalar type for which to generate functions to do gathers define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load PTR_OP_ARGS(`$1 ') %ptr_LANE_ID %store_ptr_LANE_ID = getelementptr PTR_OP_ARGS(`') %ret_ptr, i32 0, i32 LANE store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID ') %ret = load PTR_OP_ARGS(` ') %ret_ptr ret %ret } ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load PTR_OP_ARGS(`$1 ') %ptr_LANE_ID %store_ptr_LANE_ID = getelementptr PTR_OP_ARGS(`') %ret_ptr, i32 0, i32 LANE store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID ') %ret = load PTR_OP_ARGS(` ') %ret_ptr ret %ret } ') ; vec width, type define(`gen_gather_factored', ` ;; Define the utility function to do the gather operation for a single element ;; of the type define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %ret, i32 %lane) nounwind readonly alwaysinline { ; compute address for this one from the base %offset32 = extractelement %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %offset64 = sext i32 %offset32 to i64 %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 %ptroffset = getelementptr PTR_OP_ARGS(`i8') %ptr, i64 %offset %delta = extractelement %offset_delta, i32 %lane %delta64 = sext i32 %delta to i64 %finalptr = getelementptr PTR_OP_ARGS(`i8') %ptroffset, i64 %delta64 ; load value and insert into returned value %ptrcast = bitcast i8 * %finalptr to $1 * %val = load PTR_OP_ARGS(`$1 ') %ptrcast %updatedret = insertelement %ret, $1 %val, i32 %lane ret %updatedret } define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %ret, i32 %lane) nounwind readonly alwaysinline { ; compute address for this one from the base %offset64 = extractelement %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %offset_scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %offset_scale64 %ptroffset = getelementptr PTR_OP_ARGS(`i8') %ptr, i64 %offset %delta64 = extractelement %offset_delta, i32 %lane %finalptr = getelementptr PTR_OP_ARGS(`i8') %ptroffset, i64 %delta64 ; load value and insert into returned value %ptrcast = bitcast i8 * %finalptr to $1 * %val = load PTR_OP_ARGS(`$1 ') %ptrcast %updatedret = insertelement %ret, $1 %val, i32 %lane ret %updatedret } define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read 
from (and we do indeed require that, given the benefits!) ; ; Set the offset to zero for lanes that are off %offsetsPtr = alloca store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, %vecmask) %newOffsets = load PTR_OP_ARGS(` ') %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, %vecmask) %newDelta = load PTR_OP_ARGS(` ') %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, i32 %offset_scale, %newDelta, undef, i32 0) forloop(lane, 1, eval(WIDTH-1), `patsubst(patsubst(`%retLANE = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, i32 %offset_scale, %newDelta, %retPREV, i32 LANE) ', `LANE', lane), `PREV', eval(lane-1))') ret %ret`'eval(WIDTH-1) } define @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) ; ; Set the offset to zero for lanes that are off %offsetsPtr = alloca store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, %vecmask) %newOffsets = load PTR_OP_ARGS(` ') %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, %vecmask) %newDelta = load PTR_OP_ARGS(` ') %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, i32 %offset_scale, %newDelta, undef, i32 0) forloop(lane, 1, eval(WIDTH-1), `patsubst(patsubst(`%retLANE = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, i32 %offset_scale, %newDelta, %retPREV, i32 LANE) ', `LANE', lane), `PREV', eval(lane-1))') ret %ret`'eval(WIDTH-1) } gen_gather_general($1) ' ) ; vec width, type define(`gen_gather', ` gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, i32 1, zeroinitializer, %vecmask) ret %v } ' ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gen_scatter ;; Emit a function declaration for a scalarized scatter. 
;;
;; $1: scalar type for which we want to generate code to scatter

define(`gen_scatter', `
;; Define the function that describes the work to do to scatter a single
;; value
define void @__scatter_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %values, i32 %lane) nounwind alwaysinline {
  %offset32 = extractelement %offsets, i32 %lane
  ; the order and details of the next 4 lines are important--they match LLVM's
  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
  %offset64 = sext i32 %offset32 to i64
  %scale64 = sext i32 %offset_scale to i64
  %offset = mul i64 %offset64, %scale64
  %ptroffset = getelementptr PTR_OP_ARGS(`i8') %ptr, i64 %offset
  %delta = extractelement %offset_delta, i32 %lane
  %delta64 = sext i32 %delta to i64
  %finalptr = getelementptr PTR_OP_ARGS(`i8') %ptroffset, i64 %delta64
  %ptrcast = bitcast i8 * %finalptr to $1 *
  %storeval = extractelement %values, i32 %lane
  store $1 %storeval, $1 * %ptrcast
  ret void
}

define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %values, i32 %lane) nounwind alwaysinline {
  %offset64 = extractelement %offsets, i32 %lane
  ; the order and details of the next 4 lines are important--they match LLVM's
  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
  %scale64 = sext i32 %offset_scale to i64
  %offset = mul i64 %offset64, %scale64
  %ptroffset = getelementptr PTR_OP_ARGS(`i8') %ptr, i64 %offset
  %delta64 = extractelement %offset_delta, i32 %lane
  %finalptr = getelementptr PTR_OP_ARGS(`i8') %ptroffset, i64 %delta64
  %ptrcast = bitcast i8 * %finalptr to $1 *
  %storeval = extractelement %values, i32 %lane
  store $1 %storeval, $1 * %ptrcast
  ret void
}

define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, %mask) nounwind alwaysinline {
  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
  per_lane(WIDTH, %mask, `
  call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)')
  ret void
}

define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, %mask) nounwind alwaysinline {
  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void } ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, %mask) nounwind alwaysinline { per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE store $1 %val_LANE_ID, $1 * %ptr_LANE_ID ') ret void } ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, %mask) nounwind alwaysinline { per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE store $1 %val_LANE_ID, $1 * %ptr_LANE_ID ') ret void } ' ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rdrand define(`rdrand_decls', ` declare i1 @__rdrand_i16(i16 * nocapture) declare i1 @__rdrand_i32(i32 * nocapture) declare i1 @__rdrand_i64(i64 * nocapture) ') define(`rdrand_definition', ` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rdrand declare {i16, i32} @llvm.x86.rdrand.16() declare {i32, i32} @llvm.x86.rdrand.32() declare {i64, i32} @llvm.x86.rdrand.64() define i1 @__rdrand_i16(i16 * %ptr) { %v = call {i16, i32} @llvm.x86.rdrand.16() %v0 = extractvalue {i16, i32} %v, 0 %v1 = extractvalue {i16, i32} %v, 1 store i16 %v0, i16 * %ptr %good = icmp ne i32 %v1, 0 ret i1 %good } define i1 @__rdrand_i32(i32 * %ptr) { %v = call {i32, i32} @llvm.x86.rdrand.32() %v0 = extractvalue {i32, i32} %v, 0 %v1 = extractvalue {i32, i32} %v, 1 store i32 %v0, i32 * %ptr %good = icmp ne i32 %v1, 0 ret i1 %good } define i1 @__rdrand_i64(i64 * %ptr) { %v = call {i64, i32} @llvm.x86.rdrand.64() %v0 = extractvalue {i64, i32} %v, 0 %v1 = extractvalue {i64, i32} %v, 1 store i64 %v0, i64 * %ptr %good = icmp ne i32 %v1, 0 ret i1 %good } ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int8/int16 builtins define(`define_avg_up_uint8', ` define @__avg_up_uint8(, ) { %a16 = zext %0 to %b16 = zext %1 to %sum1 = add %a16, %b16 %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > %r = trunc %avg to ret %r }') define(`define_avg_up_int8', ` define @__avg_up_int8(, ) { %a16 = sext %0 to %b16 = sext %1 to %sum1 = add %a16, %b16 %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > %r = trunc %avg to ret %r }') define(`define_avg_up_uint16', ` define @__avg_up_uint16(, ) { %a32 = zext %0 to %b32 = zext %1 to %sum1 = add %a32, %b32 %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > %r = trunc %avg to ret %r }') define(`define_avg_up_int16', ` define @__avg_up_int16(, ) { %a32 = sext %0 to %b32 = sext %1 to %sum1 = add %a32, %b32 %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > %r = trunc %avg to ret %r }') define(`define_avg_down_uint8', ` define @__avg_down_uint8(, ) { %a16 = zext %0 to %b16 = zext %1 to %sum = add %a16, %b16 %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > %r = trunc %avg to ret %r }') define(`define_avg_down_int8', ` 
define @__avg_down_int8(, ) { %a16 = sext %0 to %b16 = sext %1 to %sum = add %a16, %b16 %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > %r = trunc %avg to ret %r }') define(`define_avg_down_uint16', ` define @__avg_down_uint16(, ) { %a32 = zext %0 to %b32 = zext %1 to %sum = add %a32, %b32 %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > %r = trunc %avg to ret %r }') define(`define_avg_down_int16', ` define @__avg_down_int16(, ) { %a32 = sext %0 to %b32 = sext %1 to %sum = add %a32, %b32 %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > %r = trunc %avg to ret %r }') define(`define_up_avgs', ` define_avg_up_uint8() define_avg_up_int8() define_avg_up_uint16() define_avg_up_int16() ') define(`define_down_avgs', ` define_avg_down_uint8() define_avg_down_int8() define_avg_down_uint16() define_avg_down_int16() ') define(`define_avgs', ` define_up_avgs() define_down_avgs() ') ;;;;;;;;;;;;;;;;;;;; define(`const_vector', `<$1 $2>') define(`saturation_arithmetic_novec_universal', ` define @__p$1s_vi8(, ) { %v0_i16 = sext %0 to %v1_i16 = sext %1 to %res = $1 %v0_i16, %v1_i16 %over_mask = icmp sgt %res, const_vector(i16, 127) %over_res = select %over_mask, const_vector(i16, 127), %res %under_mask = icmp slt %res, const_vector(i16, -128) %ret_i16 = select %under_mask, const_vector(i16, -128), %over_res %ret = trunc %ret_i16 to ret %ret } define @__p$1s_vi16(, ) { %v0_i32 = sext %0 to %v1_i32 = sext %1 to %res = $1 %v0_i32, %v1_i32 %over_mask = icmp sgt %res, const_vector(i32, 32767) %over_res = select %over_mask, const_vector(i32, 32767), %res %under_mask = icmp slt %res, const_vector(i32, -32768) %ret_i32 = select %under_mask, const_vector(i32, -32768), %over_res %ret = trunc %ret_i32 to ret %ret } define @__p$1us_vi8(, ) { %v0_i16 = zext %0 to %v1_i16 = zext %1 to %res = $1 %v0_i16, %v1_i16 %over_mask = icmp ugt %res, const_vector(i16, 255) %over_res = select %over_mask, const_vector(i16, 255), %res %under_mask = icmp slt %res, const_vector(i16, 0) %ret_i16 = select %under_mask, const_vector(i16, 0), %over_res %ret = trunc %ret_i16 to ret %ret } define @__p$1us_vi16(, ) { %v0_i32 = zext %0 to %v1_i32 = zext %1 to %res = $1 %v0_i32, %v1_i32 %over_mask = icmp ugt %res, const_vector(i32, 65535) %over_res = select %over_mask, const_vector(i32, 65535), %res %under_mask = icmp slt %res, const_vector(i32, 0) %ret_i32 = select %under_mask, const_vector(i32, 0), %over_res %ret = trunc %ret_i32 to ret %ret } ') define(`saturation_arithmetic_novec', ` saturation_arithmetic_novec_universal(sub) saturation_arithmetic_novec_universal(add) ') declare void @__pseudo_prefetch_read_varying_1(, ) nounwind declare void @__pseudo_prefetch_read_varying_1_native(i8 *, i32, , ) nounwind declare void @__pseudo_prefetch_read_varying_2(, ) nounwind declare void @__pseudo_prefetch_read_varying_2_native(i8 *, i32, , ) nounwind declare void @__pseudo_prefetch_read_varying_3(, ) nounwind declare void @__pseudo_prefetch_read_varying_3_native(i8 *, i32, , ) nounwind declare void @__pseudo_prefetch_read_varying_nt(, ) nounwind declare void @__pseudo_prefetch_read_varying_nt_native(i8 *, i32, , ) nounwind
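;; For illustration only (a sketch, not part of the library): a target .ll
;; file that wants these emulated helpers would typically expand, after
;; defining WIDTH,
;;   define_avgs()
;;   saturation_arithmetic_novec()
;; With WIDTH=4 and uint8 inputs <250, 1, 2, 3> and <7, 1, 2, 3>,
;; __avg_up_uint8 computes (a + b + 1) >> 1 in 16-bit precision, giving
;; <129, 1, 2, 3>, while __avg_down_uint8 computes (a + b) >> 1, giving
;; <128, 1, 2, 3>.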