merged with master

egaburov
2013-10-08 19:13:30 +02:00
162 changed files with 16148 additions and 1942 deletions

View File

@@ -1,4 +1,4 @@
;; Copyright (c) 2011, Intel Corporation
;; Copyright (c) 2011-2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -41,15 +41,13 @@
@__system_best_isa = internal global i32 -1
declare void @abort() noreturn
;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
;; following code... Specifically, __get_system_isa should return a value
;; corresponding to one of the Target::ISA enumerant values that gives the
;; most capable ISA that the current system can run.
;;
;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 3.0
;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 3.1
;;
;; #include <stdint.h>
;; #include <stdlib.h>
@@ -60,7 +58,7 @@ declare void @abort() noreturn
;; : "0" (infoType));
;; }
;;
;; /* Save %ebx in case it's the PIC register */
;; // Save %ebx in case it's the PIC register.
;; static void __cpuid_count(int info[4], int level, int count) {
;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
;; "cpuid\n\t"
@@ -69,13 +67,23 @@ declare void @abort() noreturn
;; : "0" (level), "2" (count));
;; }
;;
;; static int __os_has_avx_support() {
;; // Check xgetbv; this uses a .byte sequence instead of the instruction
;; // directly because older assemblers do not include support for xgetbv and
;; // there is no easy way to conditionally compile based on the assembler used.
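;; // The check below tests XCR0 bits 1 and 2 (XMM and YMM state), i.e. that
;; // the OS saves and restores the AVX register state across context switches,
;; // so AVX code can safely be run.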
;; int rEAX, rEDX;
;; __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
;; return (rEAX & 6) == 6;
;; }
;;
;; int32_t __get_system_isa() {
;; int info[4];
;; __cpuid(info, 1);
;;
;; /* NOTE: the values returned below must be the same as the
;; corresponding enumerant values in Target::ISA. */
;; if ((info[2] & (1 << 28)) != 0) {
;; // NOTE: the values returned below must be the same as the
;; // corresponding enumerant values in Target::ISA.
;; if ((info[2] & (1 << 28)) != 0 &&
;; __os_has_avx_support()) {
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) { // RDRAND
;; // So far, so good. AVX2?
@@ -98,47 +106,56 @@ declare void @abort() noreturn
;; abort();
;; }
define i32 @__get_system_isa() nounwind uwtable ssp {
define i32 @__get_system_isa() nounwind uwtable {
entry:
%0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
%and = and i32 %asmresult5.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else13, label %if.then
br i1 %cmp, label %if.else14, label %land.lhs.true
if.then: ; preds = %entry
%1 = and i32 %asmresult5.i, 1610612736
%2 = icmp eq i32 %1, 1610612736
br i1 %2, label %if.then7, label %return
land.lhs.true: ; preds = %entry
%1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind
%asmresult.i25 = extractvalue { i32, i32 } %1, 0
%and.i = and i32 %asmresult.i25, 6
%cmp.i = icmp eq i32 %and.i, 6
br i1 %cmp.i, label %if.then, label %if.else14
if.then7: ; preds = %if.then
%3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
%and10 = lshr i32 %asmresult4.i28, 5
%4 = and i32 %and10, 1
%5 = add i32 %4, 3
if.then: ; preds = %land.lhs.true
%2 = and i32 %asmresult5.i, 1610612736
%3 = icmp eq i32 %2, 1610612736
br i1 %3, label %if.then8, label %return
if.then8: ; preds = %if.then
%4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i30 = extractvalue { i32, i32, i32, i32 } %4, 1
%and11 = lshr i32 %asmresult4.i30, 5
%5 = and i32 %and11, 1
%6 = add i32 %5, 3
br label %return
if.else13: ; preds = %entry
%and15 = and i32 %asmresult5.i, 524288
%cmp16 = icmp eq i32 %and15, 0
br i1 %cmp16, label %if.else18, label %return
if.else14: ; preds = %land.lhs.true, %entry
%and16 = and i32 %asmresult5.i, 524288
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else18: ; preds = %if.else13
%and20 = and i32 %asmresult6.i, 67108864
%cmp21 = icmp eq i32 %and20, 0
br i1 %cmp21, label %if.else23, label %return
if.else19: ; preds = %if.else14
%and21 = and i32 %asmresult6.i, 67108864
%cmp22 = icmp eq i32 %and21, 0
br i1 %cmp22, label %if.else24, label %return
if.else23: ; preds = %if.else18
if.else24: ; preds = %if.else19
tail call void @abort() noreturn nounwind
unreachable
return: ; preds = %if.else18, %if.else13, %if.then7, %if.then
%retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
return: ; preds = %if.else19, %if.else14, %if.then8, %if.then
%retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ]
ret i32 %retval.0
}
declare void @abort() noreturn nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This function is called by each of the dispatch functions we generate;
;; it sets @__system_best_isa if it is unset.

217
builtins/svml.m4 Normal file
View File

@@ -0,0 +1,217 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;; svml macro
;; svml_stubs : stubs for svml calls
;; $1 - type ("float" or "double")
;; $2 - svml internal function suffix ("f" for float, "d" for double)
;; $3 - vector width
define(`svml_stubs',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline
declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
')
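;; e.g. svml_stubs(float, f, 8) declares 8-wide stubs such as
;;   declare <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline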
;; svml_declare : declaration of __svml_* intrinsics
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
;; double: "2"(sse) "4"(avx) "8"(avx512)
;; $3 - vector width
define(`svml_declare',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
')
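;; e.g. svml_declare(float, f4, 4) declares the 4-wide SSE entry points, such as
;;   declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone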
;; svml_define : definition of the __svml_* internal functions
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
;; double: "2"(sse) "4"(avx) "8"(avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
define(`svml_define',`
define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline {
%s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0)
store <$3 x $1> %s, <$3 x $1> * %1
ret void
}
define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1)
ret <$3 x $1> %ret
}
')
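;; e.g. svml_define(float, f4, 4, f) defines wrappers that forward directly to
;; the 4-wide intrinsics:
;;   define <4 x float> @__svml_sinf(<4 x float>) nounwind readnone alwaysinline {
;;     %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
;;     ret <4 x float> %ret
;;   }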
;; svml_define_x : definition of __svml_* internal functions operating on an extended width
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
;; double: "2"(sse) "4"(avx) "8"(avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
;; $5 - extended width, must be at least twice the native vector width
;; contingent on the existence of the unary$3to$5 and binary$3to$5 macros
;; *todo*: in the sincos definition, call __svml_sincos[f][2,4,8,16] directly
;; instead of separate sin and cos calls, e.g.
;;define void @__svml_sincosf(<8 x float>, <8 x float> *,
;; <8 x float> *) nounwind readnone alwaysinline {
;; ; call svml_sincosf4 two times with the two 4-wide sub-vectors
;; %a = shufflevector <8 x float> %0, <8 x float> undef,
;; <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;; %b = shufflevector <8 x float> %0, <8 x float> undef,
;; <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;;
;; %cospa = alloca <4 x float>
;; %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
;;
;; %cospb = alloca <4 x float>
;; %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
;;
;; %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
;; <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;; i32 4, i32 5, i32 6, i32 7>
;; store <8 x float> %sin, <8 x float> * %1
;;
;; %cosa = load <4 x float> * %cospa
;; %cosb = load <4 x float> * %cospb
;; %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
;; <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;; i32 4, i32 5, i32 6, i32 7>
;; store <8 x float> %cos, <8 x float> * %2
;;
;; ret void
;;}
define(`svml_define_x',`
define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_sin$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_asin$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_cos$2, %0)
ret <$5 x $1> %ret
}
define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
{
%s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
%c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
store <$5 x $1> %s, <$5 x $1> * %1
store <$5 x $1> %c, <$5 x $1> * %2
ret void
}
define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_tan$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_atan$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_exp$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_log$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
binary$3to$5(ret, $1, @__svml_pow$2, %0, %1)
ret <$5 x $1> %ret
}
')
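;; e.g. svml_define_x(float, f8, 8, f, 16), as used by the 16-wide targets,
;; splits each 16-wide vector into two 8-wide halves via the unary8to16/binary8to16
;; macros and calls the 8-wide AVX entry points on each half.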

View File

@@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

View File

@@ -137,19 +137,14 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones 4x with our 16-wide
; vectors...
include(`svml.m4')
;; single precision
svml_declare(float,f8,8)
svml_define_x(float,f8,8,f,16)
declare <16 x float> @__svml_sin(<16 x float>)
declare <16 x float> @__svml_cos(<16 x float>)
declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
declare <16 x float> @__svml_tan(<16 x float>)
declare <16 x float> @__svml_atan(<16 x float>)
declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
declare <16 x float> @__svml_exp(<16 x float>)
declare <16 x float> @__svml_log(<16 x float>)
declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
;; double precision
svml_declare(double,4,4)
svml_define_x(double,4,4,d,16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
@@ -271,6 +266,33 @@ reduce_equal(16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
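;; psadbw against a zero vector sums the absolute values of each 8-byte half of
;; the input into the corresponding 64-bit lane of the result; adding the two
;; lanes gives the sum of all 16 bytes.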
define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <16 x i16> @__add_varying_i16(<16 x i16>,
<16 x i16>) nounwind readnone alwaysinline {
%r = add <16 x i16> %0, %1
ret <16 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}
define <16 x i32> @__add_varying_int32(<16 x i32>,
<16 x i32>) nounwind readnone alwaysinline {
%s = add <16 x i32> %0, %1

View File

@@ -137,19 +137,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...
include(`svml.m4')
;; single precision
svml_declare(float,f8,8)
svml_define(float,f8,8,f)
declare <8 x float> @__svml_sin(<8 x float>)
declare <8 x float> @__svml_cos(<8 x float>)
declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
declare <8 x float> @__svml_tan(<8 x float>)
declare <8 x float> @__svml_atan(<8 x float>)
declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
declare <8 x float> @__svml_exp(<8 x float>)
declare <8 x float> @__svml_log(<8 x float>)
declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
;; double precision
svml_declare(double,4,4)
svml_define_x(double,4,4,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
@@ -217,7 +212,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
ret float %sum
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}
@@ -229,6 +223,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
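;; the shuffle below widens the 8 x i8 input to 16 x i8, padding the upper half
;; with zeros (element 8 of the zero vector), so psadbw computes the 8-element sum.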
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
@@ -257,20 +287,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
@@ -329,9 +353,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

View File

@@ -0,0 +1,81 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx1-i64x4base.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

View File

@@ -0,0 +1,513 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 4-wide definitions
define(`WIDTH',`4')
define(`MASK',`i64')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
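; (one Newton-Raphson step for f(x) = 1/x - v: x' = x * (2 - v*x); rcpps gives
;  roughly 12 bits of precision, and one step roughly doubles that)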
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
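;; (roundps imm8 encoding: bits 1:0 select the rounding mode (00 nearest,
;;  01 down, 10 up, 11 truncate) and bit 3 (value 8) suppresses precision exceptions)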
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
ret <4 x float> %call
}
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
ret <4 x float> %call
}
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
;; avx intrinsic
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
ret <4 x double> %call
}
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
ret <4 x double> %call
}
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
ret <4 x double> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
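; (one Newton-Raphson step for f(x) = 1/x^2 - v; rsqrtps gives roughly 12 bits
;  of precision, so one step brings the result close to full single precision)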
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt
;; sse intrinsic
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
;; avx intrinsic
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
%call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
ret <4 x double> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)
;; double precision
svml_declare(double,4,4)
svml_define(double,4,4,d)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
;; sse intrinsics
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
;; sse intrinsic
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
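;; movmsk.pd.256 packs the sign bit (bit 63) of each of the four 64-bit mask
;; lanes into the low four bits of the result.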
define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops
;; sse intrinsic
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
%scalar = extractelement <4 x float> %v2, i32 0
ret float %scalar
}
define float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline
{
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
define <4 x i32> @__add_varying_int32(<4 x i32>,
<4 x i32>) nounwind readnone alwaysinline {
%s = add <4 x i32> %0, %1
ret <4 x i32> %s
}
define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
%s = add i32 %0, %1
ret i32 %s
}
define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
}
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <4 x double> <double 0.,double 0.,double 0.,double 0.>, <4 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
;; %v1 = <4 x double> <double 0., double 0., double 0., double 0.>
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%final0 = extractelement <4 x double> %sum1, i32 0
%final1 = extractelement <4 x double> %sum1, i32 2
%sum = fadd double %final0, %final1
ret double %sum
}
define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops
define <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
%s = add <4 x i64> %0, %1
ret <4 x i64> %s
}
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
%s = add i64 %0, %1
ret i64 %s
}
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
; there is no masked load instruction for i8 and i16 element types, so use the generic macro
masked_load(i8, 1)
masked_load(i16, 2)
;; avx intrinsics
declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
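;; the AVX maskload instructions use the sign bit of each mask element; lanes
;; whose sign bit is clear are not read from memory and return zero.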
define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
%mask = trunc <4 x i64> %mask64 to <4 x i32>
%floatmask = bitcast <4 x i32> %mask to <4 x float>
%floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
%retval = bitcast <4 x float> %floatval to <4 x i32>
ret <4 x i32> %retval
}
define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
%doublemask = bitcast <4 x i64> %mask to <4 x double>
%doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask)
%retval = bitcast <4 x double> %doubleval to <4 x i64>
ret <4 x i64> %retval
}
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(i8)
gen_masked_store(i16)
; note that the mask is the 2nd parameter of the maskstore intrinsics, not the 3rd one
;; avx intrinsics
declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i64>) nounwind alwaysinline {
%mask32 = trunc <4 x i64> %2 to <4 x i32>
%ptr = bitcast <4 x i32> * %0 to i8 *
%val = bitcast <4 x i32> %1 to <4 x float>
%mask = bitcast <4 x i32> %mask32 to <4 x float>
call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
ret void
}
define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
<4 x i64>) nounwind alwaysinline {
%ptr = bitcast <4 x i64> * %0 to i8 *
%val = bitcast <4 x i64> %1 to <4 x double>
%mask = bitcast <4 x i64> %2 to <4 x double>
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val)
ret void
}
masked_store_blend_8_16_by_4_mask64()
;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i64>) nounwind alwaysinline {
%mask = trunc <4 x i64> %2 to <4 x i32>
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
%oldValue = load <4 x i32>* %0, align 4
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
%newAsFloat = bitcast <4 x i32> %1 to <4 x float>
%blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
<4 x float> %newAsFloat,
<4 x float> %mask_as_float)
%blendAsInt = bitcast <4 x float> %blend to <4 x i32>
store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
ret void
}
;; avx intrinsic
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
<4 x double>) nounwind readnone
define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>,
<4 x i64>) nounwind alwaysinline {
%mask_as_double = bitcast <4 x i64> %2 to <4 x double>
%oldValue = load <4 x i64>* %0, align 4
%oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double>
%newAsDouble = bitcast <4 x i64> %1 to <4 x double>
%blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble,
<4 x double> %newAsDouble,
<4 x double> %mask_as_double)
%blendAsInt = bitcast <4 x double> %blend to <4 x i64>
store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
%call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
ret <4 x double> %call
}
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
%call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
ret <4 x double> %call
}

View File

@@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone
;declare float @llvm.sqrt.f32(float %Val)
declare double @llvm.sqrt.f64(double %Val)
declare float @llvm.sin.f32(float %Val)
declare float @llvm.asin.f32(float %Val)
declare float @llvm.cos.f32(float %Val)
declare float @llvm.sqrt.f32(float %Val)
declare float @llvm.exp.f32(float %Val)
@@ -471,6 +472,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
ret i64 %call
}
define i8 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x i8> %v, i32 0
ret i8 %r
}
define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x i16> %v, i32 0
ret i16 %r
}
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x float> %v, i32 0
@@ -642,7 +652,18 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline
declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline
declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_expd(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_logd(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_powd(<1 x float>, <1 x float>) nounwind readnone alwaysinline
define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
@@ -653,7 +674,18 @@ define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
}
define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.asin.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float,@llvm.asin.f32)
}
define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
@@ -664,18 +696,18 @@ define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
}
define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
; store <1 x float> %s, <1 x float> * %1
; ret void
%sin = call <1 x float> @__svml_sin (<1 x float> %0)
%cos = call <1 x float> @__svml_cos (<1 x float> %0)
%sin = call <1 x float> @__svml_sinf(<1 x float> %0)
%cos = call <1 x float> @__svml_cosf(<1 x float> %0)
store <1 x float> %sin, <1 x float> * %1
store <1 x float> %cos, <1 x float> * %2
ret void
}
define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
@@ -687,7 +719,7 @@ define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
ret <1 x float > %0
}
define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline {
; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
; ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
@@ -700,7 +732,7 @@ define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
}
define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
;%y = extractelement <1 x float> %0, i32 0
@@ -713,19 +745,19 @@ define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al
ret <1 x float > %0
}
define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.exp.f32)
}
define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.log.f32)
}
define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
%r = extractelement <1 x float> %0, i32 0
@@ -953,3 +985,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

View File

@@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
;; svml
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
;; svml
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
@@ -226,14 +220,16 @@ declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone
declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
@@ -244,7 +240,6 @@ declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
@@ -379,3 +374,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

517
builtins/target-neon-16.ll Normal file
View File

@@ -0,0 +1,517 @@
;;
;; target-neon-16.ll
;;
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`8')
define(`MASK',`i16')
include(`util.m4')
include(`target-neon-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
ret <8 x float> %r
}
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
ret <8 x i16> %r
}
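;; (the unary4to8conv helper used above applies the 4-wide NEON conversion
;;  intrinsic to each half of the 8-wide vector and concatenates the results)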
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
;; round/floor/ceil
;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
%bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i,
<i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i,
<float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <8 x float> %binop.i,
<float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
%bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
ret <8 x float> %int_to_float_bitcast.i.i.i
}
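;; (the add/subtract of 8.388608e+06 = 2^23 above forces rounding to an integer
;;  in float precision; the sign bit is masked off first and xor'd back in at
;;  the end so the trick also works for negative inputs)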
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i,
<i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i,
<i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}
;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
ret <WIDTH x float> %r
}
define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
ret <WIDTH x float> %r
}
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
;; sqrt/rsqrt/rcp
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
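;; The rcp above is the usual NEON Newton-Raphson refinement: vrecpe gives a
;; low-precision estimate x0 of 1/d, and each vrecps(d, x) step computes
;; (2 - d*x), so x(n+1) = x(n) * (2 - d*x(n)).  Two iterations bring the
;; estimate to roughly full single precision.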
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
%x0_2 = fmul <WIDTH x float> %x0, %x0
binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
%x1_2 = fmul <WIDTH x float> %x1, %x1
binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
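;; Same Newton-Raphson idea for rsqrt: vrsqrte gives an estimate x0 of
;; 1/sqrt(d), and vrsqrts(d, x*x) computes (3 - d*x*x)/2, so each step is
;; x(n+1) = x(n) * (3 - d*x(n)^2) / 2.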
define float @__rsqrt_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs)
%r = extractelement <8 x float> %vr, i32 0
ret float %r
}
define float @__rcp_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs)
%r = extractelement <8 x float> %vr, i32 0
ret float %r
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
unary4to8(result, float, @llvm.sqrt.v4f32, %0)
;; The rsqrt-based alternative below returns NaN for v = 0, which is undesirable:
;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
;; %result = fmul <WIDTH x float> %rsqrt, %0

ret <8 x float> %result
}
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
unary4to8(r, double, @llvm.sqrt.v4f64, %0)
ret <WIDTH x double> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone {
%and_mask = and <WIDTH x i16> %0,
<i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
%v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask)
%v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4)
%va = extractelement <2 x i64> %v2, i32 0
%vb = extractelement <2 x i64> %v2, i32 1
%v = or i64 %va, %vb
ret i64 %v
}
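;; __movmsk gives each active lane (all-ones i16) its bit weight 1..128 via the
;; and, then collapses the vector with widening pairwise adds (i16 -> i32 ->
;; i64).  The two resulting i64 lanes hold disjoint bit sets (elements 0-3 vs.
;; elements 4-7), so or'ing them is equivalent to adding them.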
define i1 @__any(<WIDTH x MASK>) nounwind readnone alwaysinline {
v8tov4(MASK, %0, %v0123, %v4567)
%vor = or <4 x MASK> %v0123, %v4567
%v0 = extractelement <4 x MASK> %vor, i32 0
%v1 = extractelement <4 x MASK> %vor, i32 1
%v2 = extractelement <4 x MASK> %vor, i32 2
%v3 = extractelement <4 x MASK> %vor, i32 3
%v01 = or MASK %v0, %v1
%v23 = or MASK %v2, %v3
%v = or MASK %v01, %v23
%cmp = icmp ne MASK %v, 0
ret i1 %cmp
}
define i1 @__all(<WIDTH x MASK>) nounwind readnone alwaysinline {
v8tov4(MASK, %0, %v0123, %v4567)
%vand = and <4 x MASK> %v0123, %v4567
%v0 = extractelement <4 x MASK> %vand, i32 0
%v1 = extractelement <4 x MASK> %vand, i32 1
%v2 = extractelement <4 x MASK> %vand, i32 2
%v3 = extractelement <4 x MASK> %vand, i32 3
%v01 = and MASK %v0, %v1
%v23 = and MASK %v2, %v3
%v = and MASK %v01, %v23
%cmp = icmp ne MASK %v, 0
ret i1 %cmp
}
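;; __any/__all fold the 8-wide mask by or'ing/and'ing the two 4-wide halves,
;; then the four remaining lanes, and finally compare the combined mask word
;; against zero; this works because every mask lane is either all-ones or zero.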
define i1 @__none(<WIDTH x MASK>) nounwind readnone alwaysinline {
%any = call i1 @__any(<WIDTH x MASK> %0)
%none = icmp eq i1 %any, 0
ret i1 %none
}
;; $1: scalar type
;; $2: vector/vector reduce function (2 x <WIDTH x vec> -> <WIDTH x vec>)
;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $4: scalar reduce function
define(`neon_reduce', `
v8tov4($1, %0, %v0123, %v4567)
%v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8)
%vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
v4tov2($1, %vfirst_4, %v0, %v1)
%vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1)
%vh0 = extractelement <2 x $1> %vh, i32 0
%vh1 = extractelement <2 x $1> %vh, i32 1
%r = call $1 $4($1 %vh0, $1 %vh1)
ret $1 %r
')
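;; neon_reduce first combines lanes i and i+4 with the full-width op $2 (the
;; halves are widened back to 8 lanes with undef upper elements so the 8-wide
;; op can be reused), then combines adjacent pairs of the surviving 4 lanes
;; with the 2-lane pairwise op $3, and finally reduces the last two values
;; with the scalar op $4.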
declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @add_f32(float, float) {
%r = fadd float %0, %1
ret float %r
}
define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) {
%r = fadd <WIDTH x float> %0, %1
ret <WIDTH x float> %r
}
define float @__reduce_add_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}
declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @min_f32(float, float) {
%cmp = fcmp olt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_min_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}
declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @max_f32(float, float) {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_max_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0)
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
%r16 = trunc i32 %r to i16
ret i16 %r16
}
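;; vpaddl{s,u} are widening pairwise adds: 8 x i8 -> 4 x i16 -> 2 x i32, with
;; a final scalar add and truncation back to i16.  Mixing the signed and
;; unsigned widening steps does not change the result here, since only the
;; low 16 bits survive the final truncation.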
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16>)
define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
%a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16> %0)
%a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1)
%aa = extractelement <2 x i64> %a2, i32 0
%ab = extractelement <2 x i64> %a2, i32 1
%r = add i64 %aa, %ab
ret i64 %r
}
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
v8tov4(i32, %0, %va, %vb)
%pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
%pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
%psum = add <2 x i64> %pa, %pb
%a0 = extractelement <2 x i64> %psum, i32 0
%a1 = extractelement <2 x i64> %psum, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_si32(i32, i32) {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_si32(i32, i32) {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}
declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_ui32(i32, i32) {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_ui32(i32, i32) {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}
define double @__reduce_add_double(<WIDTH x double>) nounwind readnone {
v8tov2(double, %0, %v0, %v1, %v2, %v3)
%v01 = fadd <2 x double> %v0, %v1
%v23 = fadd <2 x double> %v2, %v3
%sum = fadd <2 x double> %v01, %v23
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define double @__reduce_min_double(<WIDTH x double>) nounwind readnone {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<WIDTH x double>) nounwind readnone {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone {
v8tov2(i64, %0, %v0, %v1, %v2, %v3)
%v01 = add <2 x i64> %v0, %v1
%v23 = add <2 x i64> %v2, %v3
%sum = add <2 x i64> %v01, %v23
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16
declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
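;; The __avg_* functions above map to the NEON rounding and truncating halving
;; adds: vrhadd computes (a + b + 1) >> 1 and vhadd computes (a + b) >> 1, both
;; with a widened intermediate sum so the addition cannot overflow; the s/u
;; variants treat the operands as signed or unsigned.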

View File

@@ -1,5 +1,5 @@
;;
;; target-neon.ll
;; target-neon-32.ll
;;
;; Copyright(c) 2012-2013 Matt Pharr
;; Copyright(c) 2013 Google, Inc.
@@ -34,52 +34,20 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32"
define(`WIDTH',`4')
define(`MASK',`i32')
include(`util.m4')
stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
define_shuffles()
aossoa()
ctlztz()
include(`target-neon-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vec = shufflevector <1 x i16> %v1, <1 x i16> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec)
%r = extractelement <4 x float> %h, i32 0
ret float %r
}
define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone {
%r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v)
ret <4 x float> %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vec = shufflevector <1 x float> %v1, <1 x float> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec)
%r = extractelement <4 x i16> %h, i32 0
ret i16 %r
}
define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v)
ret <4 x i16> %r
@@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
define void @__fastmath() nounwind {
ret void
}
;; round/floor/ceil
;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
%bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
@@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin
}
;; FIXME: rounding doubles and double vectors needs to be implemented
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
@@ -175,78 +102,6 @@ declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readn
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
define float @__max_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__min_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ult float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define i32 @__min_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__max_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__min_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__max_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i64 @__min_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp slt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define i64 @__max_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp sgt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define i64 @__min_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ult i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define i64 @__max_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ugt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define double @__min_uniform_double(double, double) nounwind readnone {
%cmp = fcmp olt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}
define double @__max_uniform_double(double, double) nounwind readnone {
%cmp = fcmp ogt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
@@ -287,44 +142,6 @@ define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwin
ret <4 x i32> %r
}
define <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp slt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp sgt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ult <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ugt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x double> @__min_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp olt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}
define <WIDTH x double> @__max_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp ogt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}
;; sqrt/rsqrt/rcp
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
@@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone {
ret float %r
}
declare float @llvm.sqrt.f32(float)
define float @__sqrt_uniform_float(float) nounwind readnone {
%r = call float @llvm.sqrt.f32(float %0)
ret float %r
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
@@ -388,13 +198,6 @@ define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
ret <4 x float> %result
}
declare double @llvm.sqrt.f64(double)
define double @__sqrt_uniform_double(double) nounwind readnone {
%r = call double @llvm.sqrt.f64(double %0)
ret double %r
}
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
@@ -402,21 +205,6 @@ define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readno
ret <4 x double> %r
}
;; bit ops
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i32 @__popcnt_int32(i32) nounwind readnone {
%v = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %v
}
define i64 @__popcnt_int64(i64) nounwind readnone {
%v = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
@@ -509,15 +297,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone {
neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}
define internal i32 @add_i32(i32, i32) {
%r = add i32 %0, %1
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
%a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8)
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
%r16 = trunc i32 %r to i16
ret i16 %r16
}
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
define i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
ret i32 %r
}
declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
define i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32)
define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0)
%a0 = extractelement <2 x i64> %a64, i32 0
%a1 = extractelement <2 x i64> %a64, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
@@ -617,90 +428,60 @@ define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
;; int8/int16
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture %ptr, <WIDTH x i8> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i8> * %ptr
%mask1 = trunc <4 x MASK> %mask to <4 x i1>
%result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old
store <WIDTH x i8> %result, <WIDTH x i8> * %ptr
ret void
define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture %ptr, <WIDTH x i16> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i16> * %ptr
%mask1 = trunc <4 x MASK> %mask to <4 x i1>
%result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old
store <WIDTH x i16> %result, <WIDTH x i16> * %ptr
ret void
declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture %ptr, <WIDTH x i32> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i32> * %ptr
%mask1 = trunc <4 x MASK> %mask to <4 x i1>
%result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old
store <WIDTH x i32> %result, <WIDTH x i32> * %ptr
ret void
declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
<WIDTH x i64> %new, <WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i64> * %ptr
%mask1 = trunc <4 x MASK> %mask to <4 x i1>
%result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old
store <WIDTH x i64> %result, <WIDTH x i64> * %ptr
ret void
declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
;; yuck. We need declarations of these, even though we shouldn't ever
;; actually generate calls to them for the NEON target...
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
packed_load_and_store(4)
define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch
declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define_prefetches()
define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}

builtins/target-neon-8.ll (new file, 583 lines)
View File

@@ -0,0 +1,583 @@
;;
;; target-neon-8.ll
;;
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`16')
define(`MASK',`i8')
include(`util.m4')
include(`target-neon-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
ret <16 x float> %r
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
ret <16 x i16> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
;; round/floor/ceil
;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?
define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32>
%bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i,
<i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float>
%binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i,
<float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <16 x float> %binop.i,
<float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32>
%bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float>
ret <16 x float> %int_to_float_bitcast.i.i.i
}
define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind
%bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32>
%bitop.i = and <16 x i32> %val_to_boolvec32.i,
<i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float>
%binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <16 x float> %binop.i
}
define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind
%bincmp.i = fcmp olt <16 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32>
%bitop.i = and <16 x i32> %val_to_boolvec32.i,
<i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float>
%binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <16 x float> %binop.i
}
;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
ret <WIDTH x float> %r
}
define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
ret <WIDTH x float> %r
}
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
;; sqrt/rsqrt/rcp
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
%x0_2 = fmul <WIDTH x float> %x0, %x0
binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
%x1_2 = fmul <WIDTH x float> %x1, %x1
binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
define float @__rsqrt_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs)
%r = extractelement <16 x float> %vr, i32 0
ret float %r
}
define float @__rcp_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs)
%r = extractelement <16 x float> %vr, i32 0
ret float %r
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
unary4to16(result, float, @llvm.sqrt.v4f32, %0)
;; The rsqrt-based alternative below returns NaN for v = 0, which is undesirable:
;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
;; %result = fmul <WIDTH x float> %rsqrt, %0
ret <16 x float> %result
}
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
unary4to16(r, double, @llvm.sqrt.v4f64, %0)
ret <WIDTH x double> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone {
%and_mask = and <WIDTH x i8> %0,
<i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128,
i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128>
%v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask)
%v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8)
%v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4)
%va = extractelement <2 x i64> %v2, i32 0
%vb = extractelement <2 x i64> %v2, i32 1
%vbshift = shl i64 %vb, 8
%v = or i64 %va, %vbshift
ret i64 %v
}
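;; Each active lane (all-ones i8) keeps its bit weight 1..128 after the and;
;; the widening pairwise adds (i8 -> i16 -> i32 -> i64) then collapse the
;; vector into two i64 lanes holding the bits for elements 0-7 and 8-15
;; respectively, so the second lane is shifted left by 8 before the or.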
define i1 @__any(<WIDTH x MASK>) nounwind readnone alwaysinline {
v16tov8(MASK, %0, %v8a, %v8b)
%vor8 = or <8 x MASK> %v8a, %v8b
%v16 = sext <8 x i8> %vor8 to <8 x i16>
v8tov4(i16, %v16, %v16a, %v16b)
%vor16 = or <4 x i16> %v16a, %v16b
%v32 = sext <4 x i16> %vor16 to <4 x i32>
v4tov2(i32, %v32, %v32a, %v32b)
%vor32 = or <2 x i32> %v32a, %v32b
%v0 = extractelement <2 x i32> %vor32, i32 0
%v1 = extractelement <2 x i32> %vor32, i32 1
%v = or i32 %v0, %v1
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<WIDTH x MASK>) nounwind readnone alwaysinline {
v16tov8(MASK, %0, %v8a, %v8b)
%vand8 = and <8 x MASK> %v8a, %v8b
%v16 = sext <8 x i8> %vand8 to <8 x i16>
v8tov4(i16, %v16, %v16a, %v16b)
%vand16 = and <4 x i16> %v16a, %v16b
%v32 = sext <4 x i16> %vand16 to <4 x i32>
v4tov2(i32, %v32, %v32a, %v32b)
%vand32 = and <2 x i32> %v32a, %v32b
%v0 = extractelement <2 x i32> %vand32, i32 0
%v1 = extractelement <2 x i32> %vand32, i32 1
%v = and i32 %v0, %v1
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
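;; __any/__all above repeatedly halve the mask vector, combining the halves
;; with or/and and sign-extending the lanes at each step, until two i32 lanes
;; remain; those are combined and compared against zero.  This relies on every
;; mask lane being either all-ones or zero.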
define i1 @__none(<WIDTH x MASK>) nounwind readnone alwaysinline {
%any = call i1 @__any(<WIDTH x MASK> %0)
%none = icmp eq i1 %any, 0
ret i1 %none
}
;; $1: scalar type
;; $2: vector/vector reduce function (2 x <WIDTH x vec> -> <WIDTH x vec>)
;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $4: scalar reduce function
define(`neon_reduce', `
v16tov8($1, %0, %va, %vb)
%va_16 = shufflevector <8 x $1> %va, <8 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16)
%v8a = shufflevector <16 x $1> %v8, <16 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v8b = shufflevector <16 x $1> %v8, <16 x $1> undef,
<16 x i32> <i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b)
%vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
v4tov2($1, %vfirst_4, %v0, %v1)
%vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1)
%vh0 = extractelement <2 x $1> %vh, i32 0
%vh1 = extractelement <2 x $1> %vh, i32 1
%r = call $1 $4($1 %vh0, $1 %vh1)
ret $1 %r
')
declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @add_f32(float, float) {
%r = fadd float %0, %1
ret float %r
}
define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) {
%r = fadd <WIDTH x float> %0, %1
ret <WIDTH x float> %r
}
define float @__reduce_add_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}
declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @min_f32(float, float) {
%cmp = fcmp olt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_min_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}
declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @max_f32(float, float) {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_max_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}
declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
define i64 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0)
%a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16)
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
%a0 = extractelement <2 x i64> %a64, i32 0
%a1 = extractelement <2 x i64> %a64, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
v16tov8(i16, %0, %va, %vb)
%a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va)
%b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb)
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
%b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32)
%sum = add <2 x i64> %a64, %b64
%a0 = extractelement <2 x i64> %sum, i32 0
%a1 = extractelement <2 x i64> %sum, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
v16tov4(i32, %0, %va, %vb, %vc, %vd)
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
%b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
%c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc)
%d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd)
%ab = add <2 x i64> %a64, %b64
%cd = add <2 x i64> %c64, %d64
%sum = add <2 x i64> %ab, %cd
%a0 = extractelement <2 x i64> %sum, i32 0
%a1 = extractelement <2 x i64> %sum, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_si32(i32, i32) {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_si32(i32, i32) {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}
declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_ui32(i32, i32) {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_ui32(i32, i32) {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}
define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}
define internal <WIDTH x double> @__add_varying_double(<WIDTH x double>, <WIDTH x double>) {
%r = fadd <WIDTH x double> %0, %1
ret <WIDTH x double> %r
}
define double @__reduce_add_double(<WIDTH x double>) nounwind readnone {
reduce16(double, @__add_varying_double, @__add_uniform_double)
}
define double @__reduce_min_double(<WIDTH x double>) nounwind readnone {
reduce16(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<WIDTH x double>) nounwind readnone {
reduce16(double, @__max_varying_double, @__max_uniform_double)
}
define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}
define internal <WIDTH x i64> @__add_varying_int64(<WIDTH x i64>, <WIDTH x i64>) {
%r = add <WIDTH x i64> %0, %1
ret <WIDTH x i64> %r
}
define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}

View File

@@ -0,0 +1,346 @@
;;
;; target-neon-common.ll
;;
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32"
stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
define_shuffles()
aossoa()
ctlztz()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vec = shufflevector <1 x i16> %v1, <1 x i16> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec)
%r = extractelement <4 x float> %h, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vec = shufflevector <1 x float> %v1, <1 x float> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec)
%r = extractelement <4 x i16> %h, i32 0
ret i16 %r
}
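;; The NEON half<->float conversions only operate on 4-wide vectors, so the
;; uniform versions splat the scalar into a vector, convert, and extract
;; lane 0.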
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
define void @__fastmath() nounwind {
ret void
}
;; round/floor/ceil
;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;; FIXME: rounding doubles and double vectors needs to be implemented
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
define float @__max_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__min_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ult float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define i32 @__min_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__max_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__min_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__max_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i64 @__min_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp slt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define i64 @__max_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp sgt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define i64 @__min_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ult i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define i64 @__max_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ugt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define double @__min_uniform_double(double, double) nounwind readnone {
%cmp = fcmp olt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}
define double @__max_uniform_double(double, double) nounwind readnone {
%cmp = fcmp ogt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}
define <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp slt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp sgt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ult <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ugt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x double> @__min_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp olt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}
define <WIDTH x double> @__max_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp ogt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}
;; sqrt/rsqrt/rcp
declare float @llvm.sqrt.f32(float)
define float @__sqrt_uniform_float(float) nounwind readnone {
%r = call float @llvm.sqrt.f32(float %0)
ret float %r
}
declare double @llvm.sqrt.f64(double)
define double @__sqrt_uniform_double(double) nounwind readnone {
%r = call double @llvm.sqrt.f64(double %0)
ret double %r
}
;; bit ops
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i32 @__popcnt_int32(i32) nounwind readnone {
%v = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %v
}
define i64 @__popcnt_int64(i64) nounwind readnone {
%v = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture %ptr, <WIDTH x i8> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i8> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i8> %new, <WIDTH x i8> %old
store <WIDTH x i8> %result, <WIDTH x i8> * %ptr
ret void
}
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture %ptr, <WIDTH x i16> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i16> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i16> %new, <WIDTH x i16> %old
store <WIDTH x i16> %result, <WIDTH x i16> * %ptr
ret void
}
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture %ptr, <WIDTH x i32> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i32> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i32> %new, <WIDTH x i32> %old
store <WIDTH x i32> %result, <WIDTH x i32> * %ptr
ret void
}
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
<WIDTH x i64> %new, <WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i64> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i64> %new, <WIDTH x i64> %old
store <WIDTH x i64> %result, <WIDTH x i64> * %ptr
ret void
}
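;; The blend stores above are a read-modify-write: load the old vector,
;; select per lane between the new and old values according to the mask, and
;; store the blended result back.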
;; Yuck. We need declarations of these, even though we shouldn't ever
;; actually generate calls to them for the NEON target...
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
packed_load_and_store(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch
define_prefetches()


@@ -5,6 +5,10 @@ define(`WIDTH',`1')
include(`util.m4')
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
; Define some basics for a 1-wide target
stdlib_core()
packed_load_and_store()
@@ -467,6 +471,9 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
@@ -643,103 +650,6 @@ define <1 x double> @__rsqrt_varying_double(<1 x double> %v) nounwind readonly
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.sin.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float,@llvm.sin.f32)
}
define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.cos.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float, @llvm.cos.f32)
}
define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
; store <1 x float> %s, <1 x float> * %1
; ret void
%sin = call <1 x float> @__svml_sin (<1 x float> %0)
%cos = call <1 x float> @__svml_cos (<1 x float> %0)
store <1 x float> %sin, <1 x float> * %1
store <1 x float> %cos, <1 x float> * %2
ret void
}
define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_tan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unasry1to1(float, @llvm.tan.f32)
; UNSUPPORTED: no suitable intrinsic is available here, so the input is
; returned unchanged as a placeholder.
ret <1 x float> %0
}
define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
; ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_atan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unsary1to1(float,@llvm.atan.f32)
; UNSUPPORTED: no suitable intrinsic is available here, so the input is
; returned unchanged as a placeholder.
ret <1 x float> %0
}
define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
;%y = extractelement <1 x float> %0, i32 0
;%x = extractelement <1 x float> %1, i32 0
;%q = fdiv float %y, %x
;%a = call float @llvm.atan.f32 (float %q)
;%rv = insertelement <1 x float> undef, float %a, i32 0
;ret <1 x float> %rv
; UNSUPPORTED: no suitable intrinsic is available here, so the input is
; returned unchanged as a placeholder.
ret <1 x float> %0
}
define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.exp.f32)
}
define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.log.f32)
}
define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
%r = extractelement <1 x float> %0, i32 0
%e = extractelement <1 x float> %1, i32 0
%s = call float @llvm.pow.f32(float %r,float %e)
%rv = insertelement <1 x float> undef, float %s, i32 0
ret <1 x float> %rv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
@@ -957,3 +867,8 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()


@@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
ret i64 %val
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()


@@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define_x(float,f4,4,f,8)
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_sinf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_cosf4, %0)
ret <8 x float> %ret
}
define void @__svml_sincos(<8 x float>, <8 x float> *,
<8 x float> *) nounwind readnone alwaysinline {
; call svml_sincosf4 two times with the two 4-wide sub-vectors
%a = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%b = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%cospa = alloca <4 x float>
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
%cospb = alloca <4 x float>
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %sin, <8 x float> * %1
%cosa = load <4 x float> * %cospa
%cosb = load <4 x float> * %cospb
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %cos, <8 x float> * %2
ret void
}
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_tanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_atanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan2(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
ret <8 x float> %ret
}
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_expf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_logf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_pow(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_powf4, %0, %1)
ret <8 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -367,6 +294,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
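;; psadbw computes sums of absolute differences of the 16 byte lanes against
;; zero, producing one 64-bit partial sum per 8-byte half, so adding the two
;; halves yields the horizontal sum of the input bytes; lanes 8..15 of the
;; shuffled vector come from the zero operand and contribute nothing.  A
;; hedged intrinsics sketch of the same reduction (illustrative only):
;;
;;    #include <emmintrin.h>
;;
;;    /* v holds the 8 input bytes in lanes 0..7, zeros in lanes 8..15 */
;;    static unsigned short reduce_add_u8(__m128i v) {
;;        __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128());
;;        return (unsigned short)(_mm_cvtsi128_si32(sad) +
;;                                _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
;;    }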
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,
<4 x float> %v1) nounwind readnone alwaysinline {
%v = fadd <4 x float> %v0, %v1


@@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -466,62 +496,15 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,4)
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

builtins/target-sse4-16.ll

@@ -0,0 +1,490 @@
;; Copyright (c) 2013, Google, Inc.
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Google, Inc. nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define common 8-wide stuff
define(`WIDTH',`8')
define(`MASK',`i16')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readonly alwaysinline {
unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <8 x float> %0, %call
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <8 x float> %call, %two_minus
ret <8 x float> %iv_mul
}
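;; One Newton-Raphson step for f(x) = 1/x - v is x' = x * (2 - v * x); it
;; roughly doubles the number of correct bits in the ~12-bit rcpps estimate.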
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <8 x float> %v, %is
%v_is_is = fmul <8 x float> %v_is, %is
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <8 x float> %is, %three_sub
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <8 x float> %half_scale
}
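;; One Newton-Raphson step for f(x) = 1/x^2 - v is x' = 0.5 * x * (3 - v * x * x),
;; again refining the ~12-bit rsqrtps estimate.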
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind
alwaysinline {
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
round4to8(%0, 8)
}
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to8(%0, 9)
}
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to8(%0, 10)
}
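;; In intrinsics terms (a hedged view; smmintrin.h), the immediates above are
;;    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC  ==  8   (round)
;;    _MM_FROUND_TO_NEG_INF     | _MM_FROUND_NO_EXC  ==  9   (floor)
;;    _MM_FROUND_TO_POS_INF     | _MM_FROUND_NO_EXC  == 10   (ceil)
;; e.g. _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) emits
;; roundps with immediate 9.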
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 8)
}
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 9)
}
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 10)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <8 x float> %call
}
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %call
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %call
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone {
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <8 x double> %ret
}
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone {
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
; FIXME
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline {
%m8 = trunc <8 x MASK> %0 to <8 x i8>
%mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8)
%m64 = zext i32 %m to i64
ret i64 %m64
}
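;; pmovmskb collects the most significant bit of each of the 16 byte lanes.
;; The 8-wide i16 mask is truncated to bytes and widened to 16 lanes with
;; zeros, so only the low 8 bits of the returned mask can ever be set.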
define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline {
%m = call i64 @__movmsk(<8 x MASK> %0)
%mne = icmp ne i64 %m, 0
ret i1 %mne
}
define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline {
%m = call i64 @__movmsk(<8 x MASK> %0)
%meq = icmp eq i64 %m, ALL_ON_MASK
ret i1 %meq
}
define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline {
%m = call i64 @__movmsk(<8 x MASK> %0)
%meq = icmp eq i64 %m, 0
ret i1 %meq
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) {
%r = fadd <8 x float> %0, %1
ret <8 x float> %r
}
define internal float @__add_uniform_float(float, float) {
%r = fadd float %0, %1
ret float %r
}
define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
reduce8(float, @__add_varying_float, @__add_uniform_float)
}
define float @__reduce_min_float(<8 x float>) nounwind readnone {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<8 x float>) nounwind readnone {
reduce8(float, @__max_varying_float, @__max_uniform_float)
}
define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) {
%r = add <8 x i32> %0, %1
ret <8 x i32> %r
}
define internal i32 @__add_uniform_int32(i32, i32) {
%r = add i32 %0, %1
ret i32 %r
}
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone {
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
}
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone {
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) {
%r = fadd <8 x double> %0, %1
ret <8 x double> %r
}
define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}
define double @__reduce_add_double(<8 x double>) nounwind readnone {
reduce8(double, @__add_varying_double, @__add_uniform_double)
}
define double @__reduce_min_double(<8 x double>) nounwind readnone {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<8 x double>) nounwind readnone {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) {
%r = add <8 x i64> %0, %1
ret <8 x i64> %r
}
define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>,
<8 x MASK> %mask) nounwind
alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i64>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old
store <8 x i64> %blend, <8 x i64>* %0, align 4
ret void
}
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i32>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old
store <8 x i32> %blend, <8 x i32>* %0, align 4
ret void
}
define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>,
<8 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i16>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old
store <8 x i16> %blend, <8 x i16>* %0, align 4
ret void
}
define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
<8 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i8>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old
store <8 x i8> %blend, <8 x i8>* %0, align 4
ret void
}
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) {
%v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1)
%r = shufflevector <16 x i8> %r16, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) {
%r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
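;; pavgb/pavgw compute the unsigned rounded-up average (a + b + 1) >> 1,
;; which is exactly the __avg_up semantics; the i8 version widens its operands
;; to 16 lanes (upper lanes undef), averages, and narrows the result back.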
define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()

builtins/target-sse4-8.ll

@@ -0,0 +1,492 @@
;; Copyright (c) 2013, Google, Inc.
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Google, Inc. nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define common 16-wide stuff
define(`WIDTH',`16')
define(`MASK',`i8')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readonly alwaysinline {
unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <16 x float> %0, %call
%two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <16 x float> %call, %two_minus
ret <16 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <16 x float> %v, %is
%v_is_is = fmul <16 x float> %v_is, %is
%three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <16 x float> %is, %three_sub
%half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <16 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <16 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind
alwaysinline {
unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <16 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
round4to16(%0, 8)
}
define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to16(%0, 9)
}
define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to16(%0, 10)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
; XXXround2to4double(%0, 8)
; FIXME: need round2to16double in util.m4...
ret <16 x double> undef
}
define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
; XXXround2to4double(%0, 9)
ret <16 x double> undef
}
define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
; XXXround2to4double(%0, 10)
ret <16 x double> undef
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <16 x float> %call
}
define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <16 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %call
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %call
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone {
binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <16 x double> %ret
}
define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone {
binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <16 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
; FIXME
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define i64 @__movmsk(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%m64 = zext i32 %m to i64
ret i64 %m64
}
define i1 @__any(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%mne = icmp ne i32 %m, 0
ret i1 %mne
}
define i1 @__all(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%meq = icmp eq i32 %m, ALL_ON_MASK
ret i1 %meq
}
define i1 @__none(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%meq = icmp eq i32 %m, 0
ret i1 %meq
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <16 x i16> @__add_varying_i16(<16 x i16>,
<16 x i16>) nounwind readnone alwaysinline {
%r = add <16 x i16> %0, %1
ret <16 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}
define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) {
%r = fadd <16 x float> %0, %1
ret <16 x float> %r
}
define internal float @__add_uniform_float(float, float) {
%r = fadd float %0, %1
ret float %r
}
define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
reduce16(float, @__add_varying_float, @__add_uniform_float)
}
define float @__reduce_min_float(<16 x float>) nounwind readnone {
reduce16(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<16 x float>) nounwind readnone {
reduce16(float, @__max_varying_float, @__max_uniform_float)
}
define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) {
%r = add <16 x i32> %0, %1
ret <16 x i32> %r
}
define internal i32 @__add_uniform_int32(i32, i32) {
%r = add i32 %0, %1
ret i32 %r
}
define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
}
define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone {
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone {
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) {
%r = fadd <16 x double> %0, %1
ret <16 x double> %r
}
define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}
define double @__reduce_add_double(<16 x double>) nounwind readnone {
reduce16(double, @__add_varying_double, @__add_uniform_double)
}
define double @__reduce_min_double(<16 x double>) nounwind readnone {
reduce16(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<16 x double>) nounwind readnone {
reduce16(double, @__max_varying_double, @__max_uniform_double)
}
define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) {
%r = add <16 x i64> %0, %1
ret <16 x i64> %r
}
define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}
define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone {
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>,
<16 x MASK> %mask) nounwind
alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i64>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old
store <16 x i64> %blend, <16 x i64>* %0, align 4
ret void
}
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i32>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old
store <16 x i32> %blend, <16 x i32>* %0, align 4
ret void
}
define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
<16 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i16>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old
store <16 x i16> %blend, <16 x i16>* %0, align 4
ret void
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
<16 x MASK> %mask) nounwind alwaysinline {
%old = load <16 x i8>* %0, align 4
%blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1,
<16 x i8> %mask)
store <16 x i8> %blend, <16 x i8>* %0, align 4
ret void
}
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()


@@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define_x(float,f4,4,f,8)
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_sinf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_cosf4, %0)
ret <8 x float> %ret
}
define void @__svml_sincos(<8 x float>, <8 x float> *,
<8 x float> *) nounwind readnone alwaysinline {
; call svml_sincosf4 two times with the two 4-wide sub-vectors
%a = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%b = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%cospa = alloca <4 x float>
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
%cospb = alloca <4 x float>
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %sin, <8 x float> * %1
%cosa = load <4 x float> * %cospa
%cosb = load <4 x float> * %cospb
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %cos, <8 x float> * %2
ret void
}
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_tanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_atanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan2(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
ret <8 x float> %ret
}
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_expf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_logf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_pow(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_powf4, %0, %1)
ret <8 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -309,6 +236,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
}
@@ -629,3 +586,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()


@@ -209,62 +209,14 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
@@ -299,6 +251,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
@@ -503,3 +485,9 @@ gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

View File

@@ -49,6 +49,63 @@ define(`MASK_HIGH_BIT_ON',
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector deconstruction utilities
;; split 8-wide vector into 2 4-wide vectors
;;
;; $1: vector element type
;; $2: 8-wide vector
;; $3: first 4-wide vector
;; $4: second 4-wide vector
define(`v8tov4', `
$3 = shufflevector <8 x $1> $2, <8 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
$4 = shufflevector <8 x $1> $2, <8 x $1> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
')
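;; As an illustration (hypothetical names, not part of the source), the call
;; v8tov4(float, %v, %lo, %hi) expands to:
;;   %lo = shufflevector <8 x float> %v, <8 x float> undef,
;;                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;;   %hi = shufflevector <8 x float> %v, <8 x float> undef,
;;                       <4 x i32> <i32 4, i32 5, i32 6, i32 7>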
define(`v16tov8', `
$3 = shufflevector <16 x $1> $2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
$4 = shufflevector <16 x $1> $2, <16 x $1> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
define(`v4tov2', `
$3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 0, i32 1>
$4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 2, i32 3>
')
define(`v8tov2', `
$3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 0, i32 1>
$4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 2, i32 3>
$5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 4, i32 5>
$6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 6, i32 7>
')
define(`v16tov4', `
$3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
$4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
$5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
$6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector assembly: wider vector from two narrower vectors
;;
;; $1: vector element type
;; $2: first n-wide vector
;; $3: second n-wide vector
;; $4: result 2*n-wide vector
define(`v8tov16', `
$4 = shufflevector <8 x $1> $2, <8 x $1> $3,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
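;; Sketch with hypothetical names: v8tov16(i32, %lo, %hi, %r) yields a single
;; <16 x i32> %r whose lanes 0-7 come from %lo and lanes 8-15 from %hi.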
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Helper macro for calling various SSE instructions for scalar values
;; but where the instruction takes a vector parameter.
;; $1 : name of variable to put the final value in
@@ -156,10 +213,7 @@ define(`reduce16', `
;; the final reduction
define(`reduce8by4', `
%v1 = shufflevector <8 x $1> %0, <8 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2 = shufflevector <8 x $1> %0, <8 x $1> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
v8tov4($1, %0, %v1, %v2)
%m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2)
%v3 = shufflevector <4 x $1> %m1, <4 x $1> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -266,30 +320,66 @@ define(`binary2to4', `
;; $4: 8-wide operand value
define(`unary4to8', `
%$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
%$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
%$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
%__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0)
%__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1)
%$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
'
)
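;; Typical use (see the svml wrappers above): unary4to8(ret, float, @__svml_logf4, %0)
;; splits the 8-wide argument into two <4 x float> halves, applies the 4-wide
;; function to each, and reassembles the results into the 8-wide %ret.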
;; $1: name of variable into which the final result should go
;; $2: scalar type of the input vector elements
;; $3: scalar type of the result vector elements
;; $4: 4-wide unary vector function to apply
;; $5: 8-wide operand value
define(`unary4to8conv', `
%$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0)
%$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1)
%$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
'
)
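;; Sketch with a hypothetical conversion function @__cvt4 mapping <4 x float>
;; to <4 x i32>: unary4to8conv(r, float, i32, @__cvt4, %x) converts both halves
;; of the 8-wide %x and produces %r as an <8 x i32>.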
define(`unary4to16', `
%$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
%$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
%$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2)
%$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3)
%__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0)
%__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1)
%__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2)
%__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3)
%$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
%__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3,
%__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b,
%$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
)
define(`unary4to16conv', `
%$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0)
%$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1)
%$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2)
%$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3)
%$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
@@ -411,6 +501,42 @@ define(`unary2to8', `
'
)
define(`unary2to16', `
%$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 0, i32 1>
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0)
%$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 2, i32 3>
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1)
%$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 4, i32 5>
%v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2)
%$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 6, i32 7>
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3)
%$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 8, i32 9>
%v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4)
%$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 10, i32 11>
%v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5)
%$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 12, i32 13>
%v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6)
%$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 14, i32 15>
%v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7)
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
)
;; Maps an 2-wide binary function to two 8-wide vector operands
;; $1: name of variable into which the final result should go
;; $2: scalar type of the vector elements
@@ -432,12 +558,58 @@ define(`binary2to8', `
%$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 6, i32 7>
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b)
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
'
)
define(`binary2to16', `
%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 0, i32 1>
%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 0, i32 1>
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b)
%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 2, i32 3>
%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 2, i32 3>
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b)
%$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 4, i32 5>
%$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 4, i32 5>
%v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b)
%$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 6, i32 7>
%$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 6, i32 7>
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b)
%$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 8, i32 9>
%$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 8, i32 9>
%v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b)
%$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 10, i32 11>
%$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 10, i32 11>
%v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b)
%$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 12, i32 13>
%$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 12, i32 13>
%v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b)
%$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 14, i32 15>
%$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 14, i32 15>
%v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b)
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
%$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
)
@@ -460,6 +632,26 @@ ret <8 x float> %ret
'
)
define(`round4to16', `
%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2)
%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2)
%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2)
%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2)
%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %ret
'
)
define(`round8to16', `
%v0 = shufflevector <16 x float> $1, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -690,6 +882,91 @@ shuffles(i64, 8)
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
define(`mask_converts', `
define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i64>
ret <$1 x i64> %r
}
define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) {
ret <$1 x i8> %0
}
define internal <$1 x i16> @convertmask_i8_i16_$1(<$1 x i8>) {
%r = sext <$1 x i8> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) {
%r = sext <$1 x i8> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) {
%r = sext <$1 x i8> %0 to <$1 x i64>
ret <$1 x i64> %r
}
define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) {
%r = trunc <$1 x i16> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) {
ret <$1 x i16> %0
}
define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) {
%r = sext <$1 x i16> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) {
%r = sext <$1 x i16> %0 to <$1 x i64>
ret <$1 x i64> %r
}
define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) {
%r = trunc <$1 x i32> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) {
%r = trunc <$1 x i32> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) {
ret <$1 x i32> %0
}
define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
%r = sext <$1 x i32> %0 to <$1 x i64>
ret <$1 x i64> %r
}
define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) {
%r = trunc <$1 x i64> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) {
%r = trunc <$1 x i64> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) {
%r = trunc <$1 x i64> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
ret <$1 x i64> %0
}
')
mask_converts(WIDTH)
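;; The convertmask_* helper that runs is picked by name at the call site below;
;; for instance, with an i1 mask, i32 atomic elements, and an 8-wide target the
;; call resolves to @convertmask_i1_i32_8, which sign-extends the mask lanes.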
define(`global_atomic_associative', `
define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
@@ -697,17 +974,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
; first, for any lanes where the mask is off, compute a vector where those lanes
; hold the identity value.
; for the bit tricks below, we need the mask to be sign extended to be
; the size of the element type.
ifelse(
MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>',
$3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>',
$3,i32, `
; silly workaround to do %mask = %m, which is not possible directly..
%maskmem = alloca <$1 x i32>
store <$1 x i32> %m, <$1 x i32> * %maskmem
%mask = load <$1 x i32> * %maskmem'
)
; for the bit tricks below, we need the mask to have the
; same element size as the element type.
%mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m)
; zero out any lanes that are off
%valoff = and <$1 x $3> %val, %mask
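; the sign-extended mask is all ones for active lanes and all zeros for
; inactive ones, so this AND keeps %val in active lanes and clears the rest,
; leaving the cleared lanes to take the identity value described above.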
@@ -1551,11 +1821,6 @@ declare i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %mask)
declare i1 @__is_compile_time_constant_uniform_int32(i32)
declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
define void @__pause() nounwind readnone {
call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind
ret void
}
; This function declares placeholder masked store functions for the
; front-end to use.
;
@@ -2440,13 +2705,16 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
}
define <WIDTH x i32> @__sext_varying_bool(<WIDTH x MASK>) nounwind readnone alwaysinline {
ifelse(MASK,i1, `
%se = sext <WIDTH x i1> %0 to <WIDTH x i32>
;; ifelse(MASK,i32, `ret <WIDTH x i32> %0',
;; `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
;; ret <WIDTH x i32> %se')
ifelse(MASK,i32, `%se = bitcast <WIDTH x i32> %0 to <WIDTH x i32>',
MASK,i64, `%se = trunc <WIDTH x MASK> %0 to <WIDTH x i32>',
`%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>')
ret <WIDTH x i32> %se
', `
ret <WIDTH x i32> %0')
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; memcpy/memmove/memset
@@ -2830,17 +3098,11 @@ m4exit(`1')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; read hw clock
declare i64 @llvm.readcyclecounter()
define i64 @__clock() nounwind {
entry:
tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
%0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
%asmresult = extractvalue { i32, i32 } %0, 0
%asmresult1 = extractvalue { i32, i32 } %0, 1
%conv = zext i32 %asmresult1 to i64
%shl = shl nuw i64 %conv, 32
%conv2 = zext i32 %asmresult to i64
%or = or i64 %shl, %conv2
ret i64 %or
%r = call i64 @llvm.readcyclecounter()
ret i64 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2918,6 +3180,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
}
declare double @sin(double) nounwind readnone
declare double @asin(double) nounwind readnone
declare double @cos(double) nounwind readnone
declare void @sincos(double, double *, double *) nounwind readnone
declare double @tan(double) nounwind readnone
@@ -2932,6 +3195,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline {
ret double %r
}
define double @__stdlib_asin(double) nounwind readnone alwaysinline {
%r = call double @asin(double %0)
ret double %r
}
define double @__stdlib_cos(double) nounwind readnone alwaysinline {
%r = call double @cos(double %0)
ret double %r
@@ -3201,8 +3469,8 @@ return:
;; $1: llvm type of elements (and suffix for function name)
define(`gen_masked_store', `
define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x i32>) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x i32> %2, `
define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x MASK>) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %2, `
%ptr_LANE_ID = getelementptr <WIDTH x $1> * %0, i32 0, i32 LANE
%storeval_LANE_ID = extractelement <WIDTH x $1> %1, i32 LANE
store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID')
@@ -3260,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
}
')
define(`masked_store_blend_8_16_by_4_mask64', `
define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
<4 x i64>) nounwind alwaysinline {
%old = load <4 x i8> * %0, align 1
ifelse(LLVM_VERSION,LLVM_3_0,`
%old32 = bitcast <4 x i8> %old to i32
%new32 = bitcast <4 x i8> %1 to i32
%mask8 = trunc <4 x i64> %2 to <4 x i8>
%mask32 = bitcast <4 x i8> %mask8 to i32
%notmask32 = xor i32 %mask32, -1
%newmasked = and i32 %new32, %mask32
%oldmasked = and i32 %old32, %notmask32
%result = or i32 %newmasked, %oldmasked
%resultvec = bitcast i32 %result to <4 x i8>
',`
%m = trunc <4 x i64> %2 to <4 x i1>
%resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
')
store <4 x i8> %resultvec, <4 x i8> * %0, align 1
ret void
}
define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
<4 x i64>) nounwind alwaysinline {
%old = load <4 x i16> * %0, align 2
ifelse(LLVM_VERSION,LLVM_3_0,`
%old64 = bitcast <4 x i16> %old to i64
%new64 = bitcast <4 x i16> %1 to i64
%mask16 = trunc <4 x i64> %2 to <4 x i16>
%mask64 = bitcast <4 x i16> %mask16 to i64
%notmask64 = xor i64 %mask64, -1
%newmasked = and i64 %new64, %mask64
%oldmasked = and i64 %old64, %notmask64
%result = or i64 %newmasked, %oldmasked
%resultvec = bitcast i64 %result to <4 x i16>
',`
%m = trunc <4 x i64> %2 to <4 x i1>
%resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
')
store <4 x i16> %resultvec, <4 x i16> * %0, align 2
ret void
}
')
define(`masked_store_blend_8_16_by_8', `
define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
<8 x i32>) nounwind alwaysinline {
@@ -3378,10 +3696,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
define(`packed_load_and_store', `
define i32 @__packed_load_active(i32 * %startptr, <WIDTH x i32> * %val_ptr,
<WIDTH x i32> %full_mask) nounwind alwaysinline {
<WIDTH x MASK> %full_mask) nounwind alwaysinline {
entry:
%mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
%mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask
known_mask:
@@ -3432,10 +3750,10 @@ done:
}
define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
<WIDTH x i32> %full_mask) nounwind alwaysinline {
<WIDTH x MASK> %full_mask) nounwind alwaysinline {
entry:
%mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
%mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask
known_mask:
@@ -3544,10 +3862,10 @@ check_neighbors:
%castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1)
%vr = bitcast <$1 x $4> %castvr to <$1 x $2>
%eq = $5 $7 <$1 x $2> %vec, %vr
ifelse(MASK,i32, `
%eq32 = sext <$1 x i1> %eq to <$1 x i32>
%eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', `
%eqmm = call i64 @__movmsk(<$1 x MASK> %eq)')
ifelse(MASK,i1, `
%eqmm = call i64 @__movmsk(<$1 x MASK> %eq)',
`%eqm = sext <$1 x i1> %eq to <$1 x MASK>
%eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)')
%alleq = icmp eq i64 %eqmm, ALL_ON_MASK
br i1 %alleq, label %all_equal, label %not_all_equal
', `
@@ -3722,9 +4040,9 @@ pl_done:
define(`gen_gather_general', `
; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
per_lane(WIDTH, <WIDTH x MASK> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
@@ -3738,9 +4056,9 @@ define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
per_lane(WIDTH, <WIDTH x MASK> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
@@ -3804,7 +4122,7 @@ define <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %o
define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
; legal to read from (and we do indeed require that, given the benefits!)
@@ -3813,13 +4131,13 @@ define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32
%offsetsPtr = alloca <WIDTH x i32>
store <WIDTH x i32> zeroinitializer, <WIDTH x i32> * %offsetsPtr
call void @__masked_store_blend_i32(<WIDTH x i32> * %offsetsPtr, <WIDTH x i32> %offsets,
<WIDTH x i32> %vecmask)
<WIDTH x MASK> %vecmask)
%newOffsets = load <WIDTH x i32> * %offsetsPtr
%deltaPtr = alloca <WIDTH x i32>
store <WIDTH x i32> zeroinitializer, <WIDTH x i32> * %deltaPtr
call void @__masked_store_blend_i32(<WIDTH x i32> * %deltaPtr, <WIDTH x i32> %offset_delta,
<WIDTH x i32> %vecmask)
<WIDTH x MASK> %vecmask)
%newDelta = load <WIDTH x i32> * %deltaPtr
%ret0 = call <WIDTH x $1> @__gather_elt32_$1(i8 * %ptr, <WIDTH x i32> %newOffsets,
@@ -3835,7 +4153,7 @@ define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32
define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
; legal to read from (and we do indeed require that, given the benefits!)
@@ -3844,13 +4162,13 @@ define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64
%offsetsPtr = alloca <WIDTH x i64>
store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %offsetsPtr
call void @__masked_store_blend_i64(<WIDTH x i64> * %offsetsPtr, <WIDTH x i64> %offsets,
<WIDTH x i32> %vecmask)
<WIDTH x MASK> %vecmask)
%newOffsets = load <WIDTH x i64> * %offsetsPtr
%deltaPtr = alloca <WIDTH x i64>
store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %deltaPtr
call void @__masked_store_blend_i64(<WIDTH x i64> * %deltaPtr, <WIDTH x i64> %offset_delta,
<WIDTH x i32> %vecmask)
<WIDTH x MASK> %vecmask)
%newDelta = load <WIDTH x i64> * %deltaPtr
%ret0 = call <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %newOffsets,
@@ -3876,27 +4194,27 @@ gen_gather_factored($1)
define <WIDTH x $1>
@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i32> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
%scale_vec = bitcast i32 %offset_scale to <1 x i32>
%smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i32> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %scaled_offsets, i32 1,
<WIDTH x i32> zeroinitializer, <WIDTH x i32> %vecmask)
<WIDTH x i32> zeroinitializer, <WIDTH x MASK> %vecmask)
ret <WIDTH x $1> %v
}
define <WIDTH x $1>
@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i64> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
%scale64 = zext i32 %offset_scale to i64
%scale_vec = bitcast i64 %scale64 to <1 x i64>
%smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i64> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %scaled_offsets,
i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x i32> %vecmask)
i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x MASK> %vecmask)
ret <WIDTH x $1> %v
}
@@ -3955,9 +4273,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_s
define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
<WIDTH x MASK> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane(WIDTH, <WIDTH x i32> %mask, `
per_lane(WIDTH, <WIDTH x MASK> %mask, `
call void @__scatter_elt32_$1(i8 * %base, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta, <WIDTH x $1> %values, i32 LANE)')
ret void
@@ -3965,9 +4283,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offs
define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
<WIDTH x MASK> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane(WIDTH, <WIDTH x i32> %mask, `
per_lane(WIDTH, <WIDTH x MASK> %mask, `
call void @__scatter_elt64_$1(i8 * %base, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta, <WIDTH x $1> %values, i32 LANE)')
ret void
@@ -3975,8 +4293,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offs
; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s
define void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x i32> %mask, `
<WIDTH x MASK> %mask) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = extractelement <WIDTH x $1> %values, i32 LANE
@@ -3987,8 +4305,8 @@ define void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,
; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s
define void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x i32> %mask, `
<WIDTH x MASK> %mask) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = extractelement <WIDTH x $1> %values, i32 LANE
@@ -4044,3 +4362,109 @@ define i1 @__rdrand_i64(i64 * %ptr) {
ret i1 %good
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define(`define_avg_up_uint8', `
define <WIDTH x i8> @__avg_up_uint8(<WIDTH x i8>, <WIDTH x i8>) {
%a16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
%b16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
%sum1 = add <WIDTH x i16> %a16, %b16
%sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
%avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
ret <WIDTH x i8> %r
}')
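;; This is the usual widen/add/shift trick for a rounding-up average; for
;; example, __avg_up_uint8 of 3 and 4 gives (3 + 4 + 1) >> 1 = 4, and 255 and
;; 255 give 255 with no overflow since the sum is formed in i16.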
define(`define_avg_up_int8', `
define <WIDTH x i8> @__avg_up_int8(<WIDTH x i8>, <WIDTH x i8>) {
%a16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
%b16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
%sum1 = add <WIDTH x i16> %a16, %b16
%sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
%avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 >
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
ret <WIDTH x i8> %r
}')
define(`define_avg_up_uint16', `
define <WIDTH x i16> @__avg_up_uint16(<WIDTH x i16>, <WIDTH x i16>) {
%a32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
%b32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
%sum1 = add <WIDTH x i32> %a32, %b32
%sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
%avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
ret <WIDTH x i16> %r
}')
define(`define_avg_up_int16', `
define <WIDTH x i16> @__avg_up_int16(<WIDTH x i16>, <WIDTH x i16>) {
%a32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
%b32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
%sum1 = add <WIDTH x i32> %a32, %b32
%sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
%avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 >
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
ret <WIDTH x i16> %r
}')
define(`define_avg_down_uint8', `
define <WIDTH x i8> @__avg_down_uint8(<WIDTH x i8>, <WIDTH x i8>) {
%a16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
%b16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
%sum = add <WIDTH x i16> %a16, %b16
%avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
ret <WIDTH x i8> %r
}')
define(`define_avg_down_int8', `
define <WIDTH x i8> @__avg_down_int8(<WIDTH x i8>, <WIDTH x i8>) {
%a16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
%b16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
%sum = add <WIDTH x i16> %a16, %b16
%avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 >
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
ret <WIDTH x i8> %r
}')
define(`define_avg_down_uint16', `
define <WIDTH x i16> @__avg_down_uint16(<WIDTH x i16>, <WIDTH x i16>) {
%a32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
%b32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
%sum = add <WIDTH x i32> %a32, %b32
%avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
ret <WIDTH x i16> %r
}')
define(`define_avg_down_int16', `
define <WIDTH x i16> @__avg_down_int16(<WIDTH x i16>, <WIDTH x i16>) {
%a32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
%b32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
%sum = add <WIDTH x i32> %a32, %b32
%avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 >
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
ret <WIDTH x i16> %r
}')
define(`define_up_avgs', `
define_avg_up_uint8()
define_avg_up_int8()
define_avg_up_uint16()
define_avg_up_int16()
')
define(`define_down_avgs', `
define_avg_down_uint8()
define_avg_down_int8()
define_avg_down_uint16()
define_avg_down_int16()
')
define(`define_avgs', `
define_up_avgs()
define_down_avgs()
')