merged with master
@@ -1,4 +1,4 @@
-;; Copyright (c) 2011, Intel Corporation
+;; Copyright (c) 2011-2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
@@ -41,15 +41,13 @@
 
 @__system_best_isa = internal global i32 -1
 
 declare void @abort() noreturn
 
 ;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
 ;; following code... Specifically, __get_system_isa should return a value
 ;; corresponding to one of the Target::ISA enumerant values that gives the
 ;; most capable ISA that the current system can run.
 ;;
-;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum
-;; backwards compatibility for anyone building ispc with LLVM 3.0
+;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum
+;; backwards compatibility for anyone building ispc with LLVM 3.1
 ;;
 ;; #include <stdint.h>
 ;; #include <stdlib.h>
@@ -60,7 +58,7 @@ declare void @abort() noreturn
 ;;                           : "0" (infoType));
 ;; }
 ;;
-;; /* Save %ebx in case it's the PIC register */
+;; // Save %ebx in case it's the PIC register.
 ;; static void __cpuid_count(int info[4], int level, int count) {
 ;;     __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
 ;;                           "cpuid\n\t"
@@ -69,13 +67,23 @@ declare void @abort() noreturn
 ;;                           : "0" (level), "2" (count));
 ;; }
 ;;
+;; static int __os_has_avx_support() {
+;;     // Check xgetbv; this uses a .byte sequence instead of the instruction
+;;     // directly because older assemblers do not include support for xgetbv and
+;;     // there is no easy way to conditionally compile based on the assembler used.
+;;     int rEAX, rEDX;
+;;     __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+;;     return (rEAX & 6) == 6;
+;; }
+;;
 ;; int32_t __get_system_isa() {
 ;;     int info[4];
 ;;     __cpuid(info, 1);
 ;;
-;;     /* NOTE: the values returned below must be the same as the
-;;        corresponding enumerant values in Target::ISA. */
-;;     if ((info[2] & (1 << 28)) != 0) {
+;;     // NOTE: the values returned below must be the same as the
+;;     // corresponding enumerant values in Target::ISA.
+;;     if ((info[2] & (1 << 28)) != 0 &&
+;;         __os_has_avx_support()) {
 ;;         if ((info[2] & (1 << 29)) != 0 &&  // F16C
 ;;             (info[2] & (1 << 30)) != 0) {  // RDRAND
 ;;             // So far, so good. AVX2?
@@ -98,47 +106,56 @@ declare void @abort() noreturn
 ;;     abort();
 ;; }
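
;; As the phi node in @__get_system_isa below implies, the returned values run
;; from least to most capable ISA; they presumably correspond to the
;; Target::ISA enumerants in this order:
;;   0 = SSE2 (EDX bit 26), 1 = SSE4.1 (ECX bit 19), 2 = AVX,
;;   3 = AVX1.1 (F16C + RDRAND), 4 = AVX2 (leaf-7 EBX bit 5)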
 
-define i32 @__get_system_isa() nounwind uwtable ssp {
+define i32 @__get_system_isa() nounwind uwtable {
 entry:
   %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
   %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
   %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
   %and = and i32 %asmresult5.i, 268435456
   %cmp = icmp eq i32 %and, 0
-  br i1 %cmp, label %if.else13, label %if.then
+  br i1 %cmp, label %if.else14, label %land.lhs.true
 
-if.then:                                          ; preds = %entry
-  %1 = and i32 %asmresult5.i, 1610612736
-  %2 = icmp eq i32 %1, 1610612736
-  br i1 %2, label %if.then7, label %return
+land.lhs.true:                                    ; preds = %entry
+  %1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind
+  %asmresult.i25 = extractvalue { i32, i32 } %1, 0
+  %and.i = and i32 %asmresult.i25, 6
+  %cmp.i = icmp eq i32 %and.i, 6
+  br i1 %cmp.i, label %if.then, label %if.else14
 
-if.then7:                                         ; preds = %if.then
-  %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
-  %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
-  %and10 = lshr i32 %asmresult4.i28, 5
-  %4 = and i32 %and10, 1
-  %5 = add i32 %4, 3
+if.then:                                          ; preds = %land.lhs.true
+  %2 = and i32 %asmresult5.i, 1610612736
+  %3 = icmp eq i32 %2, 1610612736
+  br i1 %3, label %if.then8, label %return
+
+if.then8:                                         ; preds = %if.then
+  %4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
+  %asmresult4.i30 = extractvalue { i32, i32, i32, i32 } %4, 1
+  %and11 = lshr i32 %asmresult4.i30, 5
+  %5 = and i32 %and11, 1
+  %6 = add i32 %5, 3
   br label %return
 
-if.else13:                                        ; preds = %entry
-  %and15 = and i32 %asmresult5.i, 524288
-  %cmp16 = icmp eq i32 %and15, 0
-  br i1 %cmp16, label %if.else18, label %return
+if.else14:                                        ; preds = %land.lhs.true, %entry
+  %and16 = and i32 %asmresult5.i, 524288
+  %cmp17 = icmp eq i32 %and16, 0
+  br i1 %cmp17, label %if.else19, label %return
 
-if.else18:                                        ; preds = %if.else13
-  %and20 = and i32 %asmresult6.i, 67108864
-  %cmp21 = icmp eq i32 %and20, 0
-  br i1 %cmp21, label %if.else23, label %return
+if.else19:                                        ; preds = %if.else14
+  %and21 = and i32 %asmresult6.i, 67108864
+  %cmp22 = icmp eq i32 %and21, 0
+  br i1 %cmp22, label %if.else24, label %return
 
-if.else23:                                        ; preds = %if.else18
+if.else24:                                        ; preds = %if.else19
   tail call void @abort() noreturn nounwind
   unreachable
 
-return:                                           ; preds = %if.else18, %if.else13, %if.then7, %if.then
-  %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
+return:                                           ; preds = %if.else19, %if.else14, %if.then8, %if.then
+  %retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ]
   ret i32 %retval.0
 }
 
 declare void @abort() noreturn nounwind
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; This function is called by each of the dispatch functions we generate;
 ;; it sets @__system_best_isa if it is unset.

builtins/svml.m4 (new file, 217 lines)
@@ -0,0 +1,217 @@
;; copyright stub :)
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;    * Redistributions of source code must retain the above copyright
;;      notice, this list of conditions and the following disclaimer.
;;
;;    * Redistributions in binary form must reproduce the above copyright
;;      notice, this list of conditions and the following disclaimer in the
;;      documentation and/or other materials provided with the distribution.
;;
;;    * Neither the name of Intel Corporation nor the names of its
;;      contributors may be used to endorse or promote products derived from
;;      this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;; svml macro

;; svml_stubs : stubs for svml calls
;; $1 - type ("float" or "double")
;; $2 - svml internal function suffix ("f" for float, "d" for double)
;; $3 - vector width
define(`svml_stubs',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline
declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
')
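
;; e.g., svml_stubs(float, f, 8) should expand to stub declarations such as
;;   declare <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline
;;   declare <8 x float> @__svml_atan2f(<8 x float>, <8 x float>) nounwind readnone alwaysinline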

;; svml_declare : declaration of __svml_* intrinsics
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
;;      double: "2"(sse)  "4"(avx)  "8"(avx512)
;; $3 - vector width
define(`svml_declare',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
');
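
;; e.g., svml_declare(float, f4, 4) should produce intrinsic declarations such as
;;   declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
;; note that the SVML sincos intrinsic returns the sine vector and stores the
;; cosine through its pointer argument, which is why its signature differs
;; from the __svml_sincos* stubs above.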

;; definition of __svml_* internal functions
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
;;      double: "2"(sse)  "4"(avx)  "8"(avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
define(`svml_define',`
define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline {
  %s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0)
  store <$3 x $1> %s, <$3 x $1> * %1
  ret void
}

define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1)
  ret <$3 x $1> %ret
}
')
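
;; e.g., svml_define(float, f4, 4, f) should emit wrappers along the lines of
;;   define <4 x float> @__svml_sinf(<4 x float>) nounwind readnone alwaysinline {
;;     %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
;;     ret <4 x float> %ret
;;   }
;; i.e., each ispc-visible __svml_* entry point just forwards to the
;; native-width SVML intrinsic.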

;; svml_define_x : definition of __svml_* internal functions operating on an
;; extended vector width
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
;;      double: "2"(sse)  "4"(avx)  "8"(avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
;; $5 - extended width, must be at least twice the native vector width
;; contingent on the existence of the unary$3to$5 and binary$3to$5 macros

;; *todo*: in sincos call use __svml_sincos[f][2,4,8,16] call, e.g.
;;define void @__svml_sincosf(<8 x float>, <8 x float> *,
;;                            <8 x float> *) nounwind readnone alwaysinline {
;;  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
;;  %a = shufflevector <8 x float> %0, <8 x float> undef,
;;         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;;  %b = shufflevector <8 x float> %0, <8 x float> undef,
;;         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;;
;;  %cospa = alloca <4 x float>
;;  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
;;
;;  %cospb = alloca <4 x float>
;;  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
;;
;;  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
;;           <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;;                      i32 4, i32 5, i32 6, i32 7>
;;  store <8 x float> %sin, <8 x float> * %1
;;
;;  %cosa = load <4 x float> * %cospa
;;  %cosb = load <4 x float> * %cospb
;;  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
;;           <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;;                      i32 4, i32 5, i32 6, i32 7>
;;  store <8 x float> %cos, <8 x float> * %2
;;
;;  ret void
;;}
define(`svml_define_x',`
define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_sin$2, %0)
  ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_asin$2, %0)
  ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_cos$2, %0)
  ret <$5 x $1> %ret
}
define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
{
  %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
  %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
  store <$5 x $1> %s, <$5 x $1> * %1
  store <$5 x $1> %c, <$5 x $1> * %2
  ret void
}
define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_tan$2, %0)
  ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_atan$2, %0)
  ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
  binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1)
  ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_exp$2, %0)
  ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_log$2, %0)
  ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
  binary$3to$5(ret, $1, @__svml_pow$2, %0, %1)
  ret <$5 x $1> %ret
}
')
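
;; e.g., svml_define_x(float, f8, 8, f, 16) should produce 16-wide entry
;; points from the 8-wide intrinsics: unary8to16(ret, float, @__svml_sinf8, %0)
;; is expected to split %0 into two <8 x float> halves, call @__svml_sinf8 on
;; each, and reassemble a <16 x float> %ret (assuming util.m4 defines the
;; unary/binary widening macros that way).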

@@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
 sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
 ret double %ret
 }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int8/int16 builtins
+
+define_avgs()
+
@@ -137,19 +137,14 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml
 
-; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
-; or, use the macro to call the 4-wide ones 4x with our 16-wide
-; vectors...
+include(`svml.m4')
+;; single precision
+svml_declare(float,f8,8)
+svml_define_x(float,f8,8,f,16)
 
-declare <16 x float> @__svml_sin(<16 x float>)
-declare <16 x float> @__svml_cos(<16 x float>)
-declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
-declare <16 x float> @__svml_tan(<16 x float>)
-declare <16 x float> @__svml_atan(<16 x float>)
-declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
-declare <16 x float> @__svml_exp(<16 x float>)
-declare <16 x float> @__svml_log(<16 x float>)
-declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
+;; double precision
+svml_declare(double,4,4)
+svml_define_x(double,4,4,d,16)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
@@ -271,6 +266,33 @@ reduce_equal(16)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops
 
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
+  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
+                                              <16 x i8> zeroinitializer)
+  %r0 = extractelement <2 x i64> %rv, i32 0
+  %r1 = extractelement <2 x i64> %rv, i32 1
+  %r = add i64 %r0, %r1
+  %r16 = trunc i64 %r to i16
+  ret i16 %r16
+}
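
;; psadbw against an all-zero vector sums the absolute values of each group of
;; eight bytes into the low bits of the corresponding i64 lane, so adding the
;; two extracted lanes yields the sum of all 16 (unsigned) bytes.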
+
+define internal <16 x i16> @__add_varying_i16(<16 x i16>,
+                                              <16 x i16>) nounwind readnone alwaysinline {
+  %r = add <16 x i16> %0, %1
+  ret <16 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
+  reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
+}
 
 define <16 x i32> @__add_varying_int32(<16 x i32>,
                                        <16 x i32>) nounwind readnone alwaysinline {
   %s = add <16 x i32> %0, %1
 
@@ -137,19 +137,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml
 
-; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
-; or, use the macro to call the 4-wide ones twice with our 8-wide
-; vectors...
+include(`svml.m4')
+;; single precision
+svml_declare(float,f8,8)
+svml_define(float,f8,8,f)
 
-declare <8 x float> @__svml_sin(<8 x float>)
-declare <8 x float> @__svml_cos(<8 x float>)
-declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
-declare <8 x float> @__svml_tan(<8 x float>)
-declare <8 x float> @__svml_atan(<8 x float>)
-declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
-declare <8 x float> @__svml_exp(<8 x float>)
-declare <8 x float> @__svml_log(<8 x float>)
-declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
+;; double precision
+svml_declare(double,4,4)
+svml_define_x(double,4,4,d,8)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
@@ -217,7 +212,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
 ret float %sum
 }
 
-
 define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
 reduce8(float, @__min_varying_float, @__min_uniform_float)
 }
@@ -229,6 +223,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
 
 reduce_equal(8)
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int8 ops
+
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
+  %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
+                                              <16 x i8> zeroinitializer)
+  %r0 = extractelement <2 x i64> %rv, i32 0
+  %r1 = extractelement <2 x i64> %rv, i32 1
+  %r = add i64 %r0, %r1
+  %r16 = trunc i64 %r to i16
+  ret i16 %r16
+}
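
;; the shufflevector widens <8 x i8> to <16 x i8>: indices 0-7 take the input
;; lanes, and index 8 (the first lane of the all-zero second operand) pads the
;; upper half with zeros, so psadbw only sums the eight real byte values.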
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int16 ops
+
+define internal <8 x i16> @__add_varying_i16(<8 x i16>,
+                                             <8 x i16>) nounwind readnone alwaysinline {
+  %r = add <8 x i16> %0, %1
+  ret <8 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
+  reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops
 
@@ -257,20 +287,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
 reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint32 ops
 
 define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
 reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
 }
 
 define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
 reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal double ops
 
@@ -329,9 +353,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint64 ops
 
 define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
 reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }

builtins/target-avx1-i64x4.ll (new file, 81 lines)
@@ -0,0 +1,81 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;    * Redistributions of source code must retain the above copyright
;;      notice, this list of conditions and the following disclaimer.
;;
;;    * Redistributions in binary form must reproduce the above copyright
;;      notice, this list of conditions and the following disclaimer in the
;;      documentation and/or other materials provided with the distribution.
;;
;;    * Neither the name of Intel Corporation nor the names of its
;;      contributors may be used to endorse or promote products derived from
;;      this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

include(`target-avx1-i64x4base.ll')

rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

builtins/target-avx1-i64x4base.ll (new file, 513 lines)
@@ -0,0 +1,513 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;    * Redistributions of source code must retain the above copyright
;;      notice, this list of conditions and the following disclaimer.
;;
;;    * Redistributions in binary form must reproduce the above copyright
;;      notice, this list of conditions and the following disclaimer in the
;;      documentation and/or other materials provided with the distribution.
;;
;;    * Neither the name of Intel Corporation nor the names of its
;;      contributors may be used to endorse or promote products derived from
;;      this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 4-wide definitions

define(`WIDTH',`4')
define(`MASK',`i64')
include(`util.m4')

stdlib_core()
packed_load_and_store()
scans()
int64minmax()

include(`target-avx-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; float iv = __rcp_v(v);
  ; return iv * (2. - v * iv);

  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
  ; do one N-R iteration
  %v_iv = fmul <4 x float> %0, %call
  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
  %iv_mul = fmul <4 x float> %call, %two_minus
  ret <4 x float> %iv_mul
}
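
; one Newton-Raphson step squares the relative error of the rcpps estimate:
; if the hardware gives iv = (1/v)(1 + e) with |e| on the order of 2^-12,
; then iv * (2 - v * iv) = (1/v)(1 - e^2), which is presumably why a single
; iteration is enough here.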

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
  ret <4 x float> %call
}

define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %call
}

define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

;; avx intrinsic
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
  ret <4 x double> %call
}

define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
  ret <4 x double> %call
}

define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
  ret <4 x double> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt

;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
  ; float is = __rsqrt_v(v);
  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
  ; Newton-Raphson iteration to improve precision
  ; return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <4 x float> %v, %is
  %v_is_is = fmul <4 x float> %v_is, %is
  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <4 x float> %is, %three_sub
  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <4 x float> %half_scale
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt

;; sse intrinsic
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
  ret <4 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

;; avx intrinsic
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone

define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
  %call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
  ret <4 x double> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml

include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)

;; double precision
svml_declare(double,4,4)
svml_define(double,4,4,d)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

;; sse intrinsics
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %call
}

define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops

;; avx intrinsic
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone

define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline {
  %floatmask = bitcast <4 x i64> %0 to <4 x double>
  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
  %v64 = zext i32 %v to i64
  ret i64 %v64
}
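
;; each mask lane is all-ones or all-zeros (MASK is i64 here), so bitcasting
;; to <4 x double> and taking movmskpd's four sign bits packs the lane mask
;; into the low four bits of the result.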

define i1 @__any(<4 x i64>) nounwind readnone alwaysinline {
  %floatmask = bitcast <4 x i64> %0 to <4 x double>
  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
  %cmp = icmp ne i32 %v, 0
  ret i1 %cmp
}

define i1 @__all(<4 x i64>) nounwind readnone alwaysinline {
  %floatmask = bitcast <4 x i64> %0 to <4 x double>
  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
  %cmp = icmp eq i32 %v, 15
  ret i1 %cmp
}

define i1 @__none(<4 x i64>) nounwind readnone alwaysinline {
  %floatmask = bitcast <4 x i64> %0 to <4 x double>
  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
  %cmp = icmp eq i32 %v, 0
  ret i1 %cmp
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

;; sse3 intrinsic
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
  %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
  %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
  %scalar = extractelement <4 x float> %v2, i32 0
  ret float %scalar
}

define float @__reduce_min_float(<4 x float>) nounwind readnone {
  reduce4(float, @__min_varying_float, @__min_uniform_float)
}

define float @__reduce_max_float(<4 x float>) nounwind readnone {
  reduce4(float, @__max_varying_float, @__max_uniform_float)
}

reduce_equal(4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline
{
  %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
                  i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
                                              <16 x i8> zeroinitializer)
  %r0 = extractelement <2 x i64> %rv, i32 0
  %r1 = extractelement <2 x i64> %rv, i32 1
  %r = add i64 %r0, %r1
  %r16 = trunc i64 %r to i16
  ret i16 %r16
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops

define internal <4 x i16> @__add_varying_i16(<4 x i16>,
                                             <4 x i16>) nounwind readnone alwaysinline {
  %r = add <4 x i16> %0, %1
  ret <4 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
  %r = add i16 %0, %1
  ret i16 %r
}

define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops

define <4 x i32> @__add_varying_int32(<4 x i32>,
                                      <4 x i32>) nounwind readnone alwaysinline {
  %s = add <4 x i32> %0, %1
  ret <4 x i32> %s
}

define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %s = add i32 %0, %1
  ret i32 %s
}

define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
}

define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}

define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}

define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops

declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1 = shufflevector <4 x double> <double 0.,double 0.,double 0.,double 0.>, <4 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ;; %v1 = <4 x double> <double 0., double 0., double 0., double 0.>
  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
  %final0 = extractelement <4 x double> %sum1, i32 0
  %final1 = extractelement <4 x double> %sum1, i32 2
  %sum = fadd double %final0, %final1

  ret double %sum
}

define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__max_varying_double, @__max_uniform_double)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops

define <4 x i64> @__add_varying_int64(<4 x i64>,
                                      <4 x i64>) nounwind readnone alwaysinline {
  %s = add <4 x i64> %0, %1
  ret <4 x i64> %s
}

define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %s = add i64 %0, %1
  ret i64 %s
}

define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
}

define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

; no masked load instruction for i8 and i16 types??
masked_load(i8, 1)
masked_load(i16, 2)

;; avx intrinsics
declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)

define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
  %mask = trunc <4 x i64> %mask64 to <4 x i32>
  %floatmask = bitcast <4 x i32> %mask to <4 x float>
  %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
  %retval = bitcast <4 x float> %floatval to <4 x i32>
  ret <4 x i32> %retval
}

define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
  %doublemask = bitcast <4 x i64> %mask to <4 x double>
  %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask)
  %retval = bitcast <4 x double> %doubleval to <4 x i64>
  ret <4 x i64> %retval
}

masked_load_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

gen_masked_store(i8)
gen_masked_store(i16)

; note that mask is the 2nd parameter, not the 3rd one!!
;; avx intrinsics
declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)

define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
                                <4 x i64>) nounwind alwaysinline {
  %mask32 = trunc <4 x i64> %2 to <4 x i32>

  %ptr = bitcast <4 x i32> * %0 to i8 *
  %val = bitcast <4 x i32> %1 to <4 x float>
  %mask = bitcast <4 x i32> %mask32 to <4 x float>
  call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
  ret void
}

define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
                                <4 x i64>) nounwind alwaysinline {
  %ptr = bitcast <4 x i64> * %0 to i8 *
  %val = bitcast <4 x i64> %1 to <4 x double>
  %mask = bitcast <4 x i64> %2 to <4 x double>
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val)
  ret void
}

masked_store_blend_8_16_by_4_mask64()

;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone

define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i64>) nounwind alwaysinline {
  %mask = trunc <4 x i64> %2 to <4 x i32>
  %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
  %oldValue = load <4 x i32>* %0, align 4
  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
                                                     <4 x float> %newAsFloat,
                                                     <4 x float> %mask_as_float)
  %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
  ret void
}
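
;; blendvps selects each lane by the sign bit of the mask operand, so this
;; load/blend/store sequence emulates a masked store; unlike the maskstore
;; intrinsics above it rewrites all 16 bytes, so it is presumably only used
;; where the whole vector is known to be accessible.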

;; avx intrinsic
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
                                                 <4 x double>) nounwind readnone

define void @__masked_store_blend_i64(<4 x i64>* nocapture, <4 x i64>,
                                      <4 x i64>) nounwind alwaysinline {
  %mask_as_double = bitcast <4 x i64> %2 to <4 x double>
  %oldValue = load <4 x i64>* %0, align 4
  %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double>
  %newAsDouble = bitcast <4 x i64> %1 to <4 x double>
  %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble,
                                                         <4 x double> %newAsDouble,
                                                         <4 x double> %mask_as_double)
  %blendAsInt = bitcast <4 x double> %blend to <4 x i64>
  store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4
  ret void
}

masked_store_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
  %call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
  ret <4 x double> %call
}

define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
  %call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
  ret <4 x double> %call
}
|
||||
|
||||
@@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone
|
||||
;declare float @llvm.sqrt.f32(float %Val)
|
||||
declare double @llvm.sqrt.f64(double %Val)
|
||||
declare float @llvm.sin.f32(float %Val)
|
||||
declare float @llvm.asin.f32(float %Val)
|
||||
declare float @llvm.cos.f32(float %Val)
|
||||
declare float @llvm.sqrt.f32(float %Val)
|
||||
declare float @llvm.exp.f32(float %Val)
|
||||
@@ -471,6 +472,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
define i8 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline {
|
||||
%r = extractelement <1 x i8> %v, i32 0
|
||||
ret i8 %r
|
||||
}
|
||||
|
||||
define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline {
|
||||
%r = extractelement <1 x i16> %v, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
|
||||
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
|
||||
%r = extractelement <1 x float> %v, i32 0
|
||||
@@ -642,7 +652,18 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
|
||||
declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline
|
||||
declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline
|
||||
declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline
|
||||
declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline
|
||||
declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline
|
||||
declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline
|
||||
declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline
|
||||
declare <1 x float> @__svml_expd(<1 x float>) nounwind readnone alwaysinline
|
||||
declare <1 x float> @__svml_logd(<1 x float>) nounwind readnone alwaysinline
|
||||
declare <1 x float> @__svml_powd(<1 x float>, <1 x float>) nounwind readnone alwaysinline
|
||||
|
||||
define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
|
||||
;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
|
||||
;ret <1 x float> %ret
|
||||
;%r = extractelement <1 x float> %0, i32 0
|
||||
@@ -653,7 +674,18 @@ define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
|
||||
|
||||
}
|
||||
|
||||
define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
|
||||
define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
|
||||
;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0)
|
||||
;ret <1 x float> %ret
|
||||
;%r = extractelement <1 x float> %0, i32 0
|
||||
;%s = call float @llvm.asin.f32(float %r)
|
||||
;%rv = insertelement <1 x float> undef, float %r, i32 0
|
||||
;ret <1 x float> %rv
|
||||
unary1to1(float,@llvm.asin.f32)
|
||||
|
||||
}
|
||||
|
||||
define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
|
||||
;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
|
||||
;ret <1 x float> %ret
|
||||
;%r = extractelement <1 x float> %0, i32 0
|
||||
@@ -664,18 +696,18 @@ define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
|
||||
|
||||
}
|
||||
|
||||
define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
|
||||
define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
|
||||
; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
|
||||
; store <1 x float> %s, <1 x float> * %1
|
||||
; ret void
|
||||
%sin = call <1 x float> @__svml_sin (<1 x float> %0)
|
||||
%cos = call <1 x float> @__svml_cos (<1 x float> %0)
|
||||
%sin = call <1 x float> @__svml_sinf(<1 x float> %0)
|
||||
%cos = call <1 x float> @__svml_cosf(<1 x float> %0)
|
||||
store <1 x float> %sin, <1 x float> * %1
|
||||
store <1 x float> %cos, <1 x float> * %2
|
||||
ret void
|
||||
}
|
||||
|
||||
define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
|
||||
define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline {
|
||||
;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
|
||||
;ret <1 x float> %ret
|
||||
;%r = extractelement <1 x float> %0, i32 0
|
||||
@@ -687,7 +719,7 @@ define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
|
||||
ret <1 x float > %0
|
||||
}
|
||||
|
||||
define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
|
||||
define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline {
|
||||
; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
|
||||
; ret <1 x float> %ret
|
||||
;%r = extractelement <1 x float> %0, i32 0
|
||||
@@ -700,7 +732,7 @@ define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
|
||||
|
||||
}
|
||||
|
||||
define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
|
||||
define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
|
||||
;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
|
||||
;ret <1 x float> %ret
|
||||
;%y = extractelement <1 x float> %0, i32 0
|
||||
@@ -713,19 +745,19 @@ define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al
|
||||
ret <1 x float > %0
|
||||
}
|
||||
|
||||
define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
|
||||
define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline {
|
||||
;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
|
||||
;ret <1 x float> %ret
|
||||
unary1to1(float, @llvm.exp.f32)
|
||||
}
|
||||
|
||||
define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
|
||||
define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline {
|
||||
;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
|
||||
;ret <1 x float> %ret
|
||||
unary1to1(float, @llvm.log.f32)
|
||||
}
|
||||
|
||||
define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
|
||||
define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
|
||||
;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
|
||||
;ret <1 x float> %ret
|
||||
%r = extractelement <1 x float> %0, i32 0
|
||||
@@ -953,3 +985,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
|
||||
|
||||
@@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
|
||||
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
|
||||
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
|
||||
|
||||
;; svml
|
||||
|
||||
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
|
||||
; or, use the macro to call the 4-wide ones twice with our 8-wide
|
||||
; vectors...
|
||||
|
||||
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
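
; For illustration only, the "call the 4-wide ones twice" option could look
; like the commented-out sketch below. The @__svml_sinf4 name is assumed for
; the 4-wide SVML entry point; it is not part of this commit.
;
; declare <4 x float> @__svml_sinf4(<4 x float>)
;
; define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
;   %lo = shufflevector <8 x float> %0, <8 x float> undef,
;                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;   %hi = shufflevector <8 x float> %0, <8 x float> undef,
;                       <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;   %rlo = call <4 x float> @__svml_sinf4(<4 x float> %lo)
;   %rhi = call <4 x float> @__svml_sinf4(<4 x float> %hi)
;   %r = shufflevector <4 x float> %rlo, <4 x float> %rhi,
;        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;   ret <8 x float> %r
; }
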
;; svml

include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
@@ -226,14 +220,16 @@ declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone

declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone

declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone

declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone

declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone

@@ -244,7 +240,6 @@ declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone

declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone

@@ -379,3 +374,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()


517
builtins/target-neon-16.ll
Normal file
@@ -0,0 +1,517 @@
;;
;; target-neon-16.ll
;;
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`8')
define(`MASK',`i16')

include(`util.m4')
include(`target-neon-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
ret <8 x float> %r
}

define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
ret <8 x i16> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

;; round/floor/ceil

;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?
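;; (8.388608e+06 is 2^23: adding and then subtracting it forces a float
;; with |x| < 2^23 to be rounded to an integer, and the 0x80000000
;; (-2147483648) mask saves and restores the sign bit so the trick also
;; works for negative inputs. In the floor/ceil adjustments below,
;; -1082130432 is the bit pattern of -1.0f and 1065353216 is the bit
;; pattern of 1.0f.)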

define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
%bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i,
<i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i,
<float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <8 x float> %binop.i,
<float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
%bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
ret <8 x float> %int_to_float_bitcast.i.i.i
}

define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i,
<i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}

define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i,
<i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}

;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max

declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
ret <WIDTH x float> %r
}

define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
ret <WIDTH x float> %r
}

declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}

;; sqrt/rsqrt/rcp

declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
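
;; vrecpe produces a low-precision estimate x0 of 1/d, and vrecps(d, x)
;; computes (2 - d*x), so each fmul above performs one Newton-Raphson step
;; x' = x * (2 - d*x); the two steps bring the estimate close to full
;; float precision.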

declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
%x0_2 = fmul <WIDTH x float> %x0, %x0
binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
%x1_2 = fmul <WIDTH x float> %x1, %x1
binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
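
;; Similarly, vrsqrte estimates 1/sqrt(d), and vrsqrts(d, x*x) computes
;; (3 - d*x*x)/2, so each fmul by the vrsqrts result is one Newton-Raphson
;; step x' = x * (3 - d*x^2) / 2 for the reciprocal square root.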

define float @__rsqrt_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs)
%r = extractelement <8 x float> %vr, i32 0
ret float %r
}

define float @__rcp_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs)
%r = extractelement <8 x float> %vr, i32 0
ret float %r
}

declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
unary4to8(result, float, @llvm.sqrt.v4f32, %0)
;; Using rsqrt and a multiply would return NaN for v = 0, which is
;; undesirable, so the sqrt intrinsic is used instead:
;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
;; %result = fmul <4 x float> %rsqrt, %0
ret <8 x float> %result
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
unary4to8(r, double, @llvm.sqrt.v4f64, %0)
ret <WIDTH x double> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions

define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone {
%and_mask = and <WIDTH x i16> %0,
<i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
%v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask)
%v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4)
%va = extractelement <2 x i64> %v2, i32 0
%vb = extractelement <2 x i64> %v2, i32 1
%v = or i64 %va, %vb
ret i64 %v
}
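
;; Each active mask lane is all-ones, so ANDing with <1, 2, 4, ..., 128>
;; leaves exactly bit i set in lane i; the widening pairwise adds (vpaddlu)
;; then sum the lanes, which simply ORs the disjoint bits together into an
;; 8-bit lane mask.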

define i1 @__any(<WIDTH x MASK>) nounwind readnone alwaysinline {
v8tov4(MASK, %0, %v0123, %v4567)
%vor = or <4 x MASK> %v0123, %v4567
%v0 = extractelement <4 x MASK> %vor, i32 0
%v1 = extractelement <4 x MASK> %vor, i32 1
%v2 = extractelement <4 x MASK> %vor, i32 2
%v3 = extractelement <4 x MASK> %vor, i32 3
%v01 = or MASK %v0, %v1
%v23 = or MASK %v2, %v3
%v = or MASK %v01, %v23
%cmp = icmp ne MASK %v, 0
ret i1 %cmp
}

define i1 @__all(<WIDTH x MASK>) nounwind readnone alwaysinline {
v8tov4(MASK, %0, %v0123, %v4567)
%vand = and <4 x MASK> %v0123, %v4567
%v0 = extractelement <4 x MASK> %vand, i32 0
%v1 = extractelement <4 x MASK> %vand, i32 1
%v2 = extractelement <4 x MASK> %vand, i32 2
%v3 = extractelement <4 x MASK> %vand, i32 3
%v01 = and MASK %v0, %v1
%v23 = and MASK %v2, %v3
%v = and MASK %v01, %v23
%cmp = icmp ne MASK %v, 0
ret i1 %cmp
}

define i1 @__none(<WIDTH x MASK>) nounwind readnone alwaysinline {
%any = call i1 @__any(<WIDTH x MASK> %0)
%none = icmp eq i1 %any, 0
ret i1 %none
}

;; $1: scalar type
;; $2: vector/vector reduce function (2 x <WIDTH x vec> -> <WIDTH x vec>)
;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $4: scalar reduce function

define(`neon_reduce', `
v8tov4($1, %0, %v0123, %v4567)
%v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8)
%vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
v4tov2($1, %vfirst_4, %v0, %v1)
%vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1)
%vh0 = extractelement <2 x $1> %vh, i32 0
%vh1 = extractelement <2 x $1> %vh, i32 1
%r = call $1 $4($1 %vh0, $1 %vh1)
ret $1 %r
')
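
;; For example, neon_reduce(float, @__add_varying_float,
;; @llvm.arm.neon.vpadd.v2f32, @add_f32), as used below, halves the 8-wide
;; vector, combines the halves with the WIDTH-wide reduce function,
;; pairwise-reduces the surviving four lanes down to two, and finishes with
;; the scalar function.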

declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @add_f32(float, float) {
%r = fadd float %0, %1
ret float %r
}

define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) {
%r = fadd <WIDTH x float> %0, %1
ret <WIDTH x float> %r
}

define float @__reduce_add_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}

declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @min_f32(float, float) {
%cmp = fcmp olt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}

define float @__reduce_min_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}

declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @max_f32(float, float) {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}

define float @__reduce_max_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}

declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone

define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0)
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
%r16 = trunc i32 %r to i16
ret i16 %r16
}
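
;; The widening pairwise adds keep intermediate sums from overflowing:
;; the 8 i8 lanes are summed into 4 x i16, then 2 x i32, and only the final
;; two partial sums are added as scalars and truncated back to i16.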

declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16>)

define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
%a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16> %0)
%a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1)
%aa = extractelement <2 x i64> %a2, i32 0
%ab = extractelement <2 x i64> %a2, i32 1
%r = add i64 %aa, %ab
ret i64 %r
}

declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
v8tov4(i32, %0, %va, %vb)
%pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
%pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
%psum = add <2 x i64> %pa, %pb
%a0 = extractelement <2 x i64> %psum, i32 0
%a1 = extractelement <2 x i64> %psum, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}

declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_si32(i32, i32) {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_si32(i32, i32) {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}

declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_ui32(i32, i32) {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_ui32(i32, i32) {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}

define double @__reduce_add_double(<WIDTH x double>) nounwind readnone {
v8tov2(double, %0, %v0, %v1, %v2, %v3)
%v01 = fadd <2 x double> %v0, %v1
%v23 = fadd <2 x double> %v2, %v3
%sum = fadd <2 x double> %v01, %v23
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}

define double @__reduce_min_double(<WIDTH x double>) nounwind readnone {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<WIDTH x double>) nounwind readnone {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}

define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone {
v8tov2(i64, %0, %v0, %v1, %v2, %v3)
%v01 = add <2 x i64> %v0, %v1
%v23 = add <2 x i64> %v2, %v3
%sum = add <2 x i64> %v01, %v23
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}

define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16

declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}

declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}

declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}

declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}

declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
@@ -1,5 +1,5 @@
;;
;; target-neon.ll
;; target-neon-32.ll
;;
;; Copyright(c) 2012-2013 Matt Pharr
;; Copyright(c) 2013 Google, Inc.
@@ -34,52 +34,20 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32"

define(`WIDTH',`4')

define(`MASK',`i32')

include(`util.m4')

stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
define_shuffles()
aossoa()
ctlztz()
include(`target-neon-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vec = shufflevector <1 x i16> %v1, <1 x i16> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec)
%r = extractelement <4 x float> %h, i32 0
ret float %r
}
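
;; The uniform conversions reuse the 4-wide NEON conversion instruction:
;; the scalar is broadcast into a <4 x i16> (or <4 x float>), converted,
;; and lane 0 of the result is extracted.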

define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone {
%r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v)
ret <4 x float> %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vec = shufflevector <1 x float> %v1, <1 x float> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec)
%r = extractelement <4 x i16> %h, i32 0
ret i16 %r
}


define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v)
ret <4 x i16> %r
@@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

define void @__fastmath() nounwind {
ret void
}

;; round/floor/ceil

;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?

define float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}

define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}

define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}

define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
%bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
@@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin
}

;; FIXME: rounding doubles and double vectors needs to be implemented
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone

declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
@@ -175,78 +102,6 @@ declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readn
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max

define float @__max_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}

define float @__min_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ult float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}

define i32 @__min_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__max_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__min_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__max_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i64 @__min_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp slt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}

define i64 @__max_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp sgt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}

define i64 @__min_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ult i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}

define i64 @__max_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ugt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}

define double @__min_uniform_double(double, double) nounwind readnone {
%cmp = fcmp olt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}

define double @__max_uniform_double(double, double) nounwind readnone {
%cmp = fcmp ogt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}

declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

@@ -287,44 +142,6 @@ define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwin
ret <4 x i32> %r
}

define <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp slt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp sgt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ult <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ugt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}

define <WIDTH x double> @__min_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp olt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}

define <WIDTH x double> @__max_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp ogt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}

;; sqrt/rsqrt/rcp

declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
@@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone {
ret float %r
}

declare float @llvm.sqrt.f32(float)

define float @__sqrt_uniform_float(float) nounwind readnone {
%r = call float @llvm.sqrt.f32(float %0)
ret float %r
}

declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
@@ -388,13 +198,6 @@ define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
ret <4 x float> %result
}

declare double @llvm.sqrt.f64(double)

define double @__sqrt_uniform_double(double) nounwind readnone {
%r = call double @llvm.sqrt.f64(double %0)
ret double %r
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
@@ -402,21 +205,6 @@ define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readno
ret <4 x double> %r
}

;; bit ops

declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define i32 @__popcnt_int32(i32) nounwind readnone {
%v = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %v
}

define i64 @__popcnt_int64(i64) nounwind readnone {
%v = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions

@@ -509,15 +297,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone {
neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}

define internal i32 @add_i32(i32, i32) {
%r = add i32 %0, %1
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
%a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8)
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
%r16 = trunc i32 %r to i16
ret i16 %r16
}

declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone

define i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
ret i32 %r
}

declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

define i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32)
define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0)
%a0 = extractelement <2 x i64> %a64, i32 0
%a1 = extractelement <2 x i64> %a64, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
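
;; Note the return type change here: summing four i32 lanes can overflow
;; 32 bits, so the new version widens to i64 with vpaddlu before the final
;; scalar add (matching the i32 -> i64 change to the declaration earlier in
;; this commit).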

declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
@@ -617,90 +428,60 @@ define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
;; int8/int16

masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()

define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture %ptr, <WIDTH x i8> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i8> * %ptr
%mask1 = trunc <4 x MASK> %mask to <4 x i1>
%result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old
store <WIDTH x i8> %result, <WIDTH x i8> * %ptr
ret void
define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}

define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture %ptr, <WIDTH x i16> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i16> * %ptr
%mask1 = trunc <4 x MASK> %mask to <4 x i1>
%result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old
store <WIDTH x i16> %result, <WIDTH x i16> * %ptr
ret void
declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}

define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture %ptr, <WIDTH x i32> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i32> * %ptr
%mask1 = trunc <4 x MASK> %mask to <4 x i1>
%result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old
store <WIDTH x i32> %result, <WIDTH x i32> * %ptr
ret void
declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}

define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
<WIDTH x i64> %new, <WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i64> * %ptr
%mask1 = trunc <4 x MASK> %mask to <4 x i1>
%result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old
store <WIDTH x i64> %result, <WIDTH x i64> * %ptr
ret void
declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}

;; Yuck. We need declarations of these, even though we shouldn't ever
;; actually generate calls to them for the NEON target...
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

packed_load_and_store(4)
define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch
declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

define_prefetches()
define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
583
builtins/target-neon-8.ll
Normal file
@@ -0,0 +1,583 @@
;;
;; target-neon-8.ll
;;
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`16')
define(`MASK',`i8')

include(`util.m4')
include(`target-neon-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
ret <16 x float> %r
}

define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
ret <16 x i16> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

;; round/floor/ceil

;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?

define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32>
%bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i,
<i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float>
%binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i,
<float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <16 x float> %binop.i,
<float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32>
%bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float>
ret <16 x float> %int_to_float_bitcast.i.i.i
}

define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind
%bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32>
%bitop.i = and <16 x i32> %val_to_boolvec32.i,
<i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float>
%binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <16 x float> %binop.i
}

define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind
%bincmp.i = fcmp olt <16 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32>
%bitop.i = and <16 x i32> %val_to_boolvec32.i,
<i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float>
%binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <16 x float> %binop.i
}

;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max

declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
ret <WIDTH x float> %r
}

define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
ret <WIDTH x float> %r
}

declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}

;; sqrt/rsqrt/rcp

declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}

declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
%x0_2 = fmul <WIDTH x float> %x0, %x0
binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
%x1_2 = fmul <WIDTH x float> %x1, %x1
binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}

define float @__rsqrt_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs)
%r = extractelement <16 x float> %vr, i32 0
ret float %r
}

define float @__rcp_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs)
%r = extractelement <16 x float> %vr, i32 0
ret float %r
}

declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
unary4to16(result, float, @llvm.sqrt.v4f32, %0)
;; Using rsqrt and a multiply would return NaN for v = 0, which is
;; undesirable, so the sqrt intrinsic is used instead:
;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
;; %result = fmul <4 x float> %rsqrt, %0
ret <16 x float> %result
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
unary4to16(r, double, @llvm.sqrt.v4f64, %0)
ret <WIDTH x double> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions

define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone {
%and_mask = and <WIDTH x i8> %0,
<i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128,
i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128>
%v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask)
%v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8)
%v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4)
%va = extractelement <2 x i64> %v2, i32 0
%vb = extractelement <2 x i64> %v2, i32 1
%vbshift = shl i64 %vb, 8
%v = or i64 %va, %vbshift
ret i64 %v
}

define i1 @__any(<WIDTH x MASK>) nounwind readnone alwaysinline {
v16tov8(MASK, %0, %v8a, %v8b)
%vor8 = or <8 x MASK> %v8a, %v8b
%v16 = sext <8 x i8> %vor8 to <8 x i16>
v8tov4(i16, %v16, %v16a, %v16b)
%vor16 = or <4 x i16> %v16a, %v16b
%v32 = sext <4 x i16> %vor16 to <4 x i32>
v4tov2(i32, %v32, %v32a, %v32b)
%vor32 = or <2 x i32> %v32a, %v32b
%v0 = extractelement <2 x i32> %vor32, i32 0
%v1 = extractelement <2 x i32> %vor32, i32 1
%v = or i32 %v0, %v1
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}

define i1 @__all(<WIDTH x MASK>) nounwind readnone alwaysinline {
v16tov8(MASK, %0, %v8a, %v8b)
%vand8 = and <8 x MASK> %v8a, %v8b
%v16 = sext <8 x i8> %vand8 to <8 x i16>
v8tov4(i16, %v16, %v16a, %v16b)
%vand16 = and <4 x i16> %v16a, %v16b
%v32 = sext <4 x i16> %vand16 to <4 x i32>
v4tov2(i32, %v32, %v32a, %v32b)
%vand32 = and <2 x i32> %v32a, %v32b
%v0 = extractelement <2 x i32> %vand32, i32 0
%v1 = extractelement <2 x i32> %vand32, i32 1
%v = and i32 %v0, %v1
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}

define i1 @__none(<WIDTH x MASK>) nounwind readnone alwaysinline {
%any = call i1 @__any(<WIDTH x MASK> %0)
%none = icmp eq i1 %any, 0
ret i1 %none
}

;; $1: scalar type
;; $2: vector/vector reduce function (2 x <WIDTH x vec> -> <WIDTH x vec>)
;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $4: scalar reduce function

define(`neon_reduce', `
v16tov8($1, %0, %va, %vb)
%va_16 = shufflevector <8 x $1> %va, <8 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16)

%v8a = shufflevector <16 x $1> %v8, <16 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v8b = shufflevector <16 x $1> %v8, <16 x $1> undef,
<16 x i32> <i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>

%v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b)

%vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
v4tov2($1, %vfirst_4, %v0, %v1)
%vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1)
%vh0 = extractelement <2 x $1> %vh, i32 0
%vh1 = extractelement <2 x $1> %vh, i32 1
%r = call $1 $4($1 %vh0, $1 %vh1)
ret $1 %r
')
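
;; neon_reduce performs a log2 reduction tree over the 16 lanes: the two
;; 8-wide halves are combined with the full-width function $2, the surviving
;; 8 lanes are combined down to 4 the same way, and the last 4 lanes are
;; folded with the pairwise op $3 and the scalar op $4.  See
;; @__reduce_add_float below for a typical instantiation.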

declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @add_f32(float, float) {
%r = fadd float %0, %1
ret float %r
}

define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) {
%r = fadd <WIDTH x float> %0, %1
ret <WIDTH x float> %r
}

define float @__reduce_add_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}

declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @min_f32(float, float) {
%cmp = fcmp olt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}

define float @__reduce_min_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}

declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @max_f32(float, float) {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}

define float @__reduce_max_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}

declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

define i64 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0)
%a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16)
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
%a0 = extractelement <2 x i64> %a64, i32 0
%a1 = extractelement <2 x i64> %a64, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}

define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
v16tov8(i16, %0, %va, %vb)
%a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va)
%b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb)
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
%b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32)
%sum = add <2 x i64> %a64, %b64
%a0 = extractelement <2 x i64> %sum, i32 0
%a1 = extractelement <2 x i64> %sum, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}

define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
v16tov4(i32, %0, %va, %vb, %vc, %vd)
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
%b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
%c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc)
%d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd)
%ab = add <2 x i64> %a64, %b64
%cd = add <2 x i64> %c64, %d64
%sum = add <2 x i64> %ab, %cd
%a0 = extractelement <2 x i64> %sum, i32 0
%a1 = extractelement <2 x i64> %sum, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}

declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_si32(i32, i32) {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_si32(i32, i32) {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}

declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_ui32(i32, i32) {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone {
;; use the unsigned pairwise min declared above (vpmins would mishandle
;; values with the high bit set)
neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_ui32(i32, i32) {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone {
;; likewise, use the unsigned pairwise max here
neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}

define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}

define internal <WIDTH x double> @__add_varying_double(<WIDTH x double>, <WIDTH x double>) {
%r = fadd <WIDTH x double> %0, %1
ret <WIDTH x double> %r
}

define double @__reduce_add_double(<WIDTH x double>) nounwind readnone {
reduce16(double, @__add_varying_double, @__add_uniform_double)
}

define double @__reduce_min_double(<WIDTH x double>) nounwind readnone {
reduce16(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<WIDTH x double>) nounwind readnone {
reduce16(double, @__max_varying_double, @__max_uniform_double)
}

define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}

define internal <WIDTH x i64> @__add_varying_int64(<WIDTH x i64>, <WIDTH x i64>) {
%r = add <WIDTH x i64> %0, %1
ret <WIDTH x i64> %r
}

define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}

define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
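
;; The NEON halving-add instructions below compute the average of two
;; vectors without losing the carry bit: vrhadd gives (a + b + 1) >> 1
;; (rounding up) and vhadd gives (a + b) >> 1 (rounding down), each in
;; signed (s) and unsigned (u) flavors, with the addition done at full
;; internal precision.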

declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}

declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}

declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}

declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}

declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}

builtins/target-neon-common.ll (new file, 346 lines)
@@ -0,0 +1,346 @@
;;
;; target-neon-common.ll
;;
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32"

stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
define_shuffles()
aossoa()
ctlztz()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vec = shufflevector <1 x i16> %v1, <1 x i16> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec)
%r = extractelement <4 x float> %h, i32 0
ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vec = shufflevector <1 x float> %v1, <1 x float> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec)
%r = extractelement <4 x i16> %h, i32 0
ret i16 %r
}
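
;; The NEON half<->float conversions operate on four lanes at a time, so the
;; uniform versions above splat the scalar across a 4-wide vector, convert,
;; and extract lane 0; the other three lanes are computed and discarded.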

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

define void @__fastmath() nounwind {
ret void
}

;; round/floor/ceil

;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?

define float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}
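
;; The constant 8.388608e+06 above is 2^23: for |x| < 2^23, adding and then
;; subtracting it leaves x rounded to the nearest integer as a side effect
;; of float's 23-bit mantissa.  The xor dance strips the sign bit first and
;; restores it afterward so the trick also works for negative inputs.
;; Conceptually: r = copysign(|x| + 2^23 - 2^23, x).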

define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}

define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
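
;; floor and ceil lean on round: -1082130432 is the bit pattern of -1.0f and
;; 1065353216 of 1.0f, so the sext-and-add sequence adds -1.0 when
;; round(x) > x (floor) or +1.0 when round(x) < x (ceil), and 0.0 otherwise.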

;; FIXME: rounding doubles and double vectors needs to be implemented
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max

define float @__max_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}

define float @__min_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ult float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}

define i32 @__min_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__max_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__min_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i32 @__max_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}

define i64 @__min_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp slt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}

define i64 @__max_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp sgt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}

define i64 @__min_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ult i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}

define i64 @__max_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ugt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}

define double @__min_uniform_double(double, double) nounwind readnone {
%cmp = fcmp olt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}

define double @__max_uniform_double(double, double) nounwind readnone {
%cmp = fcmp ogt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}

define <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp slt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp sgt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ult <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ugt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}

define <WIDTH x double> @__min_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp olt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}

define <WIDTH x double> @__max_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp ogt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}

;; sqrt/rsqrt/rcp

declare float @llvm.sqrt.f32(float)

define float @__sqrt_uniform_float(float) nounwind readnone {
%r = call float @llvm.sqrt.f32(float %0)
ret float %r
}

declare double @llvm.sqrt.f64(double)

define double @__sqrt_uniform_double(double) nounwind readnone {
%r = call double @llvm.sqrt.f64(double %0)
ret double %r
}

;; bit ops

declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define i32 @__popcnt_int32(i32) nounwind readnone {
%v = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %v
}

define i64 @__popcnt_int64(i64) nounwind readnone {
%v = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
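
;; The masked-store-blend functions below all follow the same
;; load/select/store pattern: read the old contents, select per lane between
;; the new and old values based on the truncated mask, and write the blended
;; vector back.  Note this writes all WIDTH lanes, including inactive ones,
;; which assumes the whole vector's memory is accessible.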

define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture %ptr, <WIDTH x i8> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i8> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i8> %new, <WIDTH x i8> %old
store <WIDTH x i8> %result, <WIDTH x i8> * %ptr
ret void
}

define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture %ptr, <WIDTH x i16> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i16> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i16> %new, <WIDTH x i16> %old
store <WIDTH x i16> %result, <WIDTH x i16> * %ptr
ret void
}

define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture %ptr, <WIDTH x i32> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i32> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i32> %new, <WIDTH x i32> %old
store <WIDTH x i32> %result, <WIDTH x i32> * %ptr
ret void
}

define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
<WIDTH x i64> %new, <WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i64> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i64> %new, <WIDTH x i64> %old
store <WIDTH x i64> %result, <WIDTH x i64> * %ptr
ret void
}

;; yuck. We need declarations of these, even though we shouldn't ever
;; actually generate calls to them for the NEON target...


include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

packed_load_and_store(4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch

define_prefetches()
@@ -5,6 +5,10 @@ define(`WIDTH',`1')

include(`util.m4')

include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)

; Define some basics for a 1-wide target
stdlib_core()
packed_load_and_store()
@@ -467,6 +471,9 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {

declare i32 @llvm.ctpop.i32(i32) nounwind readnone

declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone

define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
@@ -643,103 +650,6 @@ define <1 x double> @__rsqrt_varying_double(<1 x double> %v) nounwind readonly
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.sin.f32(float %r)
;%rv = insertelement <1 x float> undef, float %s, i32 0
;ret <1 x float> %rv
unary1to1(float,@llvm.sin.f32)

}

define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.cos.f32(float %r)
;%rv = insertelement <1 x float> undef, float %s, i32 0
;ret <1 x float> %rv
unary1to1(float, @llvm.cos.f32)

}

define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
; store <1 x float> %s, <1 x float> * %1
; ret void
%sin = call <1 x float> @__svml_sin (<1 x float> %0)
%cos = call <1 x float> @__svml_cos (<1 x float> %0)
store <1 x float> %sin, <1 x float> * %1
store <1 x float> %cos, <1 x float> * %2
ret void
}

define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_tan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %s, i32 0
;ret <1 x float> %rv
;unary1to1(float, @llvm.tan.f32)
; UNSUPPORTED!
ret <1 x float > %0
}

define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
; ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_atan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %s, i32 0
;ret <1 x float> %rv
;unary1to1(float,@llvm.atan.f32)
;UNSUPPORTED!
ret <1 x float > %0

}

define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
;%y = extractelement <1 x float> %0, i32 0
;%x = extractelement <1 x float> %1, i32 0
;%q = fdiv float %y, %x
;%a = call float @llvm.atan.f32 (float %q)
;%rv = insertelement <1 x float> undef, float %a, i32 0
;ret <1 x float> %rv
; UNSUPPORTED!
ret <1 x float > %0
}

define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.exp.f32)
}

define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.log.f32)
}

define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
%r = extractelement <1 x float> %0, i32 0
%e = extractelement <1 x float> %1, i32 0
%s = call float @llvm.pow.f32(float %r,float %e)
%rv = insertelement <1 x float> undef, float %s, i32 0
ret <1 x float> %rv

}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

@@ -957,3 +867,8 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()

@@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
ret i64 %val
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()


@@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define_x(float,f4,4,f,8)


define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_sinf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_cosf4, %0)
ret <8 x float> %ret
}

define void @__svml_sincos(<8 x float>, <8 x float> *,
<8 x float> *) nounwind readnone alwaysinline {
; call svml_sincosf4 two times with the two 4-wide sub-vectors
%a = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%b = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>

%cospa = alloca <4 x float>
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)

%cospb = alloca <4 x float>
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)

%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %sin, <8 x float> * %1

%cosa = load <4 x float> * %cospa
%cosb = load <4 x float> * %cospb
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %cos, <8 x float> * %2

ret void
}

define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_tanf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_atanf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_atan2(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
ret <8 x float> %ret
}

define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_expf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_logf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_pow(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_powf4, %0, %1)
ret <8 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,8)


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -367,6 +294,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
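
;; psadbw against zero computes the sum of the bytes in each 8-byte half of
;; the vector (sum of absolute differences from 0), so padding the 8 input
;; lanes out to 16 bytes with zeros (shuffle index 8 selects lane 0 of
;; zeroinitializer) and adding the two halves yields the full reduction.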

define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}

define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}

define <4 x float> @__vec4_add_float(<4 x float> %v0,
<4 x float> %v1) nounwind readnone alwaysinline {
%v = fadd <4 x float> %v0, %v1

@@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}

define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}

define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}

define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -466,62 +496,15 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)

;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,4)

define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}

define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}

define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}

define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

builtins/target-sse4-16.ll (new file, 490 lines)
@@ -0,0 +1,490 @@
;; Copyright (c) 2013, Google, Inc.
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Google, Inc. nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Define common 8-wide stuff (8 lanes with a 16-bit mask)
define(`WIDTH',`8')
define(`MASK',`i16')
include(`util.m4')

stdlib_core()
packed_load_and_store()
scans()
int64minmax()

include(`target-sse4-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readonly alwaysinline {
unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <8 x float> %0, %call
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <8 x float> %call, %two_minus
ret <8 x float> %iv_mul
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <8 x float> %v, %is
%v_is_is = fmul <8 x float> %v_is, %is
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <8 x float> %is, %three_sub
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <8 x float> %half_scale
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <8 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind
alwaysinline {
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <8 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
round4to8(%0, 8)
}

define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to8(%0, 9)
}

define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to8(%0, 10)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 8)
}

define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 9)
}

define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 10)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <8 x float> %call
}

define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <8 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %call
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %call
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone {
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <8 x double> %ret
}

define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone {
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml

; FIXME
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone

define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline {
%m8 = trunc <8 x MASK> %0 to <8 x i8>
%mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8)
%m64 = zext i32 %m to i64
ret i64 %m64
}
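
;; pmovmskb collects the most-significant bit of each of the 16 bytes: the
;; 8 mask lanes are truncated to bytes (all-ones or all-zeros, so the MSB
;; equals the lane's value) and padded with zero bytes, leaving the 8-bit
;; lane mask in the low bits of the result.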
|
||||
|
||||
define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline {
|
||||
%m = call i64 @__movmsk(<8 x MASK> %0)
|
||||
%mne = icmp ne i64 %m, 0
|
||||
ret i1 %mne
|
||||
}
|
||||
|
||||
define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline {
|
||||
%m = call i64 @__movmsk(<8 x MASK> %0)
|
||||
%meq = icmp eq i64 %m, ALL_ON_MASK
|
||||
ret i1 %meq
|
||||
}
|
||||
|
||||
define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline {
|
||||
%m = call i64 @__movmsk(<8 x MASK> %0)
|
||||
%meq = icmp eq i64 %m, 0
|
||||
ret i1 %meq
|
||||
}
|
||||
|
||||
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
|
||||
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
|
||||
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
|
||||
<16 x i8> zeroinitializer)
|
||||
%r0 = extractelement <2 x i64> %rv, i32 0
|
||||
%r1 = extractelement <2 x i64> %rv, i32 1
|
||||
%r = add i64 %r0, %r1
|
||||
%r16 = trunc i64 %r to i16
|
||||
ret i16 %r16
|
||||
}
|
||||

define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}

define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}

define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) {
%r = fadd <8 x float> %0, %1
ret <8 x float> %r
}

define internal float @__add_uniform_float(float, float) {
%r = fadd float %0, %1
ret float %r
}

define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
reduce8(float, @__add_varying_float, @__add_uniform_float)
}

define float @__reduce_min_float(<8 x float>) nounwind readnone {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}

define float @__reduce_max_float(<8 x float>) nounwind readnone {
reduce8(float, @__max_varying_float, @__max_uniform_float)
}

define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) {
%r = add <8 x i32> %0, %1
ret <8 x i32> %r
}

define internal i32 @__add_uniform_int32(i32, i32) {
%r = add i32 %0, %1
ret i32 %r
}

define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone {
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
}

define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone {
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}

define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}

define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) {
%r = fadd <8 x double> %0, %1
ret <8 x double> %r
}

define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}

define double @__reduce_add_double(<8 x double>) nounwind readnone {
reduce8(double, @__add_varying_double, @__add_uniform_double)
}

define double @__reduce_min_double(<8 x double>) nounwind readnone {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<8 x double>) nounwind readnone {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}

define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) {
%r = add <8 x i64> %0, %1
ret <8 x i64> %r
}

define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}

define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
}

define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>,
<8 x MASK> %mask) nounwind
alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i64>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old
store <8 x i64> %blend, <8 x i64>* %0, align 4
ret void
}

define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i32>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old
store <8 x i32> %blend, <8 x i32>* %0, align 4
ret void
}

define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>,
<8 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i16>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old
store <8 x i16> %blend, <8 x i16>* %0, align 4
ret void
}

define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
<8 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i8>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old
store <8 x i8> %blend, <8 x i8>* %0, align 4
ret void
}

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)

masked_store_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

; define these with the macros from stdlib.m4

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) {
%v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1)
%r = shufflevector <16 x i8> %r16, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %r
}

declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) {
%r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}

define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()

builtins/target-sse4-8.ll
@@ -0,0 +1,492 @@
;; Copyright (c) 2013, Google, Inc.
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Google, Inc. nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define common 16-wide stuff
define(`WIDTH',`16')
define(`MASK',`i8')
include(`util.m4')

stdlib_core()
packed_load_and_store()
scans()
int64minmax()

include(`target-sse4-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readonly alwaysinline {
unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <16 x float> %0, %call
%two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <16 x float> %call, %two_minus
ret <16 x float> %iv_mul
}
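
;; As a hedged illustration (hypothetical helper, not part of this
;; target): the same Newton-Raphson update at the 4-wide granularity the
;; rcpps instruction provides. Given the hardware estimate iv ~= 1/v,
;; one iteration computes iv * (2 - v * iv), roughly doubling the number
;; of accurate bits.
define <4 x float> @__rcp4_nr_example(<4 x float> %v) nounwind readnone {
; initial ~12-bit estimate from rcpps
%iv = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %v)
%v_iv = fmul <4 x float> %v, %iv
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%r = fmul <4 x float> %iv, %two_minus
ret <4 x float> %r
}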

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <16 x float> %v, %is
%v_is_is = fmul <16 x float> %v_is, %is
%three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <16 x float> %is, %three_sub
%half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <16 x float> %half_scale
}
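
;; The same kind of hedged sketch for rsqrt (hypothetical helper, not
;; part of this target): one Newton-Raphson step refines the rsqrtps
;; estimate is ~= 1/sqrt(v) as 0.5 * is * (3 - (v * is) * is).
define <4 x float> @__rsqrt4_nr_example(<4 x float> %v) nounwind readnone {
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%r = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %r
}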

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <16 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind
alwaysinline {
unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <16 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
round4to16(%0, 8)
}

define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to16(%0, 9)
}

define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to16(%0, 10)
}
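
;; The roundps immediate packs the rounding mode in bits 0-1 (0b00
;; nearest, 0b01 down, 0b10 up, 0b11 toward zero) and sets bit 3 to
;; suppress precision exceptions, which is where the 8, 9, and 10 above
;; come from. As a hedged illustration (hypothetical helper, not part of
;; this target), truncation would use 0b1011 = 11:
define <4 x float> @__trunc4_example(<4 x float>) nounwind readnone {
%r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 11)
ret <4 x float> %r
}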

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
; XXXround2to4double(%0, 8)
; FIXME: need round2to16double in util.m4...
ret <16 x double> undef
}

define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
; XXXround2to4double(%0, 9)
ret <16 x double> undef
}

define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
; XXXround2to4double(%0, 10)
ret <16 x double> undef
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <16 x float> %call
}

define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <16 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %call
}

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %call
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone {
binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <16 x double> %ret
}

define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone {
binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <16 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml

; FIXME

include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone

define i64 @__movmsk(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%m64 = zext i32 %m to i64
ret i64 %m64
}

define i1 @__any(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%mne = icmp ne i32 %m, 0
ret i1 %mne
}

define i1 @__all(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%meq = icmp eq i32 %m, ALL_ON_MASK
ret i1 %meq
}

define i1 @__none(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%meq = icmp eq i32 %m, 0
ret i1 %meq
}

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}

define internal <16 x i16> @__add_varying_i16(<16 x i16>,
<16 x i16>) nounwind readnone alwaysinline {
%r = add <16 x i16> %0, %1
ret <16 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}

define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}

define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) {
%r = fadd <16 x float> %0, %1
ret <16 x float> %r
}

define internal float @__add_uniform_float(float, float) {
%r = fadd float %0, %1
ret float %r
}

define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
reduce16(float, @__add_varying_float, @__add_uniform_float)
}

define float @__reduce_min_float(<16 x float>) nounwind readnone {
reduce16(float, @__min_varying_float, @__min_uniform_float)
}

define float @__reduce_max_float(<16 x float>) nounwind readnone {
reduce16(float, @__max_varying_float, @__max_uniform_float)
}

define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) {
%r = add <16 x i32> %0, %1
ret <16 x i32> %r
}

define internal i32 @__add_uniform_int32(i32, i32) {
%r = add i32 %0, %1
ret i32 %r
}

define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
}

define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
}

define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
}

define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone {
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone {
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) {
%r = fadd <16 x double> %0, %1
ret <16 x double> %r
}

define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}

define double @__reduce_add_double(<16 x double>) nounwind readnone {
reduce16(double, @__add_varying_double, @__add_uniform_double)
}

define double @__reduce_min_double(<16 x double>) nounwind readnone {
reduce16(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<16 x double>) nounwind readnone {
reduce16(double, @__max_varying_double, @__max_uniform_double)
}

define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) {
%r = add <16 x i64> %0, %1
ret <16 x i64> %r
}

define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}

define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}

define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone {
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(16)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>,
<16 x MASK> %mask) nounwind
alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i64>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old
store <16 x i64> %blend, <16 x i64>* %0, align 4
ret void
}

define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i32>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old
store <16 x i32> %blend, <16 x i32>* %0, align 4
ret void
}

define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
<16 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i16>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old
store <16 x i16> %blend, <16 x i16>* %0, align 4
ret void
}

declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
<16 x MASK> %mask) nounwind alwaysinline {
%old = load <16 x i8>* %0, align 4
%blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1,
<16 x i8> %mask)
store <16 x i8> %blend, <16 x i8>* %0, align 4
ret void
}

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)

masked_store_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

; define these with the macros from stdlib.m4

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}

declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}

define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()
@@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define_x(float,f4,4,f,8)


define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_sinf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_cosf4, %0)
ret <8 x float> %ret
}

define void @__svml_sincos(<8 x float>, <8 x float> *,
<8 x float> *) nounwind readnone alwaysinline {
; call svml_sincosf4 two times with the two 4-wide sub-vectors
%a = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%b = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>

%cospa = alloca <4 x float>
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)

%cospb = alloca <4 x float>
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)

%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %sin, <8 x float> * %1

%cosa = load <4 x float> * %cospa
%cosb = load <4 x float> * %cospb
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %cos, <8 x float> * %2

ret void
}

define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_tanf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_atanf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_atan2(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
ret <8 x float> %ret
}

define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_expf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_logf4, %0)
ret <8 x float> %ret
}

define <8 x float> @__svml_pow(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_powf4, %0, %1)
ret <8 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,8)


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -309,6 +236,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}

define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}

define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}

define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
}
@@ -629,3 +586,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()


@@ -209,62 +209,14 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)


define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}

define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}

define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}

define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}

define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
@@ -299,6 +251,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}

define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}

define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
@@ -503,3 +485,9 @@ gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()


builtins/util.m4
@@ -49,6 +49,63 @@ define(`MASK_HIGH_BIT_ON',

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; vector deconstruction utilities
;; split 8-wide vector into 2 4-wide vectors
;;
;; $1: vector element type
;; $2: 8-wide vector
;; $3: first 4-wide vector
;; $4: second 4-wide vector

define(`v8tov4', `
$3 = shufflevector <8 x $1> $2, <8 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
$4 = shufflevector <8 x $1> $2, <8 x $1> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
')
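
;; Usage sketch (illustrative only, not emitted into targets): with
;; $1=float, $2=%v, $3=%lo, $4=%hi, v8tov4 expands to
;;   %lo = shufflevector <8 x float> %v, <8 x float> undef,
;;                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;;   %hi = shufflevector <8 x float> %v, <8 x float> undef,
;;                       <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;; binding %lo and %hi to the low and high halves of %v.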

define(`v16tov8', `
$3 = shufflevector <16 x $1> $2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
$4 = shufflevector <16 x $1> $2, <16 x $1> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')

define(`v4tov2', `
$3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 0, i32 1>
$4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 2, i32 3>
')

define(`v8tov2', `
$3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 0, i32 1>
$4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 2, i32 3>
$5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 4, i32 5>
$6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> <i32 6, i32 7>
')

define(`v16tov4', `
$3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
$4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
$5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
$6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; vector assembly: wider vector from two narrower vectors
;;
;; $1: vector element type
;; $2: first n-wide vector
;; $3: second n-wide vector
;; $4: result 2*n-wide vector
define(`v8tov16', `
$4 = shufflevector <8 x $1> $2, <8 x $1> $3,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
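
;; Usage sketch (illustrative only): v8tov16(i16, %r0, %r1, %r) expands to
;;   %r = shufflevector <8 x i16> %r0, <8 x i16> %r1,
;;        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
;;                    i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
;; i.e. %r is the 16-wide concatenation of %r0 and %r1; this is how
;; target-sse4-8.ll reassembles the two halves in @__avg_up_uint16 above.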

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Helper macro for calling various SSE instructions for scalar values
;; but where the instruction takes a vector parameter.
;; $1 : name of variable to put the final value in
@@ -156,10 +213,7 @@ define(`reduce16', `
;; the final reduction

define(`reduce8by4', `
%v1 = shufflevector <8 x $1> %0, <8 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v2 = shufflevector <8 x $1> %0, <8 x $1> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
v8tov4($1, %0, %v1, %v2)
%m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2)
%v3 = shufflevector <4 x $1> %m1, <4 x $1> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -266,30 +320,66 @@ define(`binary2to4', `
;; $4: 8-wide operand value

define(`unary4to8', `
%$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
%$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
%$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
%__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0)
%__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1)
%$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
'
)

;; $1: name of variable into which the final result should go
;; $2: scalar type of the input vector elements
;; $3: scalar type of the result vector elements
;; $4: 4-wide unary vector function to apply
;; $5: 8-wide operand value

define(`unary4to8conv', `
%$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0)
%$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1)
%$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
'
)
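
;; Usage sketch (illustrative only): converting eight i32 lane values to
;; float with the 4-wide SSE conversion intrinsic (assuming
;; @llvm.x86.sse2.cvtdq2ps is declared in the including target),
;;   unary4to8conv(r, i32, float, @llvm.x86.sse2.cvtdq2ps, %v)
;; leaves the converted <8 x float> result in %r.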

define(`unary4to16', `
%$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
%$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
%$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2)
%$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3)
%__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0)
%__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1)
%__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2)
%__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3)

%$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
%__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3,
%__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b,
%$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
)

define(`unary4to16conv', `
%$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0)
%$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1)
%$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2)
%$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3)

%$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
@@ -411,6 +501,42 @@ define(`unary2to8', `
'
)

define(`unary2to16', `
%$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 0, i32 1>
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0)
%$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 2, i32 3>
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1)
%$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 4, i32 5>
%v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2)
%$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 6, i32 7>
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3)
%$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 8, i32 9>
%v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4)
%$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 10, i32 11>
%v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5)
%$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 12, i32 13>
%v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6)
%$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 14, i32 15>
%v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7)
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

%$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
)

;; Maps a 2-wide binary function to two 8-wide vector operands
;; $1: name of variable into which the final result should go
;; $2: scalar type of the vector elements
@@ -432,12 +558,58 @@ define(`binary2to8', `
%$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 6, i32 7>
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b)

%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
'
)

define(`binary2to16', `
%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 0, i32 1>
%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 0, i32 1>
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b)
%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 2, i32 3>
%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 2, i32 3>
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b)
%$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 4, i32 5>
%$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 4, i32 5>
%v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b)
%$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 6, i32 7>
%$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 6, i32 7>
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b)
%$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 8, i32 9>
%$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 8, i32 9>
%v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b)
%$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 10, i32 11>
%$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 10, i32 11>
%v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b)
%$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 12, i32 13>
%$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 12, i32 13>
%v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b)
%$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 14, i32 15>
%$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 14, i32 15>
%v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b)

%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
%$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

%$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

%$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
)
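
;; Usage sketch (illustrative only): applying the 2-wide SSE2 double
;; minimum across a 16-wide vector (assuming @llvm.x86.sse2.min.pd is
;; declared in the including target),
;;   binary2to16(r, double, @llvm.x86.sse2.min.pd, %a, %b)
;; computes the elementwise minimum of two <16 x double> values into %r.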

@@ -460,6 +632,26 @@ ret <8 x float> %ret
'
)

define(`round4to16', `
%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2)
%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2)
%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2)
%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2)
%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %ret
'
)

define(`round8to16', `
%v0 = shufflevector <16 x float> $1, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -690,6 +882,91 @@ shuffles(i64, 8)
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)

define(`mask_converts', `
define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i64>
ret <$1 x i64> %r
}

define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) {
ret <$1 x i8> %0
}
define internal <$1 x i16> @convertmask_i8_i16_$1(<$1 x i8>) {
%r = sext <$1 x i8> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) {
%r = sext <$1 x i8> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) {
%r = sext <$1 x i8> %0 to <$1 x i64>
ret <$1 x i64> %r
}

define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) {
%r = trunc <$1 x i16> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) {
ret <$1 x i16> %0
}
define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) {
%r = sext <$1 x i16> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) {
%r = sext <$1 x i16> %0 to <$1 x i64>
ret <$1 x i64> %r
}

define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) {
%r = trunc <$1 x i32> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) {
%r = trunc <$1 x i32> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) {
ret <$1 x i32> %0
}
define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
%r = sext <$1 x i32> %0 to <$1 x i64>
ret <$1 x i64> %r
}

define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) {
%r = trunc <$1 x i64> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) {
%r = trunc <$1 x i64> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) {
%r = trunc <$1 x i64> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
ret <$1 x i64> %0
}
')
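
;; Usage sketch (illustrative only): after mask_converts(WIDTH) below, a
;; target with MASK=i8 and WIDTH=16 gets, among others,
;;   @convertmask_i8_i32_16(<16 x i8>) -> <16 x i32>
;; which sign extends the per-lane 8-bit mask so all-on lanes remain
;; all-on at the wider element size.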

mask_converts(WIDTH)

define(`global_atomic_associative', `

define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
@@ -697,17 +974,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
; first, for any lanes where the mask is off, compute a vector where those lanes
; hold the identity value..

; for the bit tricks below, we need the mask to be sign extended to be
; the size of the element type.
ifelse(
MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>',
$3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>',
$3,i32, `
; silly workaround to do %mask = %m, which is not possible directly..
%maskmem = alloca <$1 x i32>
store <$1 x i32> %m, <$1 x i32> * %maskmem
%mask = load <$1 x i32> * %maskmem'
)
; for the bit tricks below, we need the mask to have the
; same element size as the element type.
%mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m)
|
||||
|
||||
; zero out any lanes that are off
|
||||
%valoff = and <$1 x $3> %val, %mask
|
||||
|
@@ -1551,11 +1821,6 @@ declare i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %mask)
declare i1 @__is_compile_time_constant_uniform_int32(i32)
declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)

define void @__pause() nounwind readnone {
  call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind
  ret void
}

; This function declares placeholder masked store functions for the
; front-end to use.
;
@@ -2440,13 +2705,16 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
}

define <WIDTH x i32> @__sext_varying_bool(<WIDTH x MASK>) nounwind readnone alwaysinline {
ifelse(MASK,i1, `
  %se = sext <WIDTH x i1> %0 to <WIDTH x i32>
;;  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
;;         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
;;          ret <WIDTH x i32> %se')
  ifelse(MASK,i32, `%se = bitcast <WIDTH x i32> %0 to <WIDTH x i32>',
         MASK,i64, `%se = trunc <WIDTH x MASK> %0 to <WIDTH x i32>',
         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>')
  ret <WIDTH x i32> %se
', `
  ret <WIDTH x i32> %0')
}
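
;; Sign extension is what maps true to all-ones here: a varying bool lane
;; becomes -1 when set and 0 when clear. One-line C equivalent (editor's
;; sketch, not code from this file):
;;
;; static int32_t sext_bool(_Bool b) { return b ? -1 : 0; }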

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; memcpy/memmove/memset

@@ -2830,17 +3098,11 @@ m4exit(`1')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; read hw clock

declare i64 @llvm.readcyclecounter()

define i64 @__clock() nounwind {
entry:
  tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
  %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
  %asmresult = extractvalue { i32, i32 } %0, 0
  %asmresult1 = extractvalue { i32, i32 } %0, 1
  %conv = zext i32 %asmresult1 to i64
  %shl = shl nuw i64 %conv, 32
  %conv2 = zext i32 %asmresult to i64
  %or = or i64 %shl, %conv2
  ret i64 %or
  %r = call i64 @llvm.readcyclecounter()
  ret i64 %r
}
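
;; The inline-asm path above serializes with cpuid and then reads the TSC with
;; rdtsc, which leaves the low 32 bits in %eax and the high 32 bits in %edx;
;; the shl/or pair splices them into one i64. In C (editor's sketch):
;;
;; #include <stdint.h>
;;
;; static uint64_t combine_tsc(uint32_t lo, uint32_t hi) {
;;     return ((uint64_t)hi << 32) | lo;
;; }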

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2918,6 +3180,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
}

declare double @sin(double) nounwind readnone
declare double @asin(double) nounwind readnone
declare double @cos(double) nounwind readnone
declare void @sincos(double, double *, double *) nounwind readnone
declare double @tan(double) nounwind readnone
@@ -2932,6 +3195,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline {
  ret double %r
}

define double @__stdlib_asin(double) nounwind readnone alwaysinline {
  %r = call double @asin(double %0)
  ret double %r
}

define double @__stdlib_cos(double) nounwind readnone alwaysinline {
  %r = call double @cos(double %0)
  ret double %r
@@ -3201,8 +3469,8 @@ return:
;; $1: llvm type of elements (and suffix for function name)

define(`gen_masked_store', `
define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x i32>) nounwind alwaysinline {
  per_lane(WIDTH, <WIDTH x i32> %2, `
define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x MASK>) nounwind alwaysinline {
  per_lane(WIDTH, <WIDTH x MASK> %2, `
      %ptr_LANE_ID = getelementptr <WIDTH x $1> * %0, i32 0, i32 LANE
      %storeval_LANE_ID = extractelement <WIDTH x $1> %1, i32 LANE
      store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID')
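
;; The per_lane expansion amounts to a guarded per-element store, roughly this
;; C (editor's illustration with a hypothetical width of 4 and i32 elements):
;;
;; #include <stdint.h>
;;
;; static void masked_store_i32(int32_t *ptr, const int32_t val[4],
;;                              const int32_t mask[4]) {
;;     for (int lane = 0; lane < 4; ++lane)
;;         if (mask[lane] != 0)          /* only active lanes touch memory */
;;             ptr[lane] = val[lane];
;; }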
@@ -3260,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
}
')

define(`masked_store_blend_8_16_by_4_mask64', `
define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
                                     <4 x i64>) nounwind alwaysinline {
  %old = load <4 x i8> * %0, align 1
ifelse(LLVM_VERSION,LLVM_3_0,`
  %old32 = bitcast <4 x i8> %old to i32
  %new32 = bitcast <4 x i8> %1 to i32

  %mask8 = trunc <4 x i64> %2 to <4 x i8>
  %mask32 = bitcast <4 x i8> %mask8 to i32
  %notmask32 = xor i32 %mask32, -1

  %newmasked = and i32 %new32, %mask32
  %oldmasked = and i32 %old32, %notmask32
  %result = or i32 %newmasked, %oldmasked

  %resultvec = bitcast i32 %result to <4 x i8>
',`
  %m = trunc <4 x i64> %2 to <4 x i1>
  %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
')
  store <4 x i8> %resultvec, <4 x i8> * %0, align 1
  ret void
}

define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
                                      <4 x i64>) nounwind alwaysinline {
  %old = load <4 x i16> * %0, align 2
ifelse(LLVM_VERSION,LLVM_3_0,`
  %old64 = bitcast <4 x i16> %old to i64
  %new64 = bitcast <4 x i16> %1 to i64

  %mask16 = trunc <4 x i64> %2 to <4 x i16>
  %mask64 = bitcast <4 x i16> %mask16 to i64
  %notmask64 = xor i64 %mask64, -1

  %newmasked = and i64 %new64, %mask64
  %oldmasked = and i64 %old64, %notmask64
  %result = or i64 %newmasked, %oldmasked

  %resultvec = bitcast i64 %result to <4 x i16>
',`
  %m = trunc <4 x i64> %2 to <4 x i1>
  %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
')
  store <4 x i16> %resultvec, <4 x i16> * %0, align 2
  ret void
}
')
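
;; With LLVM 3.0 the blend has to be done with integer bit twiddling, while
;; later versions can truncate the mask to <4 x i1> and use select directly.
;; The two are equivalent; in C the bit-trick path is (editor's sketch):
;;
;; #include <stdint.h>
;;
;; static uint32_t blend(uint32_t oldval, uint32_t newval, uint32_t mask) {
;;     /* newval where mask bits are set, oldval elsewhere */
;;     return (newval & mask) | (oldval & ~mask);
;; }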

define(`masked_store_blend_8_16_by_8', `
define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
                                     <8 x i32>) nounwind alwaysinline {
@@ -3378,10 +3696,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
define(`packed_load_and_store', `

define i32 @__packed_load_active(i32 * %startptr, <WIDTH x i32> * %val_ptr,
                                 <WIDTH x i32> %full_mask) nounwind alwaysinline {
                                 <WIDTH x MASK> %full_mask) nounwind alwaysinline {
entry:
  %mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
  %mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
  br i1 %mask_known, label %known_mask, label %unknown_mask

known_mask:
@@ -3432,10 +3750,10 @@ done:
}
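
;; __packed_load_active reads one consecutive value from %startptr per active
;; lane and returns the number of values read; __packed_store_active below is
;; the mirror image. Roughly, in C (editor's sketch of the semantics, width 4):
;;
;; #include <stdint.h>
;;
;; static int32_t packed_load_active(const int32_t *startptr, int32_t val[4],
;;                                   const int32_t mask[4]) {
;;     int32_t count = 0;
;;     for (int lane = 0; lane < 4; ++lane)
;;         if (mask[lane] != 0)
;;             val[lane] = startptr[count++];  /* consecutive reads into active lanes */
;;     return count;
;; }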

define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
                                  <WIDTH x i32> %full_mask) nounwind alwaysinline {
                                  <WIDTH x MASK> %full_mask) nounwind alwaysinline {
entry:
  %mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
  %mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
  br i1 %mask_known, label %known_mask, label %unknown_mask

known_mask:
@@ -3544,10 +3862,10 @@ check_neighbors:
  %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1)
  %vr = bitcast <$1 x $4> %castvr to <$1 x $2>
  %eq = $5 $7 <$1 x $2> %vec, %vr
ifelse(MASK,i32, `
  %eq32 = sext <$1 x i1> %eq to <$1 x i32>
  %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', `
  %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)')
ifelse(MASK,i1, `
  %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)',
  `%eqm = sext <$1 x i1> %eq to <$1 x MASK>
  %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)')
  %alleq = icmp eq i64 %eqmm, ALL_ON_MASK
  br i1 %alleq, label %all_equal, label %not_all_equal
', `
@@ -3722,9 +4040,9 @@ pl_done:
define(`gen_gather_general', `
; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
                                   <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
                                   <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
  %ret_ptr = alloca <WIDTH x $1>
  per_lane(WIDTH, <WIDTH x i32> %vecmask, `
  per_lane(WIDTH, <WIDTH x MASK> %vecmask, `
  %iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
  %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
  %val_LANE_ID = load $1 * %ptr_LANE_ID
@@ -3738,9 +4056,9 @@ define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,

; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
                                   <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
                                   <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
  %ret_ptr = alloca <WIDTH x $1>
  per_lane(WIDTH, <WIDTH x i32> %vecmask, `
  per_lane(WIDTH, <WIDTH x MASK> %vecmask, `
  %iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
  %val_LANE_ID = load $1 * %ptr_LANE_ID
@@ -3804,7 +4122,7 @@ define <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %o

define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
                                         <WIDTH x i32> %offset_delta,
                                         <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
                                         <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
  ; to require that the 0th element of the array being gathered from is always
  ; legal to read from (and we do indeed require that, given the benefits!)
@@ -3813,13 +4131,13 @@ define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32
  %offsetsPtr = alloca <WIDTH x i32>
  store <WIDTH x i32> zeroinitializer, <WIDTH x i32> * %offsetsPtr
  call void @__masked_store_blend_i32(<WIDTH x i32> * %offsetsPtr, <WIDTH x i32> %offsets,
                                      <WIDTH x i32> %vecmask)
                                      <WIDTH x MASK> %vecmask)
  %newOffsets = load <WIDTH x i32> * %offsetsPtr

  %deltaPtr = alloca <WIDTH x i32>
  store <WIDTH x i32> zeroinitializer, <WIDTH x i32> * %deltaPtr
  call void @__masked_store_blend_i32(<WIDTH x i32> * %deltaPtr, <WIDTH x i32> %offset_delta,
                                      <WIDTH x i32> %vecmask)
                                      <WIDTH x MASK> %vecmask)
  %newDelta = load <WIDTH x i32> * %deltaPtr

  %ret0 = call <WIDTH x $1> @__gather_elt32_$1(i8 * %ptr, <WIDTH x i32> %newOffsets,
@@ -3835,7 +4153,7 @@ define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32

define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
                                         <WIDTH x i64> %offset_delta,
                                         <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
                                         <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
  ; to require that the 0th element of the array being gathered from is always
  ; legal to read from (and we do indeed require that, given the benefits!)
@@ -3844,13 +4162,13 @@ define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64
  %offsetsPtr = alloca <WIDTH x i64>
  store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %offsetsPtr
  call void @__masked_store_blend_i64(<WIDTH x i64> * %offsetsPtr, <WIDTH x i64> %offsets,
                                      <WIDTH x MASK> %vecmask)
  %newOffsets = load <WIDTH x i64> * %offsetsPtr

  %deltaPtr = alloca <WIDTH x i64>
  store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %deltaPtr
  call void @__masked_store_blend_i64(<WIDTH x i64> * %deltaPtr, <WIDTH x i64> %offset_delta,
                                      <WIDTH x i32> %vecmask)
                                      <WIDTH x MASK> %vecmask)
  %newDelta = load <WIDTH x i64> * %deltaPtr

  %ret0 = call <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %newOffsets,
@@ -3876,27 +4194,27 @@ gen_gather_factored($1)
define <WIDTH x $1>
@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale,
                            <WIDTH x i32> %offsets,
                            <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
                            <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
  %scale_vec = bitcast i32 %offset_scale to <1 x i32>
  %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef,
     <WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
  %scaled_offsets = mul <WIDTH x i32> %smear_scale, %offsets
  %v = call <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %scaled_offsets, i32 1,
                                      <WIDTH x i32> zeroinitializer, <WIDTH x i32> %vecmask)
                                      <WIDTH x i32> zeroinitializer, <WIDTH x MASK> %vecmask)
  ret <WIDTH x $1> %v
}

define <WIDTH x $1>
@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale,
                            <WIDTH x i64> %offsets,
                            <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
                            <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
  %scale64 = zext i32 %offset_scale to i64
  %scale_vec = bitcast i64 %scale64 to <1 x i64>
  %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef,
     <WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
  %scaled_offsets = mul <WIDTH x i64> %smear_scale, %offsets
  %v = call <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %scaled_offsets,
                                      i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x i32> %vecmask)
                                      i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x MASK> %vecmask)
  ret <WIDTH x $1> %v
}
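
;; The trick above, spelled out: instead of branching per lane, the offsets of
;; masked-off lanes are blended with zero, so every lane issues its load
;; unconditionally and inactive lanes just (harmlessly) reread element 0,
;; which is exactly why element 0 must be legal to read. In C (editor's
;; sketch, width 4, i32 elements):
;;
;; #include <stdint.h>
;;
;; static void gather_base_offsets(const int32_t *base, const int32_t offsets[4],
;;                                 const int32_t mask[4], int32_t result[4]) {
;;     for (int lane = 0; lane < 4; ++lane) {
;;         int32_t off = mask[lane] ? offsets[lane] : 0; /* inactive -> element 0 */
;;         result[lane] = base[off];                     /* unconditional load */
;;     }
;; }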

@@ -3955,9 +4273,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_s

define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
                                         <WIDTH x i32> %offset_delta, <WIDTH x $1> %values,
                                         <WIDTH x i32> %mask) nounwind alwaysinline {
                                         <WIDTH x MASK> %mask) nounwind alwaysinline {
  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
  per_lane(WIDTH, <WIDTH x i32> %mask, `
  per_lane(WIDTH, <WIDTH x MASK> %mask, `
  call void @__scatter_elt32_$1(i8 * %base, <WIDTH x i32> %offsets, i32 %offset_scale,
                                <WIDTH x i32> %offset_delta, <WIDTH x $1> %values, i32 LANE)')
  ret void
@@ -3965,9 +4283,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offs

define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
                                         <WIDTH x i64> %offset_delta, <WIDTH x $1> %values,
                                         <WIDTH x i32> %mask) nounwind alwaysinline {
                                         <WIDTH x MASK> %mask) nounwind alwaysinline {
  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
  per_lane(WIDTH, <WIDTH x i32> %mask, `
  per_lane(WIDTH, <WIDTH x MASK> %mask, `
  call void @__scatter_elt64_$1(i8 * %base, <WIDTH x i64> %offsets, i32 %offset_scale,
                                <WIDTH x i64> %offset_delta, <WIDTH x $1> %values, i32 LANE)')
  ret void
@@ -3975,8 +4293,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offs

; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s
define void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,
                            <WIDTH x i32> %mask) nounwind alwaysinline {
  per_lane(WIDTH, <WIDTH x i32> %mask, `
                            <WIDTH x MASK> %mask) nounwind alwaysinline {
  per_lane(WIDTH, <WIDTH x MASK> %mask, `
  %iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
  %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
  %val_LANE_ID = extractelement <WIDTH x $1> %values, i32 LANE
@@ -3987,8 +4305,8 @@ define void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,

; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s
define void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,
                            <WIDTH x i32> %mask) nounwind alwaysinline {
  per_lane(WIDTH, <WIDTH x i32> %mask, `
                            <WIDTH x MASK> %mask) nounwind alwaysinline {
  per_lane(WIDTH, <WIDTH x MASK> %mask, `
  %iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
  %val_LANE_ID = extractelement <WIDTH x $1> %values, i32 LANE
@@ -4044,3 +4362,109 @@ define i1 @__rdrand_i64(i64 * %ptr) {
  ret i1 %good
}
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define(`define_avg_up_uint8', `
define <WIDTH x i8> @__avg_up_uint8(<WIDTH x i8>, <WIDTH x i8>) {
  %a16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
  %b16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
  %sum1 = add <WIDTH x i16> %a16, %b16
  %sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
  %avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
  %r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
  ret <WIDTH x i8> %r
}')

define(`define_avg_up_int8', `
define <WIDTH x i8> @__avg_up_int8(<WIDTH x i8>, <WIDTH x i8>) {
  %a16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
  %b16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
  %sum1 = add <WIDTH x i16> %a16, %b16
  %sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
  %avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 >
  %r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
  ret <WIDTH x i8> %r
}')

define(`define_avg_up_uint16', `
define <WIDTH x i16> @__avg_up_uint16(<WIDTH x i16>, <WIDTH x i16>) {
  %a32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
  %b32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
  %sum1 = add <WIDTH x i32> %a32, %b32
  %sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
  %avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
  %r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
  ret <WIDTH x i16> %r
}')

define(`define_avg_up_int16', `
define <WIDTH x i16> @__avg_up_int16(<WIDTH x i16>, <WIDTH x i16>) {
  %a32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
  %b32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
  %sum1 = add <WIDTH x i32> %a32, %b32
  %sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
  %avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 >
  %r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
  ret <WIDTH x i16> %r
}')

define(`define_avg_down_uint8', `
define <WIDTH x i8> @__avg_down_uint8(<WIDTH x i8>, <WIDTH x i8>) {
  %a16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
  %b16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
  %sum = add <WIDTH x i16> %a16, %b16
  %avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
  %r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
  ret <WIDTH x i8> %r
}')

define(`define_avg_down_int8', `
define <WIDTH x i8> @__avg_down_int8(<WIDTH x i8>, <WIDTH x i8>) {
  %a16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
  %b16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
  %sum = add <WIDTH x i16> %a16, %b16
  %avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 >
  %r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
  ret <WIDTH x i8> %r
}')

define(`define_avg_down_uint16', `
define <WIDTH x i16> @__avg_down_uint16(<WIDTH x i16>, <WIDTH x i16>) {
  %a32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
  %b32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
  %sum = add <WIDTH x i32> %a32, %b32
  %avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
  %r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
  ret <WIDTH x i16> %r
}')

define(`define_avg_down_int16', `
define <WIDTH x i16> @__avg_down_int16(<WIDTH x i16>, <WIDTH x i16>) {
  %a32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
  %b32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
  %sum = add <WIDTH x i32> %a32, %b32
  %avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 >
  %r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
  ret <WIDTH x i16> %r
}')
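
;; All of these widen first so the intermediate add cannot overflow, then
;; halve: the "up" variants add 1 to the sum first so exact halves round
;; upward, while the "down" variants do not. The unsigned versions halve with
;; a logical shift; the signed ones use sdiv, which truncates toward zero.
;; Scalar C equivalents (editor's sketch, not code generated from this file):
;;
;; #include <stdint.h>
;;
;; static uint8_t avg_up_uint8(uint8_t a, uint8_t b) {
;;     return (uint8_t)(((uint16_t)a + b + 1) >> 1);
;; }
;; static uint8_t avg_down_uint8(uint8_t a, uint8_t b) {
;;     return (uint8_t)(((uint16_t)a + b) >> 1);
;; }
;; static int8_t avg_up_int8(int8_t a, int8_t b) {
;;     return (int8_t)(((int16_t)a + b + 1) / 2);  /* C "/" truncates, like sdiv */
;; }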

define(`define_up_avgs', `
define_avg_up_uint8()
define_avg_up_int8()
define_avg_up_uint16()
define_avg_up_int16()
')

define(`define_down_avgs', `
define_avg_down_uint8()
define_avg_down_int8()
define_avg_down_uint16()
define_avg_down_int16()
')

define(`define_avgs', `
define_up_avgs()
define_down_avgs()
')