Merge pull request #578 from egaburov/master

added --target=avx-i64x4 & svml support for all sse/avx modes
This commit is contained in:
Dmitry Babokin
2013-09-13 09:40:24 -07:00
22 changed files with 1148 additions and 387 deletions

1
.gitignore vendored
View File

@@ -11,5 +11,6 @@ tests*/*run
examples/*/*.png
examples/*/*.ppm
examples/*/objs/*
*.swp

View File

@@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
TARGETS=avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
ifneq ($(ARM_ENABLED), 0)
@@ -160,7 +160,7 @@ BISON_SRC=parse.yy
FLEX_SRC=lex.ll
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \
$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
default: ispc
@@ -246,15 +246,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll)
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $<
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(32 bit version\)
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(64 bit version\)
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@
@@ -268,20 +268,25 @@ objs/builtins-c-64.cpp: builtins/builtins.c
objs/stdlib_mask1_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask1
@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask1 > $@
objs/stdlib_mask8_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask8
@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \
@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask8 > $@
objs/stdlib_mask16_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask16
@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \
@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask16 > $@
objs/stdlib_mask32_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask32
@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \
@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask32 > $@
objs/stdlib_mask64_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask64
@$(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask64 > $@

View File

@@ -302,6 +302,7 @@ lCheckModuleIntrinsics(llvm::Module *module) {
// check the llvm.x86.* intrinsics for now...
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
if (id == 0) fprintf(stderr, "FATAL: intrinsic is not found: %s \n", funcName.c_str());
Assert(id != 0);
llvm::Type *intrinsicType =
llvm::Intrinsic::getType(*g->ctx, id);
@@ -576,20 +577,34 @@ lSetInternalFunctions(llvm::Module *module) {
"__stdlib_pow",
"__stdlib_powf",
"__stdlib_sin",
"__stdlib_asin",
"__stdlib_sincos",
"__stdlib_sincosf",
"__stdlib_sinf",
"__stdlib_tan",
"__stdlib_tanf",
"__svml_sin",
"__svml_cos",
"__svml_sincos",
"__svml_tan",
"__svml_atan",
"__svml_atan2",
"__svml_exp",
"__svml_log",
"__svml_pow",
"__svml_sind",
"__svml_asind",
"__svml_cosd",
"__svml_acosd",
"__svml_sincosd",
"__svml_tand",
"__svml_atand",
"__svml_atan2d",
"__svml_expd",
"__svml_logd",
"__svml_powd",
"__svml_sinf",
"__svml_asinf",
"__svml_cosf",
"__svml_acosf",
"__svml_sincosf",
"__svml_tanf",
"__svml_atanf",
"__svml_atan2f",
"__svml_expf",
"__svml_logf",
"__svml_powf",
"__undef_uniform",
"__undef_varying",
"__vec4_add_float",
@@ -920,6 +935,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
case Target::AVX: {
switch (g->target->getVectorWidth()) {
case 4:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit);
}
break;
case 8:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx1_32bit);
@@ -1083,7 +1106,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// serialized version of the stdlib.ispc file to get its
// definitions added.
extern char stdlib_mask1_code[], stdlib_mask8_code[];
extern char stdlib_mask16_code[], stdlib_mask32_code[];
extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[];
if (g->target->getISA() == Target::GENERIC &&
g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib
yy_scan_string(stdlib_mask32_code);
@@ -1102,6 +1125,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
case 32:
yy_scan_string(stdlib_mask32_code);
break;
case 64:
yy_scan_string(stdlib_mask64_code);
break;
default:
FATAL("Unhandled mask bit size for stdlib.ispc");
}

217
builtins/svml.m4 Normal file
View File

@@ -0,0 +1,217 @@
;; copyright stub :)
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;; svml macro
;; svml_stubs : stubs for svml calls
;; $1 - type ("float" or "double")
;; $2 - svml internal function suffix ("f" for float, "d" for double)
;; $3 - vector width
define(`svml_stubs',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline
declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
')
;; svml_declare : declaration of __svml_* intrinsics
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
;; double: "2"(sse) "4"(avx) "8"(avx512)
;; $3 - vector width
define(`svml_declare',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
');
;; defintition of __svml_* internal functions
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
;; double: "2"(sse) "4"(avx) "8"(avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
define(`svml_define',`
define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline {
%s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0)
store <$3 x $1> %s, <$3 x $1> * %1
ret void
}
define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1)
ret <$3 x $1> %ret
}
')
;; svml_define_x : defintition of __svml_* internal functions operation on extended width
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
;; double: "2"(sse) "4"(avx) "8"(avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
;; $5 - extended width, must be at least twice the native vector width
;; contigent on existing of unary$3to$5 and binary$3to$5 macros
;; *todo*: in sincos call use __svml_sincos[f][2,4,8,16] call, e.g.
;;define void @__svml_sincosf(<8 x float>, <8 x float> *,
;; <8 x float> *) nounwind readnone alwaysinline {
;; ; call svml_sincosf4 two times with the two 4-wide sub-vectors
;; %a = shufflevector <8 x float> %0, <8 x float> undef,
;; <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;; %b = shufflevector <8 x float> %0, <8 x float> undef,
;; <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;;
;; %cospa = alloca <4 x float>
;; %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
;;
;; %cospb = alloca <4 x float>
;; %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
;;
;; %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
;; <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;; i32 4, i32 5, i32 6, i32 7>
;; store <8 x float> %sin, <8 x float> * %1
;;
;; %cosa = load <4 x float> * %cospa
;; %cosb = load <4 x float> * %cospb
;; %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
;; <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;; i32 4, i32 5, i32 6, i32 7>
;; store <8 x float> %cos, <8 x float> * %2
;;
;; ret void
;;}
define(`svml_define_x',`
define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_sin$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_asin$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_cos$2, %0)
ret <$5 x $1> %ret
}
define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
{
%s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
%c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
store <$5 x $1> %s, <$5 x $1> * %1
store <$5 x $1> %c, <$5 x $1> * %2
ret void
}
define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_tan$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_atan$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_exp$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_log$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
binary$3to$5(ret, $1, @__svml_pow$2, %0, %1)
ret <$5 x $1> %ret
}
')

View File

@@ -137,19 +137,14 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones 4x with our 16-wide
; vectors...
include(`svml.m4')
;; single precision
svml_declare(float,f8,8)
svml_define_x(float,f8,8,f,16)
declare <16 x float> @__svml_sin(<16 x float>)
declare <16 x float> @__svml_cos(<16 x float>)
declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
declare <16 x float> @__svml_tan(<16 x float>)
declare <16 x float> @__svml_atan(<16 x float>)
declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
declare <16 x float> @__svml_exp(<16 x float>)
declare <16 x float> @__svml_log(<16 x float>)
declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
;; double precision
svml_declare(double,4,4)
svml_define_x(double,4,4,d,16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

View File

@@ -137,19 +137,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...
include(`svml.m4')
;; single precision
svml_declare(float,f8,8)
svml_define(float,f8,8,f)
declare <8 x float> @__svml_sin(<8 x float>)
declare <8 x float> @__svml_cos(<8 x float>)
declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
declare <8 x float> @__svml_tan(<8 x float>)
declare <8 x float> @__svml_atan(<8 x float>)
declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
declare <8 x float> @__svml_exp(<8 x float>)
declare <8 x float> @__svml_log(<8 x float>)
declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
;; double precision
svml_declare(double,4,4)
svml_define_x(double,4,4,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

View File

@@ -0,0 +1,81 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx1-i64x4base.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

View File

@@ -0,0 +1,513 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 4-wide definitions
define(`WIDTH',`4')
define(`MASK',`i64')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
ret <4 x float> %call
}
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
ret <4 x float> %call
}
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
;; avx intrinsic
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
ret <4 x double> %call
}
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
ret <4 x double> %call
}
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
ret <4 x double> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt
;; sse intrinsic
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
;; avx<76> intrinsic
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
%call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
ret <4 x double> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)
;; double precision
svml_declare(double,4,4)
svml_define(double,4,4,d)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
;; sse intrinsics
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
;; sse intrinsic
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops
;; sse intrinsic
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
%scalar = extractelement <4 x float> %v2, i32 0
ret float %scalar
}
define float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline
{
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
define <4 x i32> @__add_varying_int32(<4 x i32>,
<4 x i32>) nounwind readnone alwaysinline {
%s = add <4 x i32> %0, %1
ret <4 x i32> %s
}
define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
%s = add i32 %0, %1
ret i32 %s
}
define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
}
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <4 x double> <double 0.,double 0.,double 0.,double 0.>, <4 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
;; %v1 = <4 x double> <double 0., double 0., double 0., double 0.>
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%final0 = extractelement <4 x double> %sum1, i32 0
%final1 = extractelement <4 x double> %sum1, i32 2
%sum = fadd double %final0, %final1
ret double %sum
}
define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops
define <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
%s = add <4 x i64> %0, %1
ret <4 x i64> %s
}
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
%s = add i64 %0, %1
ret i64 %s
}
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
; no masked load instruction for i8 and i16 types??
masked_load(i8, 1)
masked_load(i16, 2)
;; avx intrinsics
declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
%mask = trunc <4 x i64> %mask64 to <4 x i32>
%floatmask = bitcast <4 x i32> %mask to <4 x float>
%floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
%retval = bitcast <4 x float> %floatval to <4 x i32>
ret <4 x i32> %retval
}
define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
%doublemask = bitcast <4 x i64> %mask to <4 x double>
%doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask)
%retval = bitcast <4 x double> %doubleval to <4 x i64>
ret <4 x i64> %retval
}
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
;; avx intrinsics
declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i64>) nounwind alwaysinline {
%mask32 = trunc <4 x i64> %2 to <4 x i32>
%ptr = bitcast <4 x i32> * %0 to i8 *
%val = bitcast <4 x i32> %1 to <4 x float>
%mask = bitcast <4 x i32> %mask32 to <4 x float>
call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
ret void
}
define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
<4 x i64>) nounwind alwaysinline {
%ptr = bitcast <4 x i64> * %0 to i8 *
%val = bitcast <4 x i64> %1 to <4 x double>
%mask = bitcast <4 x i64> %2 to <4 x double>
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val)
ret void
}
masked_store_blend_8_16_by_4_mask64()
;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i64>) nounwind alwaysinline {
%mask = trunc <4 x i64> %2 to <4 x i32>
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
%oldValue = load <4 x i32>* %0, align 4
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
%newAsFloat = bitcast <4 x i32> %1 to <4 x float>
%blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
<4 x float> %newAsFloat,
<4 x float> %mask_as_float)
%blendAsInt = bitcast <4 x float> %blend to <4 x i32>
store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
ret void
}
;; avx intrinsic
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
<4 x double>) nounwind readnone
define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>,
<4 x i64>) nounwind alwaysinline {
%mask_as_double = bitcast <4 x i64> %2 to <4 x double>
%oldValue = load <4 x i64>* %0, align 4
%oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double>
%newAsDouble = bitcast <4 x i64> %1 to <4 x double>
%blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble,
<4 x double> %newAsDouble,
<4 x double> %mask_as_double)
%blendAsInt = bitcast <4 x double> %blend to <4 x i64>
store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
%call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
ret <4 x double> %call
}
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
%call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
ret <4 x double> %call
}

View File

@@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone
;declare float @llvm.sqrt.f32(float %Val)
declare double @llvm.sqrt.f64(double %Val)
declare float @llvm.sin.f32(float %Val)
declare float @llvm.asin.f32(float %Val)
declare float @llvm.cos.f32(float %Val)
declare float @llvm.sqrt.f32(float %Val)
declare float @llvm.exp.f32(float %Val)
@@ -651,7 +652,18 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline
declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline
declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_expd(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_logd(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_powd(<1 x float>, <1 x float>) nounwind readnone alwaysinline
define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
@@ -662,7 +674,18 @@ define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
}
define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.asin.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float,@llvm.asin.f32)
}
define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
@@ -673,18 +696,18 @@ define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
}
define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
; store <1 x float> %s, <1 x float> * %1
; ret void
%sin = call <1 x float> @__svml_sin (<1 x float> %0)
%cos = call <1 x float> @__svml_cos (<1 x float> %0)
%sin = call <1 x float> @__svml_sinf(<1 x float> %0)
%cos = call <1 x float> @__svml_cosf(<1 x float> %0)
store <1 x float> %sin, <1 x float> * %1
store <1 x float> %cos, <1 x float> * %2
ret void
}
define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
@@ -696,7 +719,7 @@ define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
ret <1 x float > %0
}
define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline {
; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
; ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
@@ -709,7 +732,7 @@ define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
}
define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
;%y = extractelement <1 x float> %0, i32 0
@@ -722,19 +745,19 @@ define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al
ret <1 x float > %0
}
define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.exp.f32)
}
define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.log.f32)
}
define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
%r = extractelement <1 x float> %0, i32 0

View File

@@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
;; svml
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
;; svml
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions

View File

@@ -316,15 +316,10 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
;; yuck. We need declarations of these, even though we shouldnt ever
;; actually generate calls to them for the NEON target...
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

View File

@@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define_x(float,f4,4,f,8)
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_sinf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_cosf4, %0)
ret <8 x float> %ret
}
define void @__svml_sincos(<8 x float>, <8 x float> *,
<8 x float> *) nounwind readnone alwaysinline {
; call svml_sincosf4 two times with the two 4-wide sub-vectors
%a = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%b = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%cospa = alloca <4 x float>
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
%cospb = alloca <4 x float>
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %sin, <8 x float> * %1
%cosa = load <4 x float> * %cospa
%cosb = load <4 x float> * %cospb
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %cos, <8 x float> * %2
ret void
}
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_tanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_atanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan2(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
ret <8 x float> %ret
}
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_expf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_logf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_pow(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_powf4, %0, %1)
ret <8 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -496,62 +496,15 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,4)
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

View File

@@ -209,16 +209,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
;; svml
; FIXME
declare <8 x float> @__svml_sin(<8 x float>)
declare <8 x float> @__svml_cos(<8 x float>)
declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
declare <8 x float> @__svml_tan(<8 x float>)
declare <8 x float> @__svml_atan(<8 x float>)
declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
declare <8 x float> @__svml_exp(<8 x float>)
declare <8 x float> @__svml_log(<8 x float>)
declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

View File

@@ -222,15 +222,9 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin
; FIXME
declare <16 x float> @__svml_sin(<16 x float>)
declare <16 x float> @__svml_cos(<16 x float>)
declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
declare <16 x float> @__svml_tan(<16 x float>)
declare <16 x float> @__svml_atan(<16 x float>)
declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
declare <16 x float> @__svml_exp(<16 x float>)
declare <16 x float> @__svml_log(<16 x float>)
declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

View File

@@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define_x(float,f4,4,f,8)
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_sinf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_cosf4, %0)
ret <8 x float> %ret
}
define void @__svml_sincos(<8 x float>, <8 x float> *,
<8 x float> *) nounwind readnone alwaysinline {
; call svml_sincosf4 two times with the two 4-wide sub-vectors
%a = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%b = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%cospa = alloca <4 x float>
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
%cospb = alloca <4 x float>
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %sin, <8 x float> * %1
%cosa = load <4 x float> * %cospa
%cosb = load <4 x float> * %cospb
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %cos, <8 x float> * %2
ret void
}
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_tanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_atanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan2(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
ret <8 x float> %ret
}
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_expf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_logf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_pow(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_powf4, %0, %1)
ret <8 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -209,62 +209,14 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

View File

@@ -947,6 +947,22 @@ define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
%r = sext <$1 x i32> %0 to <$1 x i64>
ret <$1 x i64> %r
}
define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) {
%r = trunc <$1 x i64> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) {
%r = trunc <$1 x i64> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) {
%r = trunc <$1 x i64> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
ret <$1 x i64> %0
}
')
mask_converts(WIDTH)
@@ -2689,9 +2705,13 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
}
define <WIDTH x i32> @__sext_varying_bool(<WIDTH x MASK>) nounwind readnone alwaysinline {
ifelse(MASK,i32, `ret <WIDTH x i32> %0',
`%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
ret <WIDTH x i32> %se')
;; ifelse(MASK,i32, `ret <WIDTH x i32> %0',
;; `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
;; ret <WIDTH x i32> %se')
ifelse(MASK,i32, `%se = bitcast <WIDTH x i32> %0 to <WIDTH x i32>',
MASK,i64, `%se = trunc <WIDTH x MASK> %0 to <WIDTH x i32>',
`%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>')
ret <WIDTH x i32> %se
}
@@ -3160,6 +3180,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
}
declare double @sin(double) nounwind readnone
declare double @asin(double) nounwind readnone
declare double @cos(double) nounwind readnone
declare void @sincos(double, double *, double *) nounwind readnone
declare double @tan(double) nounwind readnone
@@ -3174,6 +3195,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline {
ret double %r
}
define double @__stdlib_asin(double) nounwind readnone alwaysinline {
%r = call double @asin(double %0)
ret double %r
}
define double @__stdlib_cos(double) nounwind readnone alwaysinline {
%r = call double @cos(double %0)
ret double %r
@@ -3502,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
}
')
define(`masked_store_blend_8_16_by_4_mask64', `
define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
<4 x i64>) nounwind alwaysinline {
%old = load <4 x i8> * %0, align 1
ifelse(LLVM_VERSION,LLVM_3_0,`
%old32 = bitcast <4 x i8> %old to i32
%new32 = bitcast <4 x i8> %1 to i32
%mask8 = trunc <4 x i64> %2 to <4 x i8>
%mask32 = bitcast <4 x i8> %mask8 to i32
%notmask32 = xor i32 %mask32, -1
%newmasked = and i32 %new32, %mask32
%oldmasked = and i32 %old32, %notmask32
%result = or i32 %newmasked, %oldmasked
%resultvec = bitcast i32 %result to <4 x i8>
',`
%m = trunc <4 x i64> %2 to <4 x i1>
%resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
')
store <4 x i8> %resultvec, <4 x i8> * %0, align 1
ret void
}
define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
<4 x i64>) nounwind alwaysinline {
%old = load <4 x i16> * %0, align 2
ifelse(LLVM_VERSION,LLVM_3_0,`
%old64 = bitcast <4 x i16> %old to i64
%new64 = bitcast <4 x i16> %1 to i64
%mask16 = trunc <4 x i64> %2 to <4 x i16>
%mask64 = bitcast <4 x i16> %mask16 to i64
%notmask64 = xor i64 %mask64, -1
%newmasked = and i64 %new64, %mask64
%oldmasked = and i64 %old64, %notmask64
%result = or i64 %newmasked, %oldmasked
%resultvec = bitcast i64 %result to <4 x i16>
',`
%m = trunc <4 x i64> %2 to <4 x i1>
%resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
')
store <4 x i16> %resultvec, <4 x i16> * %0, align 2
ret void
}
')
define(`masked_store_blend_8_16_by_8', `
define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
<8 x i32>) nounwind alwaysinline {

View File

@@ -446,6 +446,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_maskingIsFree = false;
this->m_maskBitCount = 32;
}
else if (!strcasecmp(isa, "avx1-i64x4") ) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false;
this->m_maskBitCount = 64;
}
else if (!strcasecmp(isa, "avx-x2") ||
!strcasecmp(isa, "avx1-x2") ||
!strcasecmp(isa, "avx1-i32x16")) {

View File

@@ -132,6 +132,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth());
break;
case 64:
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt64Ty(*ctx), target.getVectorWidth());
break;
default:
FATAL("Unhandled mask width for initializing MaskType");
}
@@ -183,6 +187,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1,
true /*signed*/); // 0xffffffff
break;
case 64:
onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1,
true /*signed*/); // 0xffffffffffffffffull
break;
default:
FATAL("Unhandled mask width for onMask");
}
@@ -210,6 +218,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0,
true /*signed*/);
break;
case 64:
offMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), 0,
true /*signed*/);
break;
default:
FATAL("Unhandled mask width for offMask");
}
@@ -480,7 +492,10 @@ LLVMUInt64Vector(const uint64_t *ivec) {
llvm::Constant *
LLVMBoolVector(bool b) {
llvm::Constant *v;
if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int64Type, b ? 0xffffffffffffffffull : 0,
false /*unsigned*/);
else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0,
false /*unsigned*/);
else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
@@ -506,7 +521,10 @@ LLVMBoolVector(const bool *bvec) {
std::vector<llvm::Constant *> vals;
for (int i = 0; i < g->target->getVectorWidth(); ++i) {
llvm::Constant *v;
if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int64Type, bvec[i] ? 0xffffffffffffffffull : 0,
false /*unsigned*/);
else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0,
false /*unsigned*/);
else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)

View File

@@ -2183,6 +2183,9 @@ static void lAddMaskToSymbolTable(SourcePos pos) {
case 32:
t = AtomicType::VaryingUInt32;
break;
case 64:
t = AtomicType::VaryingUInt64;
break;
default:
FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable");
}

View File

@@ -50,6 +50,9 @@
#elif (ISPC_MASK_BITS == 32)
#define IntMaskType int32
#define UIntMaskType unsigned int32
#elif (ISPC_MASK_BITS == 64)
#define IntMaskType int64
#define UIntMaskType unsigned int64
#else
#error Unknown value of ISPC_MASK_BITS
#endif
@@ -2180,7 +2183,7 @@ static inline uniform float frexp(uniform float x, uniform int * uniform pw2) {
__declspec(safe)
static inline float sin(float x_full) {
if (__math_lib == __math_lib_svml) {
return __svml_sin(x_full);
return __svml_sinf(x_full);
}
else if (__math_lib == __math_lib_system) {
float ret;
@@ -2313,8 +2316,10 @@ static inline float asin(float x) {
bool isnan = (x > 1);
float v;
if (__math_lib == __math_lib_svml ||
__math_lib == __math_lib_system) {
if (__math_lib == __math_lib_svml) {
return __svml_asinf(x);
}
else if (__math_lib == __math_lib_system) {
float ret;
foreach_active (i) {
uniform float r = __stdlib_asinf(extract(x, i));
@@ -2417,7 +2422,7 @@ static inline uniform float asin(uniform float x) {
__declspec(safe)
static inline float cos(float x_full) {
if (__math_lib == __math_lib_svml) {
return __svml_cos(x_full);
return __svml_cosf(x_full);
}
else if (__math_lib == __math_lib_system) {
float ret;
@@ -2545,18 +2550,28 @@ static inline float acos(float v) {
return 1.57079637050628662109375 - asin(v);
}
__declspec(safe)
static inline double acos(const double v) {
return 1.57079637050628662109375 - asin(v);
}
__declspec(safe)
static inline uniform float acos(uniform float v) {
return 1.57079637050628662109375 - asin(v);
}
__declspec(safe)
static inline uniform double acos(const uniform double v) {
return 1.57079637050628662109375 - asin(v);
}
__declspec(safe)
static inline void sincos(float x_full, varying float * uniform sin_result,
varying float * uniform cos_result) {
if (__math_lib == __math_lib_svml) {
__svml_sincos(x_full, sin_result, cos_result);
__svml_sincosf(x_full, sin_result, cos_result);
}
else if (__math_lib == __math_lib_system) {
foreach_active (i) {
@@ -2688,7 +2703,7 @@ static inline void sincos(uniform float x_full, uniform float * uniform sin_resu
__declspec(safe)
static inline float tan(float x_full) {
if (__math_lib == __math_lib_svml) {
return __svml_tan(x_full);
return __svml_tanf(x_full);
}
else if (__math_lib == __math_lib_system) {
float ret;
@@ -2839,7 +2854,7 @@ static inline uniform float tan(uniform float x_full) {
__declspec(safe)
static inline float atan(float x_full) {
if (__math_lib == __math_lib_svml) {
return __svml_atan(x_full);
return __svml_atanf(x_full);
}
else if (__math_lib == __math_lib_system) {
float ret;
@@ -2934,7 +2949,7 @@ static inline uniform float atan(uniform float x_full) {
__declspec(safe)
static inline float atan2(float y, float x) {
if (__math_lib == __math_lib_svml) {
return __svml_atan2(y, x);
return __svml_atan2f(y, x);
}
else if (__math_lib == __math_lib_system) {
float ret;
@@ -2997,7 +3012,7 @@ static inline float exp(float x_full) {
return __exp_varying_float(x_full);
}
else if (__math_lib == __math_lib_svml) {
return __svml_exp(x_full);
return __svml_expf(x_full);
}
else if (__math_lib == __math_lib_system) {
float ret;
@@ -3204,7 +3219,7 @@ static inline float log(float x_full) {
return __log_varying_float(x_full);
}
else if (__math_lib == __math_lib_svml) {
return __svml_log(x_full);
return __svml_logf(x_full);
}
else if (__math_lib == __math_lib_system) {
float ret;
@@ -3379,7 +3394,7 @@ static inline float pow(float a, float b) {
return __pow_varying_float(a, b);
}
else if (__math_lib == __math_lib_svml) {
return __svml_pow(a, b);
return __svml_powf(a, b);
}
else if (__math_lib == __math_lib_system) {
float ret;
@@ -3469,7 +3484,11 @@ static inline uniform double frexp(uniform double x, uniform int * uniform pw2)
__declspec(safe)
static inline double sin(double x) {
if (__math_lib == __math_lib_ispc_fast)
if (__math_lib == __math_lib_svml)
{
return __svml_sind(x);
}
else if (__math_lib == __math_lib_ispc_fast)
return sin((float)x);
else {
double ret;
@@ -3490,8 +3509,30 @@ static inline uniform double sin(uniform double x) {
}
__declspec(safe)
static inline double cos(double x) {
if (__math_lib == __math_lib_ispc_fast)
static inline double asin(const double x) {
if (__math_lib == __math_lib_svml)
{
return __svml_asind(x);
}
else if (__math_lib == __math_lib_ispc_fast)
return asin((float)x);
else {
double ret;
foreach_active (i) {
uniform double r = __stdlib_asin(extract(x, i));
ret = insert(ret, i, r);
}
return ret;
}
}
__declspec(safe)
static inline double cos(const double x) {
if (__math_lib == __math_lib_svml)
{
return __svml_cosd(x);
}
else if (__math_lib == __math_lib_ispc_fast)
return cos((float)x);
else {
double ret;
@@ -3514,7 +3555,11 @@ static inline uniform double cos(uniform double x) {
__declspec(safe)
static inline void sincos(double x, varying double * uniform sin_result,
varying double * uniform cos_result) {
if (__math_lib == __math_lib_ispc_fast) {
if (__math_lib == __math_lib_svml)
{
__svml_sincosd(x, sin_result, cos_result);
}
else if (__math_lib == __math_lib_ispc_fast) {
float sr, cr;
sincos((float)x, &sr, &cr);
*sin_result = sr;
@@ -3545,7 +3590,11 @@ static inline void sincos(uniform double x, uniform double * uniform sin_result,
__declspec(safe)
static inline double tan(double x) {
if (__math_lib == __math_lib_ispc_fast)
if (__math_lib == __math_lib_svml)
{
return __svml_tand(x);
}
else if (__math_lib == __math_lib_ispc_fast)
return tan((float)x);
else {
double ret;
@@ -3589,7 +3638,11 @@ static inline uniform double atan(uniform double x) {
__declspec(safe)
static inline double atan2(double y, double x) {
if (__math_lib == __math_lib_ispc_fast)
if (__math_lib == __math_lib_svml)
{
return __svml_atan2d(y,x);
}
else if (__math_lib == __math_lib_ispc_fast)
return atan2((float)y, (float)x);
else {
double ret;
@@ -3611,7 +3664,11 @@ static inline uniform double atan2(uniform double y, uniform double x) {
__declspec(safe)
static inline double exp(double x) {
if (__math_lib == __math_lib_ispc_fast)
if (__math_lib == __math_lib_svml)
{
return __svml_expd(x);
}
else if (__math_lib == __math_lib_ispc_fast)
return exp((float)x);
else {
double ret;
@@ -3633,7 +3690,11 @@ static inline uniform double exp(uniform double x) {
__declspec(safe)
static inline double log(double x) {
if (__math_lib == __math_lib_ispc_fast)
if (__math_lib == __math_lib_svml)
{
return __svml_logd(x);
}
else if (__math_lib == __math_lib_ispc_fast)
return log((float)x);
else {
double ret;
@@ -3655,7 +3716,11 @@ static inline uniform double log(uniform double x) {
__declspec(safe)
static inline double pow(double a, double b) {
if (__math_lib == __math_lib_ispc_fast)
if (__math_lib == __math_lib_svml)
{
return __svml_powd(a,b);
}
else if (__math_lib == __math_lib_ispc_fast)
return pow((float)a, (float)b);
else {
double ret;