Merge pull request #657 from dbabokin/avx-i32x4

avx1-i32x4 target
jbrodman committed 2013-11-15 16:00:57 -08:00
23 changed files with 479 additions and 310 deletions

View File

@@ -214,7 +214,7 @@ def check_targets():
     try_do_LLVM("build check_ISA", "cl check_isa.cpp", True)
     SSE2 = ["sse2-i32x4", "sse2-i32x8"]
     SSE4 = ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"]
-    AVX = ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"]
+    AVX = ["avx1-i32x4", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4"]
     AVX11 = ["avx1.1-i32x8","avx1.1-i32x16","avx1.1-i64x4"]
     AVX2 = ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"]
     targets = [["AVX2", AVX2, False], ["AVX1.1", AVX11, False], ["AVX", AVX, False], ["SSE4", SSE4, False], ["SSE2", SSE2, False]]

@@ -251,7 +251,7 @@ def check_targets():
         if targets[3][2] == False and "wsm" in f_lines[i]:
             answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]]
         if targets[2][2] == False and "snb" in f_lines[i]:
-            answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]]
+            answer_sde = answer_sde + [["-snb", "avx1-i32x4"], ["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]]
         if targets[1][2] == False and "ivb" in f_lines[i]:
             answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"], ["-ivb", "avx1.1-i64x4"]]
         if targets[0][2] == False and "hsw" in f_lines[i]:

@@ -495,6 +495,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
     performance.ref = "ispc_ref"
     if current_OS == "Windows":
         performance.ref = "ispc_ref.exe"
+    performance.perf_target = ""
     performance.in_file = "." + os.sep + f_date + os.sep + "performance.log"
     # prepare LLVM 3.3 as newest LLVM
     need_LLVM = check_LLVM(["3.3"])

View File

@@ -942,12 +942,32 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module
     case Target::AVX: {
         switch (g->target->getVectorWidth()) {
         case 4:
+            if (g->target->getDataTypeWidth() == 32) {
+                // Note here that for avx1-i32x4 we are using the bitcode file for
+                // sse4-i32x4. This is intentional and good enough.
+                // The AVX target implies the appropriate target-feature attribute,
+                // which forces LLVM to generate AVX code, even for SSE4
+                // intrinsics. The only "missing" feature in the sse4
+                // target is the implementation of __masked_[store|load]_[i32|i64]
+                // using the maskmov instruction. But those are not very popular
+                // intrinsics, so we assume the implementation to be good
+                // enough at the moment.
+                if (runtime32) {
+                    EXPORT_MODULE(builtins_bitcode_sse4_32bit);
+                }
+                else {
+                    EXPORT_MODULE(builtins_bitcode_sse4_64bit);
+                }
+            } else if (g->target->getDataTypeWidth() == 64) {
                 if (runtime32) {
                     EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit);
                 }
                 else {
                     EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit);
                 }
+            } else {
+                FATAL("logic error in DefineStdlib");
+            }
             break;
         case 8:
             if (runtime32) {
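A note on the bitcode-reuse rationale in the comment above: once a function is compiled with the AVX feature enabled, the compiler is free to emit VEX-encoded AVX instructions even for code written against SSE intrinsics. A minimal C sketch of that effect (a hedged analogy using the GCC/Clang target attribute; not part of this patch):

    #include <immintrin.h>

    /* With the "avx" target attribute, the compiler may emit the VEX-encoded
     * vrcpps for this SSE intrinsic -- the same effect the AVX target-feature
     * attribute has on the reused sse4-i32x4 bitcode. */
    __attribute__((target("avx")))
    static __m128 rcp4_with_avx(__m128 v) {
        return _mm_rcp_ps(v);
    }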

View File

@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2010-2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without

@@ -31,30 +31,16 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; AVX target implementation.
+;;
+;; Please note that this file uses SSE intrinsics, but LLVM generates AVX
+;; instructions, so it doesn't make sense to change this implementation.

 ctlztz()
 define_prefetches()
 define_shuffles()
 aossoa()

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rcp
-
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
-
-define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
-    ; uniform float iv = extract(__rcp_u(v), 0);
-    ; return iv * (2. - v * iv);
-    %vecval = insertelement <4 x float> undef, float %0, i32 0
-    %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
-    %scall = extractelement <4 x float> %call, i32 0
-    ; do one N-R iteration
-    %v_iv = fmul float %0, %scall
-    %two_minus = fsub float 2., %v_iv
-    %iv_mul = fmul float %scall, %two_minus
-    ret float %iv_mul
-}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats

@@ -77,7 +63,8 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline {
     ; r3 = a3
     ;
     ; It doesn't matter what we pass as a, since we only need the r0 value
-    ; here. So we pass the same register for both.
+    ; here. So we pass the same register for both. Further, only the 0th
+    ; element of the b parameter matters
     %xi = insertelement <4 x float> undef, float %0, i32 0
     %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
     %rs = extractelement <4 x float> %xr, i32 0

@@ -117,7 +104,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline {
 define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
     ; see above for round_ss intrinsic discussion...
     %xi = insertelement <2 x double> undef, double %0, i32 0
-    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+    ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
     %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
     %rs = extractelement <2 x double> %xr, i32 0
     ret double %rs
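On the immediate operands these comments correct: bits 1:0 of the roundsd immediate select the rounding mode (0b01 = down, 0b10 = up) and bit 3 (0b1000) suppresses precision exceptions, giving 0b1001 = 9 for floor and 0b1010 = 10 for ceil. A small C sketch with the equivalent intrinsic flags (hypothetical helper, not part of the patch):

    #include <smmintrin.h>

    /* _MM_FROUND_TO_NEG_INF (0x1) | _MM_FROUND_NO_EXC (0x8) == 9, the same
     * immediate passed to llvm.x86.sse41.round.sd above. */
    static double floor_scalar(double x) {
        __m128d v = _mm_set_sd(x);
        v = _mm_round_sd(v, v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
        return _mm_cvtsd_f64(v);
    }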
@@ -126,12 +113,31 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
 define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
     ; see above for round_ss intrinsic discussion...
     %xi = insertelement <2 x double> undef, double %0, i32 0
-    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+    ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
     %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
     %rs = extractelement <2 x double> %xr, i32 0
     ret double %rs
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+    ; do the rcpss call
+    ; uniform float iv = extract(__rcp_u(v), 0);
+    ; return iv * (2. - v * iv);
+    %vecval = insertelement <4 x float> undef, float %0, i32 0
+    %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
+    %scall = extractelement <4 x float> %call, i32 0
+    ; do one N-R iteration to improve precision, as above
+    %v_iv = fmul float %0, %scall
+    %two_minus = fsub float 2., %v_iv
+    %iv_mul = fmul float %scall, %two_minus
+    ret float %iv_mul
+}
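The relocated block above is the standard one-step Newton-Raphson refinement: given the roughly 12-bit estimate iv from rcpss, iv * (2 - v*iv) approximately doubles the number of correct bits. A C equivalent (assumed standalone helper, not part of the patch):

    #include <xmmintrin.h>

    /* rcpss estimate followed by one Newton-Raphson iteration:
     * iv' = iv * (2 - v*iv). */
    static float rcp_scalar(float v) {
        float iv = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(v)));
        return iv * (2.0f - v * iv);
    }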
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rsqrt

@@ -144,6 +150,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
     %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
     %is = extractelement <4 x float> %vis, i32 0
+    ; Newton-Raphson iteration to improve precision
     ; return 0.5 * is * (3. - (v * is) * is);
     %v_is = fmul float %0, %is
     %v_is_is = fmul float %v_is, %is

@@ -164,9 +171,18 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
     ret float %ret
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define double @__sqrt_uniform_double(double) nounwind alwaysinline {
+    sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+    ret double %ret
+}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; fastmath
+;; fast math mode

 declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
 declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

@@ -200,6 +216,22 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
     ret float %ret
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+    ret double %ret
+}
+
+define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+    ret double %ret
+}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max

@@ -235,7 +267,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; horizontal ops
+;; horizontal ops / reductions

 declare i32 @llvm.ctpop.i32(i32) nounwind readnone

@@ -251,32 +283,6 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
     ret i64 %call
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-define double @__sqrt_uniform_double(double) nounwind alwaysinline {
-    sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
-    ret double %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
-    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
-    ret double %ret
-}
-
-define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
-    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
-    ret double %ret
-}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int8/int16 builtins

View File

@@ -49,11 +49,10 @@ include(`target-avx-common.ll')
 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

 define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+    ; do one N-R iteration to improve precision
     ; float iv = __rcp_v(v);
     ; return iv * (2. - v * iv);
     %call = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %0)
-    ; do one N-R iteration
     %v_iv = fmul <8 x float> %0, %call
     %two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
                                    float 2., float 2., float 2., float 2.>, %v_iv

@@ -61,6 +60,46 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
     ret <8 x float> %iv_mul
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+    ; float is = __rsqrt_v(v);
+    %is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
+    ; Newton-Raphson iteration to improve precision
+    ; return 0.5 * is * (3. - (v * is) * is);
+    %v_is = fmul <8 x float> %v, %is
+    %v_is_is = fmul <8 x float> %v_is, %is
+    %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
+                                   float 3., float 3., float 3., float 3.>, %v_is_is
+    %is_mul = fmul <8 x float> %is, %three_sub
+    %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                                    float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+    ret <8 x float> %half_scale
+}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+    %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
+    ret <8 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+    unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
+    ret <8 x double> %ret
+}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats

@@ -94,58 +133,15 @@ define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
 }

 define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
-    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
     round4to8double(%0, 9)
 }

 define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
-    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
     round4to8double(%0, 10)
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rsqrt
-
-declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
-
-define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
-    ; float is = __rsqrt_v(v);
-    %is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
-    ; return 0.5 * is * (3. - (v * is) * is);
-    %v_is = fmul <8 x float> %v, %is
-    %v_is_is = fmul <8 x float> %v_is, %is
-    %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
-                                   float 3., float 3., float 3., float 3.>, %v_is_is
-    %is_mul = fmul <8 x float> %is, %three_sub
-    %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
-                                    float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
-    ret <8 x float> %half_scale
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; sqrt
-
-declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-
-define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
-    %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
-    ret <8 x float> %call
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; svml
-
-include(`svml.m4')
-;; single precision
-svml_declare(float,f8,8)
-svml_define(float,f8,8,f)
-;; double precision
-svml_declare(double,4,4)
-svml_define_x(double,4,4,d,8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max

@@ -166,7 +162,37 @@ define <8 x float> @__min_varying_float(<8 x float>,
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; horizontal ops
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+    binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
+    ret <8 x double> %ret
+}
+
+define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+    binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
+    ret <8 x double> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+include(`svml.m4')
+;; single precision
+svml_declare(float,f8,8)
+svml_define(float,f8,8,f)
+;; double precision
+svml_declare(double,4,4)
+svml_define_x(double,4,4,d,8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; mask handling

 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

@@ -198,6 +224,9 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
     ret i1 %cmp
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal ops / reductions

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal float ops

@@ -216,12 +245,36 @@ define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
     reduce8(float, @__min_varying_float, @__min_uniform_float)
 }

 define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
     reduce8(float, @__max_varying_float, @__max_uniform_float)
 }

-reduce_equal(8)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
+    %v0 = shufflevector <8 x double> %0, <8 x double> undef,
+                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %v1 = shufflevector <8 x double> %0, <8 x double> undef,
+                        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+    %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
+    %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
+    %final0 = extractelement <4 x double> %sum1, i32 0
+    %final1 = extractelement <4 x double> %sum1, i32 2
+    %sum = fadd double %final0, %final1
+    ret double %sum
+}
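Why two vhaddpd calls plus elements 0 and 2 give the full sum: hadd adds pairs within each 128-bit lane, so after the second call lane 0 holds the sum of one four-element half and lane 2 the other. A C sketch, assuming the 8-wide input is already split into two __m256d halves (hypothetical helper, not part of the patch):

    #include <immintrin.h>

    static double reduce_add_double8(__m256d v0, __m256d v1) {
        /* sum0 = [v0_0+v0_1, v1_0+v1_1, v0_2+v0_3, v1_2+v1_3] */
        __m256d sum0 = _mm256_hadd_pd(v0, v1);
        /* lanes 0 and 2 of sum1 each hold a partial sum of four elements */
        __m256d sum1 = _mm256_hadd_pd(sum0, sum0);
        double final0 = _mm_cvtsd_f64(_mm256_castpd256_pd128(sum1));
        double final1 = _mm_cvtsd_f64(_mm256_extractf128_pd(sum1, 1));
        return final0 + final1;
    }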
+define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
+    reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
+    reduce8(double, @__max_varying_double, @__max_uniform_double)
+}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int8 ops

@@ -262,6 +315,7 @@ define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops

+;; helper functions
 define <8 x i32> @__add_varying_int32(<8 x i32>,
                                       <8 x i32>) nounwind readnone alwaysinline {
     %s = add <8 x i32> %0, %1

@@ -273,16 +327,15 @@ define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
     ret i32 %s
 }

+;; reduction functions
 define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
     reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
 }

 define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
     reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
 }

 define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
     reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
 }

@@ -295,38 +348,11 @@ define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
     reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; horizontal double ops
-
-declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
-
-define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
-    %v0 = shufflevector <8 x double> %0, <8 x double> undef,
-                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-    %v1 = shufflevector <8 x double> %0, <8 x double> undef,
-                        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-    %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
-    %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
-    %final0 = extractelement <4 x double> %sum1, i32 0
-    %final1 = extractelement <4 x double> %sum1, i32 2
-    %sum = fadd double %final0, %final1
-    ret double %sum
-}
-
-define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
-    reduce8(double, @__min_varying_double, @__min_uniform_double)
-}
-
-define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
-    reduce8(double, @__max_varying_double, @__max_uniform_double)
-}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int64 ops

+;; helper functions
 define <8 x i64> @__add_varying_int64(<8 x i64>,
                                       <8 x i64>) nounwind readnone alwaysinline {
     %s = add <8 x i64> %0, %1

@@ -338,6 +364,7 @@ define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
     ret i64 %s
 }

+;; reduction functions
 define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
     reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
 }

@@ -362,6 +389,7 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
     reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

+reduce_equal(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

@@ -446,6 +474,10 @@ define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
     ret void
 }

+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store blend

 masked_store_blend_8_16_by_8()
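These __masked_store_* entry points are exactly where a native avx1-i32x4 implementation would differ from the reused sse4 bitcode: AVX provides vmaskmovps/vmaskmovpd, which store only the lanes whose mask sign bit is set. A minimal C sketch of that instruction (assumed helper, not how this patch implements it):

    #include <immintrin.h>

    /* Each float lane is written only where the corresponding mask lane's
     * sign bit is set; other memory is left untouched. */
    static void masked_store_f32x8(float *p, __m256i mask, __m256 v) {
        _mm256_maskstore_ps(p, mask, v);
    }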
@@ -517,8 +549,6 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
     ret void
 }

-masked_store_float_double()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; scatter

@@ -529,30 +559,3 @@ gen_scatter(float)
 gen_scatter(i64)
 gen_scatter(double)

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
-
-define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
-    unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
-    ret <8 x double> %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
-declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
-
-define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
-    binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
-    ret <8 x double> %ret
-}
-
-define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
-    binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
-    ret <8 x double> %ret
-}

View File

@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2010-2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without

@@ -29,6 +29,9 @@
 ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; SSE4 target implementation.

 ctlztz()
 define_prefetches()
 define_shuffles()

@@ -67,7 +70,7 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline {
 define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
     ; see above for round_ss intrinsic discussion...
     %xi = insertelement <4 x float> undef, float %0, i32 0
-    ; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9
+    ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
     %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
     %rs = extractelement <4 x float> %xr, i32 0
     ret float %rs

@@ -97,7 +100,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline {
 define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
     ; see above for round_ss intrinsic discussion...
     %xi = insertelement <2 x double> undef, double %0, i32 0
-    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+    ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
     %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
     %rs = extractelement <2 x double> %xr, i32 0
     ret double %rs

@@ -106,7 +109,7 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
 define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
     ; see above for round_ss intrinsic discussion...
     %xi = insertelement <2 x double> undef, double %0, i32 0
-    ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+    ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
     %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
     %rs = extractelement <2 x double> %xr, i32 0
     ret double %rs

@@ -119,6 +122,8 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
 define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
     ; do the rcpss call
+    ; uniform float iv = extract(__rcp_u(v), 0);
+    ; return iv * (2. - v * iv);
     %vecval = insertelement <4 x float> undef, float %0, i32 0
     %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
     %scall = extractelement <4 x float> %call, i32 0

@@ -130,9 +135,8 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
     ret float %iv_mul
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; rsqrt
+;; rsqrt

 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

@@ -154,7 +158,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; sqrt
+;; sqrt

 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

@@ -163,6 +167,16 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
     ret float %ret
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define double @__sqrt_uniform_double(double) nounwind alwaysinline {
+    sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+    ret double %ret
+}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; fast math mode

@@ -198,36 +212,25 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
     ret float %ret
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-define double @__sqrt_uniform_double(double) nounwind alwaysinline {
-    sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
-    ret double %ret
-}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision min/max

 declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

-define double @__min_uniform_double(double, double) nounwind readnone {
+define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
     sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
     ret double %ret
 }

-define double @__max_uniform_double(double, double) nounwind readnone {
+define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
     sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
     ret double %ret
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int32 min/max
+;; int min/max

 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone

@@ -242,8 +245,9 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
     ret i32 %ret
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; unsigned int min/max
+;; unsigned int min/max

 declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone

@@ -258,9 +262,8 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
     ret i32 %ret
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; horizontal ops / reductions
+;; horizontal ops / reductions

 declare i32 @llvm.ctpop.i32(i32) nounwind readnone

View File

@@ -58,10 +58,10 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

 define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
-    %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
     ; do one N-R iteration to improve precision
     ; float iv = __rcp_v(v);
     ; return iv * (2. - v * iv);
+    %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
     %v_iv = fmul <4 x float> %0, %call
     %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
     %iv_mul = fmul <4 x float> %call, %two_minus

@@ -87,7 +87,7 @@ define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; sqrt
+;; sqrt

 declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

@@ -154,16 +154,34 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

-define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+define <4 x float> @__max_varying_float(<4 x float>,
+                                        <4 x float>) nounwind readonly alwaysinline {
     %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
     ret <4 x float> %call
 }

-define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+define <4 x float> @__min_varying_float(<4 x float>,
+                                        <4 x float>) nounwind readonly alwaysinline {
     %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
     ret <4 x float> %call
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+    binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+    ret <4 x double> %ret
+}
+
+define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+    binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+    ret <4 x double> %ret
+}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int32 min/max

@@ -191,23 +209,7 @@ define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-
-define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-    binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
-    ret <4 x double> %ret
-}
-
-define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-    binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
-    ret <4 x double> %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; svml stuff
+;; svml stuff

 include(`svml.m4')
 ;; single precision

@@ -219,7 +221,7 @@ svml_declare(double,2,2)
 svml_define_x(double,2,2,d,4)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; horizontal ops / reductions
+;; mask handling

 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

@@ -251,6 +253,55 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
     ret i1 %cmp
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal ops / reductions
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
+    %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
+    %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
+    %scalar = extractelement <4 x float> %v2, i32 0
+    ret float %scalar
+}
+define float @__reduce_min_float(<4 x float>) nounwind readnone alwaysinline {
+    reduce4(float, @__min_varying_float, @__min_uniform_float)
+}
+
+define float @__reduce_max_float(<4 x float>) nounwind readnone alwaysinline {
+    reduce4(float, @__max_varying_float, @__max_uniform_float)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+define double @__reduce_add_double(<4 x double>) nounwind readnone alwaysinline {
+    %v0 = shufflevector <4 x double> %0, <4 x double> undef,
+                        <2 x i32> <i32 0, i32 1>
+    %v1 = shufflevector <4 x double> %0, <4 x double> undef,
+                        <2 x i32> <i32 2, i32 3>
+    %sum = fadd <2 x double> %v0, %v1
+    %e0 = extractelement <2 x double> %sum, i32 0
+    %e1 = extractelement <2 x double> %sum, i32 1
+    %m = fadd double %e0, %e1
+    ret double %m
+}
+
+define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
+    reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
+    reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int8 ops

 declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

 define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {

@@ -266,6 +317,9 @@ define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
     ret i16 %r16
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int16 ops

 define internal <4 x i16> @__add_varying_i16(<4 x i16>,
                                              <4 x i16>) nounwind readnone alwaysinline {
     %r = add <4 x i16> %0, %1

@@ -281,24 +335,11 @@ define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
     reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
 }

-declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
-
-define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
-    %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
-    %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
-    %scalar = extractelement <4 x float> %v2, i32 0
-    ret float %scalar
-}
-
-define float @__reduce_min_float(<4 x float>) nounwind readnone {
-    reduce4(float, @__min_varying_float, @__min_uniform_float)
-}
-
-define float @__reduce_max_float(<4 x float>) nounwind readnone {
-    reduce4(float, @__max_varying_float, @__max_uniform_float)
-}
-
-define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+;; reduction functions
+define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone alwaysinline {
     %v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
     %m1 = add <4 x i32> %v1, %v

@@ -308,44 +349,27 @@ define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
     ret i32 %sum
 }

-define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
+define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
     reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
 }

-define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
+define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
     reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
 }

-define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
+define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
     reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
 }

-define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
+define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
     reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
 }

-define double @__reduce_add_double(<4 x double>) nounwind readnone {
-    %v0 = shufflevector <4 x double> %0, <4 x double> undef,
-                        <2 x i32> <i32 0, i32 1>
-    %v1 = shufflevector <4 x double> %0, <4 x double> undef,
-                        <2 x i32> <i32 2, i32 3>
-    %sum = fadd <2 x double> %v0, %v1
-    %e0 = extractelement <2 x double> %sum, i32 0
-    %e1 = extractelement <2 x double> %sum, i32 1
-    %m = fadd double %e0, %e1
-    ret double %m
-}
-
-define double @__reduce_min_double(<4 x double>) nounwind readnone {
-    reduce4(double, @__min_varying_double, @__min_uniform_double)
-}
-
-define double @__reduce_max_double(<4 x double>) nounwind readnone {
-    reduce4(double, @__max_varying_double, @__max_uniform_double)
-}
-
-define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+;; reduction functions
+define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
     %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
                         <2 x i32> <i32 0, i32 1>
     %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,

@@ -357,27 +381,50 @@ define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
     ret i64 %m
 }

-define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
     reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
 }

-define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
     reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
 }

-define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
     reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }

-define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
     reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

 reduce_equal(4)

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+masked_load(i8, 1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store blend
+
+masked_store_blend_8_16_by_4()

 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                              <4 x float>) nounwind readnone

@@ -444,29 +491,6 @@ define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
     ret void
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; masked store
-
-masked_store_blend_8_16_by_4()
-
-gen_masked_store(i8)
-gen_masked_store(i16)
-gen_masked_store(i32)
-gen_masked_store(i64)
-masked_store_float_double()
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unaligned loads/loads+broadcasts
-
-masked_load(i8, 1)
-masked_load(i16, 2)
-masked_load(i32, 4)
-masked_load(float, 4)
-masked_load(i64, 8)
-masked_load(double, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

View File

@@ -2,7 +2,7 @@
 EXAMPLE=ao
 CPP_SRC=ao.cpp ao_serial.cpp
 ISPC_SRC=ao.ispc
-ISPC_IA_TARGETS=sse2,sse4,avx
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8
 ISPC_ARM_TARGETS=neon

 include ../common.mk

View File

@@ -16,8 +16,26 @@ ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
 ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)

 ifeq ($(ARCH),x86)
-  ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
-    $(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
+  ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o)
+  COMMA=,
+  ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS)))
+    #$(info multi-target detected: $(ISPC_IA_TARGETS))
+    ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS)))
+      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o)
+    endif
+    ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS)))
+      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o)
+    endif
+    ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS)))
+      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o)
+    endif
+    ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS)))
+      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o)
+    endif
+    ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS)))
+      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o)
+    endif
+  endif
   ISPC_TARGETS=$(ISPC_IA_TARGETS)
 ARCH_BIT:=$(shell getconf LONG_BIT)
 ifeq ($(ARCH_BIT),32)

@@ -66,9 +84,9 @@ objs/%.o: %.c dirs $(ISPC_HEADER)
 objs/%.o: ../%.cpp dirs
         $(CXX) $< $(CXXFLAGS) -c -o $@

-objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h
+objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc dirs
         $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h

 objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)

View File

@@ -2,7 +2,7 @@
 EXAMPLE=deferred_shading
 CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
 ISPC_SRC=kernels.ispc
-ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
 ISPC_ARM_TARGETS=neon
 ISPC_FLAGS=--opt=fast-math

View File

@@ -3,7 +3,7 @@ EXAMPLE=gmres
 CPP_SRC=algorithm.cpp main.cpp matrix.cpp
 CC_SRC=mmio.c
 ISPC_SRC=matrix.ispc
-ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
 ISPC_ARM_TARGETS=neon

 include ../common.mk

View File

@@ -2,7 +2,7 @@
 EXAMPLE=mandelbrot
 CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
 ISPC_SRC=mandelbrot.ispc
-ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
 ISPC_ARM_TARGETS=neon

 include ../common.mk

View File

@@ -2,7 +2,7 @@
 EXAMPLE=mandelbrot_tasks
 CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
 ISPC_SRC=mandelbrot_tasks.ispc
-ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
 ISPC_ARM_TARGETS=neon

 include ../common.mk

View File

@@ -2,7 +2,7 @@
 EXAMPLE=noise
 CPP_SRC=noise.cpp noise_serial.cpp
 ISPC_SRC=noise.ispc
-ISPC_IA_TARGETS=sse2,sse4,avx-x2
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x16,avx2-i32x16
 ISPC_ARM_TARGETS=neon

 include ../common.mk

View File

@@ -2,7 +2,7 @@
 EXAMPLE=options
 CPP_SRC=options.cpp options_serial.cpp
 ISPC_SRC=options.ispc
-ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
 ISPC_ARM_TARGETS=neon

 include ../common.mk

View File

@@ -2,7 +2,7 @@
 EXAMPLE=perbench
 CPP_SRC=perfbench.cpp perfbench_serial.cpp
 ISPC_SRC=perfbench.ispc
-ISPC_IA_TARGETS=sse2,sse4,avx
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8
 ISPC_ARM_TARGETS=neon

 include ../common.mk

View File

@@ -2,7 +2,7 @@
 EXAMPLE=rt
 CPP_SRC=rt.cpp rt_serial.cpp
 ISPC_SRC=rt.ispc
-ISPC_IA_TARGETS=sse2,sse4-x2,avx
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8
 ISPC_ARM_TARGETS=neon

 include ../common.mk

View File

@@ -2,7 +2,7 @@
 EXAMPLE=sort
 CPP_SRC=sort.cpp sort_serial.cpp
 ISPC_SRC=sort.ispc
-ISPC_IA_TARGETS=sse2,sse4-x2,avx
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8
 ISPC_ARM_TARGETS=neon

 #ISPC_FLAGS=-DDEBUG


@@ -2,7 +2,7 @@
EXAMPLE=stencil
CPP_SRC=stencil.cpp stencil_serial.cpp
ISPC_SRC=stencil.ispc
-ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon
include ../common.mk


@@ -2,7 +2,7 @@
EXAMPLE=volume
CPP_SRC=volume.cpp volume_serial.cpp
ISPC_SRC=volume.ispc
-ISPC_IA_TARGETS=sse2,sse4-x2,avx
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8
ISPC_ARM_TARGETS=neon
include ../common.mk


@@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
m_tf_attributes(NULL),
#endif
m_nativeVectorWidth(-1),
+m_dataTypeWidth(-1),
m_vectorWidth(-1),
m_generatePIC(pic),
m_maskingIsFree(false),
@@ -308,6 +309,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse2-i32x4")) { !strcasecmp(isa, "sse2-i32x4")) {
this->m_isa = Target::SSE2; this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -323,6 +325,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse2-i32x8")) { !strcasecmp(isa, "sse2-i32x8")) {
this->m_isa = Target::SSE2; this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -338,6 +341,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse4-i32x4")) { !strcasecmp(isa, "sse4-i32x4")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
// TODO: why not sse42 and popcnt? // TODO: why not sse42 and popcnt?
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -355,6 +359,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse4-i32x8")) { !strcasecmp(isa, "sse4-i32x8")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -369,6 +374,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "sse4-i8x16")) { else if (!strcasecmp(isa, "sse4-i8x16")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 16; this->m_nativeVectorWidth = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -383,6 +389,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "sse4-i16x8")) { else if (!strcasecmp(isa, "sse4-i16x8")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -457,11 +464,21 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_maskingIsFree = false;
this->m_maskBitCount = 32;
}
+else if (!strcasecmp(isa, "avx1-i32x4")) {
+this->m_isa = Target::AVX;
+this->m_nativeVectorWidth = 8;
+this->m_dataTypeWidth = 32;
+this->m_vectorWidth = 4;
+this->m_attributes = "+avx,+popcnt,+cmov";
+this->m_maskingIsFree = false;
+this->m_maskBitCount = 32;
+}
else if (!strcasecmp(isa, "avx") ||
!strcasecmp(isa, "avx1") ||
!strcasecmp(isa, "avx1-i32x8")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8;
+this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false;
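The new block above gives the AVX1 family its first quarter-width variant: the hardware vector stays 8 floats wide, but the gang shrinks to 4 programs. A minimal transcription of the (nativeVectorWidth, dataTypeWidth, vectorWidth) triples the constructor sets for the AVX1 targets here and in the hunks just below (Python, illustrative only):

    # (nativeVectorWidth, dataTypeWidth, vectorWidth) per AVX1 target.
    avx1_targets = {
        "avx1-i32x4":  (8, 32, 4),    # new: quarter-width gang
        "avx1-i32x8":  (8, 32, 8),    # the default AVX1 target
        "avx1-i32x16": (8, 32, 16),   # double-pumped
        "avx1-i64x4":  (8, 64, 4),    # 64-bit element type
    }
    for name, (native, dtw, vw) in avx1_targets.items():
        assert vw * dtw in (128, 256, 512)   # total gang bits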
@@ -471,6 +488,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i64x4")) { !strcasecmp(isa, "avx1-i64x4")) {
this->m_isa = Target::AVX; this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov"; this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
@@ -481,6 +499,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i32x16")) { !strcasecmp(isa, "avx1-i32x16")) {
this->m_isa = Target::AVX; this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov"; this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
@@ -490,6 +509,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1.1-i32x8")) { !strcasecmp(isa, "avx1.1-i32x8")) {
this->m_isa = Target::AVX11; this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c" this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -510,6 +530,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1.1-i32x16")) { !strcasecmp(isa, "avx1.1-i32x16")) {
this->m_isa = Target::AVX11; this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c" this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -529,6 +550,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx1.1-i64x4")) { else if (!strcasecmp(isa, "avx1.1-i64x4")) {
this->m_isa = Target::AVX11; this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c" this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -549,6 +571,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx2-i32x8")) { !strcasecmp(isa, "avx2-i32x8")) {
this->m_isa = Target::AVX2; this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -573,6 +596,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx2-i32x16")) { !strcasecmp(isa, "avx2-i32x16")) {
this->m_isa = Target::AVX2; this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 16; this->m_nativeVectorWidth = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -596,6 +620,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx2-i64x4")) { else if (!strcasecmp(isa, "avx2-i64x4")) {
this->m_isa = Target::AVX2; this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -620,6 +645,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "neon-i8x16")) { else if (!strcasecmp(isa, "neon-i8x16")) {
this->m_isa = Target::NEON8; this->m_isa = Target::NEON8;
this->m_nativeVectorWidth = 16; this->m_nativeVectorWidth = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+neon,+fp16"; this->m_attributes = "+neon,+fp16";
this->m_hasHalf = true; // ?? this->m_hasHalf = true; // ??
@@ -629,6 +655,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "neon-i16x8")) { else if (!strcasecmp(isa, "neon-i16x8")) {
this->m_isa = Target::NEON16; this->m_isa = Target::NEON16;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+neon,+fp16"; this->m_attributes = "+neon,+fp16";
this->m_hasHalf = true; // ?? this->m_hasHalf = true; // ??
@@ -639,6 +666,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "neon-i32x4")) { !strcasecmp(isa, "neon-i32x4")) {
this->m_isa = Target::NEON32; this->m_isa = Target::NEON32;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+neon,+fp16"; this->m_attributes = "+neon,+fp16";
this->m_hasHalf = true; // ?? this->m_hasHalf = true; // ??
@@ -773,6 +801,7 @@ Target::SupportedTargets() {
#endif
"sse2-i32x4, sse2-i32x8, "
"sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, "
+"avx1-i32x4, "
"avx1-i32x8, avx1-i32x16, avx1-i64x4, "
"avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 "
"avx2-i32x8, avx2-i32x16, avx2-i64x4, "
@@ -810,6 +839,9 @@ Target::GetTripleString() const {
return triple.str();
}
+// This function returns a string representation of the ISA for mangling
+// purposes; it may return any unique string, preferably a short one such
+// as sse4 or avx.
const char *
Target::ISAToString(ISA isa) {
switch (isa) {
@@ -845,6 +877,45 @@ Target::GetISAString() const {
}
+// This function returns the string representation of the default target
+// corresponding to an ISA, e.g. sse4-i32x4 for SSE4 and avx1.1-i32x8 for
+// AVX11. The string may be used to initialize a Target.
+const char *
+Target::ISAToTargetString(ISA isa) {
+switch (isa) {
+#ifdef ISPC_ARM_ENABLED
+case Target::NEON8:
+return "neon-8";
+case Target::NEON16:
+return "neon-16";
+case Target::NEON32:
+return "neon-32";
+#endif
+case Target::SSE2:
+return "sse2-i32x4";
+case Target::SSE4:
+return "sse4-i32x4";
+case Target::AVX:
+return "avx1-i32x8";
+case Target::AVX11:
+return "avx1.1-i32x8";
+case Target::AVX2:
+return "avx2-i32x8";
+case Target::GENERIC:
+return "generic-4";
+default:
+FATAL("Unhandled target in ISAToTargetString()");
+}
+return "";
+}
+const char *
+Target::GetISATargetString() const {
+return ISAToTargetString(m_isa);
+}
static bool
lGenericTypeLayoutIndeterminate(llvm::Type *type) {
if (type->isPrimitiveType() || type->isIntegerTy())
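ISAToString() and the new ISAToTargetString() now serve distinct purposes: the former yields a short unique tag for name mangling, while the latter yields a fully qualified target name suitable for constructing a Target. A Python transcription of the defaults returned above (illustrative only):

    # Default target string per ISA, transcribed from ISAToTargetString().
    isa_default_target = {
        "SSE2":    "sse2-i32x4",
        "SSE4":    "sse4-i32x4",
        "AVX":     "avx1-i32x8",
        "AVX11":   "avx1.1-i32x8",
        "AVX2":    "avx2-i32x8",
        "GENERIC": "generic-4",
    }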

ispc.h

@@ -214,9 +214,16 @@ public:
/** Convert ISA enum to string */
static const char *ISAToString(Target::ISA isa);
-/** Returns a string like "avx" encoding the target. */
+/** Returns a string like "avx" encoding the target. Good for mangling. */
const char *GetISAString() const;
+/** Convert ISA enum to a default target string */
+static const char *ISAToTargetString(Target::ISA isa);
+/** Returns a string like "avx1.1-i32x8" encoding the target.
+This may be used for Target initialization. */
+const char *GetISATargetString() const;
/** Returns the size of the given type */
llvm::Value *SizeOf(llvm::Type *type,
llvm::BasicBlock *insertAtEnd);
@@ -253,6 +260,8 @@ public:
int getNativeVectorWidth() const {return m_nativeVectorWidth;}
+int getDataTypeWidth() const {return m_dataTypeWidth;}
int getVectorWidth() const {return m_vectorWidth;}
bool getGeneratePIC() const {return m_generatePIC;}
@@ -319,10 +328,14 @@ private:
#endif
/** Native vector width of the vector instruction set. Note that this
-value is directly derived from the ISA Being used (e.g. it's 4 for
+value is directly derived from the ISA being used (e.g. it's 4 for
SSE, 8 for AVX, etc.) */
int m_nativeVectorWidth;
+/** Data type width in bits. Typically it's 32, but could be 8, 16 or 64.
+For generic it's -1, which means undefined. */
+int m_dataTypeWidth;
/** Actual vector width currently being compiled to. This may be an
integer multiple of the native vector width, for example if we're
"doubling up" and compiling 8-wide on a 4-wide SSE system. */


@@ -2443,7 +2443,7 @@ Module::CompileAndOutput(const char *srcFile,
int i = 0;
const char *firstISA;
while (i < Target::NUM_ISAS && firstTargetMachine == NULL) {
-firstISA = Target::ISAToString((Target::ISA) i);
+firstISA = Target::ISAToTargetString((Target::ISA) i);
firstTargetMachine = targetMachines[i++];
}
Assert(firstTargetMachine != NULL);
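This one-line change is the payoff of the new function: when compiling for multiple targets, Module::CompileAndOutput() must name a concrete first target, and a short mangling tag does not pin down which variant of an ISA is meant. The switch relies on an intended round-trip property, sketched here against the isa_default_target mapping above (parse_target() is a hypothetical stand-in for the Target constructor's ISA-string parsing):

    # Hypothetical round-trip check: every default target string should be
    # accepted by the target parser and resolve back to the same ISA.
    for isa, tgt in isa_default_target.items():
        assert parse_target(tgt).isa == isa   # parse_target() is assumed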

perf.py

@@ -391,6 +391,10 @@ def perf(options1, args):
# end of preparations
print_debug("Okey go go go!\n\n", s, perf_log)
+# report command line
+if __name__ == "__main__":
+print_debug("Command line: %s\n" % " ".join(map(str, sys.argv)), s, perf_log)
+# report used ispc
print_debug("Testing ispc: " + ispc_test + "\n", s, perf_log)
#print compilers versions
@@ -419,11 +423,15 @@ def perf(options1, args):
# read parameters of test
command = lines[i+2]
command = command[:-1]
+# handle conditional target argument
+target_str = ""
+if options.perf_target != "":
+target_str = " ISPC_IA_TARGETS="+options.perf_target
if is_windows == False:
ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref"
ex_command = "./test " + command + " >> " + perf_temp + "_test"
-bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+" >> "+build_log+" 2>> "+build_log
+bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+target_str+" >> "+build_log+" 2>> "+build_log
-bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+" >> "+build_log+" 2>> "+build_log
+bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+target_str+" >> "+build_log+" 2>> "+build_log
re_command = "make clean >> "+build_log
else:
ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref"
@@ -503,5 +511,7 @@ if __name__ == "__main__":
help='set reference compiler for compare', default="")
parser.add_option('-f', '--file', dest='in_file',
help='file to save perf output', default="")
+parser.add_option('-t', '--target', dest='perf_target',
+help='set ispc target for building benchmarks (both test and ref)', default="")
(options, args) = parser.parse_args()
perf(options, args)
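With the new -t/--target option, a whole benchmark run can be pinned to one target, e.g. python perf.py -t avx1-i32x4. A minimal sketch of how the flag reaches the example builds, following the hunks above (compiler and binary names are placeholders):

    import subprocess

    def build_example(cxx, cc, ispc_bin, perf_target=""):
        # Mirrors perf.py: append ISPC_IA_TARGETS only when a target was
        # given, so the examples' Makefile defaults apply otherwise.
        target_str = ""
        if perf_target != "":
            target_str = " ISPC_IA_TARGETS=" + perf_target
        cmd = ("make CXX=" + cxx + " CC=" + cc +
               " EXAMPLE=test ISPC=" + ispc_bin + target_str)
        subprocess.call(cmd, shell=True)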