Merged with upstream/master

This commit is contained in:
evghenii
2013-11-22 08:13:16 +01:00
33 changed files with 795 additions and 394 deletions

View File

@@ -83,6 +83,10 @@ ifeq ($(LLVM_VERSION),LLVM_3_4)
ISPC_LIBS += -lcurses ISPC_LIBS += -lcurses
endif endif
ifeq ($(LLVM_VERSION),LLVM_3_5)
ISPC_LIBS += -lcurses
endif
ifeq ($(ARCH_OS),Linux) ifeq ($(ARCH_OS),Linux)
ISPC_LIBS += -ldl ISPC_LIBS += -ldl
endif endif

View File

@@ -214,7 +214,7 @@ def check_targets():
try_do_LLVM("build check_ISA", "cl check_isa.cpp", True) try_do_LLVM("build check_ISA", "cl check_isa.cpp", True)
SSE2 = ["sse2-i32x4", "sse2-i32x8"] SSE2 = ["sse2-i32x4", "sse2-i32x8"]
SSE4 = ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] SSE4 = ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"]
AVX = ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] AVX = ["avx1-i32x4", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4"]
AVX11 = ["avx1.1-i32x8","avx1.1-i32x16","avx1.1-i64x4"] AVX11 = ["avx1.1-i32x8","avx1.1-i32x16","avx1.1-i64x4"]
AVX2 = ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] AVX2 = ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"]
targets = [["AVX2", AVX2, False], ["AVX1.1", AVX11, False], ["AVX", AVX, False], ["SSE4", SSE4, False], ["SSE2", SSE2, False]] targets = [["AVX2", AVX2, False], ["AVX1.1", AVX11, False], ["AVX", AVX, False], ["SSE4", SSE4, False], ["SSE2", SSE2, False]]
@@ -251,7 +251,7 @@ def check_targets():
if targets[3][2] == False and "wsm" in f_lines[i]: if targets[3][2] == False and "wsm" in f_lines[i]:
answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]]
if targets[2][2] == False and "snb" in f_lines[i]: if targets[2][2] == False and "snb" in f_lines[i]:
answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] answer_sde = answer_sde + [["-snb", "avx1-i32x4"], ["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]]
if targets[1][2] == False and "ivb" in f_lines[i]: if targets[1][2] == False and "ivb" in f_lines[i]:
answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"], ["-ivb", "avx1.1-i64x4"]] answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"], ["-ivb", "avx1.1-i64x4"]]
if targets[0][2] == False and "hsw" in f_lines[i]: if targets[0][2] == False and "hsw" in f_lines[i]:
@@ -495,6 +495,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
performance.ref = "ispc_ref" performance.ref = "ispc_ref"
if current_OS == "Windows": if current_OS == "Windows":
performance.ref = "ispc_ref.exe" performance.ref = "ispc_ref.exe"
performance.perf_target = ""
performance.in_file = "." + os.sep + f_date + os.sep + "performance.log" performance.in_file = "." + os.sep + f_date + os.sep + "performance.log"
# prepare LLVM 3.3 as newest LLVM # prepare LLVM 3.3 as newest LLVM
need_LLVM = check_LLVM(["3.3"]) need_LLVM = check_LLVM(["3.3"])

View File

@@ -967,11 +967,31 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
case Target::AVX: { case Target::AVX: {
switch (g->target->getVectorWidth()) { switch (g->target->getVectorWidth()) {
case 4: case 4:
if (runtime32) { if (g->target->getDataTypeWidth() == 32) {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); // Note here that for avx1-i32x4 we are using bitcode file for
} // sse4-i32x4. This is intentional and good enough.
else { // AVX target implies appropriate target-feature attrbute,
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); // which forces LLVM to generate AVX code, even for SSE4
// intrinsics. Except that the only "missing" feature in sse4
// target is implemenation of __masked_[store|load]_[i32|i64]
// using maskmov instruction. But it's not very popular
// intrinsics, so we assume the implementation to be good
// enough at the moment.
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_sse4_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_sse4_64bit);
}
} else if (g->target->getDataTypeWidth() == 64) {
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit);
}
} else {
FATAL("logic error in DefineStdlib");
} }
break; break;
case 8: case 8:

View File

@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation ;; Copyright (c) 2010-2013, Intel Corporation
;; All rights reserved. ;; All rights reserved.
;; ;;
;; Redistribution and use in source and binary forms, with or without ;; Redistribution and use in source and binary forms, with or without
@@ -31,30 +31,16 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; AVX target implementation. ;; AVX target implementation.
;;
;; Please note that this file uses SSE intrinsics, but LLVM generates AVX
;; instructions, so it doesn't makes sense to change this implemenation.
ctlztz() ctlztz()
define_prefetches() define_prefetches()
define_shuffles() define_shuffles()
aossoa() aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats ;; rounding floats
@@ -77,7 +63,8 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline {
; r3 = a3 ; r3 = a3
; ;
; It doesn't matter what we pass as a, since we only need the r0 value ; It doesn't matter what we pass as a, since we only need the r0 value
; here. So we pass the same register for both. ; here. So we pass the same register for both. Further, only the 0th
; element of the b parameter matters
%xi = insertelement <4 x float> undef, float %0, i32 0 %xi = insertelement <4 x float> undef, float %0, i32 0
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
%rs = extractelement <4 x float> %xr, i32 0 %rs = extractelement <4 x float> %xr, i32 0
@@ -117,7 +104,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline {
define double @__floor_uniform_double(double) nounwind readonly alwaysinline { define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0 %rs = extractelement <2 x double> %xr, i32 0
ret double %rs ret double %rs
@@ -126,12 +113,31 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0 %rs = extractelement <2 x double> %xr, i32 0
ret double %rs ret double %rs
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration to improve precision, as above
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt ;; rsqrt
@@ -144,6 +150,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v) %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0 %is = extractelement <4 x float> %vis, i32 0
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is); ; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is %v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is %v_is_is = fmul float %v_is, %is
@@ -164,9 +171,18 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
ret float %ret ret float %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fastmath ;; fast math mode
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
@@ -200,6 +216,22 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
ret float %ret ret float %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max ;; int min/max
@@ -235,7 +267,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops ;; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i32 @llvm.ctpop.i32(i32) nounwind readnone
@@ -251,32 +283,6 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
ret i64 %call ret i64 %call
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins ;; int8/int16 builtins

View File

@@ -49,11 +49,10 @@ include(`target-avx-common.ll')
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline { define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
; do one N-R iteration to improve precision
; float iv = __rcp_v(v); ; float iv = __rcp_v(v);
; return iv * (2. - v * iv); ; return iv * (2. - v * iv);
%call = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %0) %call = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %0)
; do one N-R iteration
%v_iv = fmul <8 x float> %0, %call %v_iv = fmul <8 x float> %0, %call
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2., %two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv float 2., float 2., float 2., float 2.>, %v_iv
@@ -61,6 +60,46 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinl
ret <8 x float> %iv_mul ret <8 x float> %iv_mul
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <8 x float> %v, %is
%v_is_is = fmul <8 x float> %v_is, %is
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <8 x float> %is, %three_sub
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <8 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats ;; rounding floats
@@ -94,58 +133,15 @@ define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwa
} }
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9 ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to8double(%0, 9) round4to8double(%0, 9)
} }
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10 ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to8double(%0, 10) round4to8double(%0, 10)
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <8 x float> %v, %is
%v_is_is = fmul <8 x float> %v_is, %is
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <8 x float> %is, %three_sub
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <8 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
include(`svml.m4')
;; single precision
svml_declare(float,f8,8)
svml_define(float,f8,8,f)
;; double precision
svml_declare(double,4,4)
svml_define_x(double,4,4,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max ;; float min/max
@@ -166,7 +162,37 @@ define <8 x float> @__min_varying_float(<8 x float>,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops ;; double precision min/max
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
ret <8 x double> %ret
}
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
include(`svml.m4')
;; single precision
svml_declare(float,f8,8)
svml_define(float,f8,8,f)
;; double precision
svml_declare(double,4,4)
svml_define_x(double,4,4,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; mask handling
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
@@ -198,6 +224,9 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp ret i1 %cmp
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal ops / reductions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops ;; horizontal float ops
@@ -216,12 +245,36 @@ define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__min_varying_float, @__min_uniform_float) reduce8(float, @__min_varying_float, @__min_uniform_float)
} }
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__max_varying_float, @__max_uniform_float) reduce8(float, @__max_varying_float, @__max_uniform_float)
} }
reduce_equal(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
%v0 = shufflevector <8 x double> %0, <8 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%final0 = extractelement <4 x double> %sum1, i32 0
%final1 = extractelement <4 x double> %sum1, i32 2
%sum = fadd double %final0, %final1
ret double %sum
}
define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops ;; horizontal int8 ops
@@ -262,6 +315,7 @@ define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops ;; horizontal int32 ops
;; helper functions
define <8 x i32> @__add_varying_int32(<8 x i32>, define <8 x i32> @__add_varying_int32(<8 x i32>,
<8 x i32>) nounwind readnone alwaysinline { <8 x i32>) nounwind readnone alwaysinline {
%s = add <8 x i32> %0, %1 %s = add <8 x i32> %0, %1
@@ -273,16 +327,15 @@ define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
ret i32 %s ret i32 %s
} }
;; reduction functions
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline { define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__add_varying_int32, @__add_uniform_int32) reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
} }
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline { define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_int32, @__min_uniform_int32) reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
} }
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32) reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
} }
@@ -295,38 +348,11 @@ define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
%v0 = shufflevector <8 x double> %0, <8 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%final0 = extractelement <4 x double> %sum1, i32 0
%final1 = extractelement <4 x double> %sum1, i32 2
%sum = fadd double %final0, %final1
ret double %sum
}
define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops ;; horizontal int64 ops
;; helper functions
define <8 x i64> @__add_varying_int64(<8 x i64>, define <8 x i64> @__add_varying_int64(<8 x i64>,
<8 x i64>) nounwind readnone alwaysinline { <8 x i64>) nounwind readnone alwaysinline {
%s = add <8 x i64> %0, %1 %s = add <8 x i64> %0, %1
@@ -338,6 +364,7 @@ define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
ret i64 %s ret i64 %s
} }
;; reduction functions
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline { define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__add_varying_int64, @__add_uniform_int64) reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
} }
@@ -362,6 +389,7 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
} }
reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts ;; unaligned loads/loads+broadcasts
@@ -446,6 +474,10 @@ define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
ret void ret void
} }
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store blend
masked_store_blend_8_16_by_8() masked_store_blend_8_16_by_8()
@@ -517,8 +549,6 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void ret void
} }
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter ;; scatter
@@ -529,30 +559,3 @@ gen_scatter(float)
gen_scatter(i64) gen_scatter(i64)
gen_scatter(double) gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
ret <8 x double> %ret
}
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
ret <8 x double> %ret
}

View File

@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation ;; Copyright (c) 2010-2013, Intel Corporation
;; All rights reserved. ;; All rights reserved.
;; ;;
;; Redistribution and use in source and binary forms, with or without ;; Redistribution and use in source and binary forms, with or without
@@ -29,6 +29,9 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; SSE4 target implementation.
ctlztz() ctlztz()
define_prefetches() define_prefetches()
define_shuffles() define_shuffles()
@@ -67,7 +70,7 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline {
define float @__floor_uniform_float(float) nounwind readonly alwaysinline { define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0 %xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9 ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
%rs = extractelement <4 x float> %xr, i32 0 %rs = extractelement <4 x float> %xr, i32 0
ret float %rs ret float %rs
@@ -97,7 +100,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline {
define double @__floor_uniform_double(double) nounwind readonly alwaysinline { define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0 %rs = extractelement <2 x double> %xr, i32 0
ret double %rs ret double %rs
@@ -106,7 +109,7 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0 %rs = extractelement <2 x double> %xr, i32 0
ret double %rs ret double %rs
@@ -119,6 +122,8 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call ; do the rcpss call
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%vecval = insertelement <4 x float> undef, float %0, i32 0 %vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0 %scall = extractelement <4 x float> %call, i32 0
@@ -130,9 +135,8 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
ret float %iv_mul ret float %iv_mul
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt ;; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
@@ -154,7 +158,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt ;; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
@@ -163,6 +167,16 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
ret float %ret ret float %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode ;; fast math mode
@@ -198,36 +212,25 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
ret float %ret ret float %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max ;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define double @__min_uniform_double(double, double) nounwind readnone { define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret ret double %ret
} }
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
define double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret ret double %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max ;; int min/max
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -242,8 +245,9 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
ret i32 %ret ret i32 %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max ;; unsigned int min/max
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
@@ -258,9 +262,8 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
ret i32 %ret ret i32 %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions ;; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i32 @llvm.ctpop.i32(i32) nounwind readnone

View File

@@ -58,10 +58,10 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline { define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration to improve precision ; do one N-R iteration to improve precision
; float iv = __rcp_v(v); ; float iv = __rcp_v(v);
; return iv * (2. - v * iv); ; return iv * (2. - v * iv);
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
%v_iv = fmul <4 x float> %0, %call %v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus %iv_mul = fmul <4 x float> %call, %two_minus
@@ -87,7 +87,7 @@ define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwa
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt ;; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
@@ -154,16 +154,34 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alway
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { define <4 x float> @__max_varying_float(<4 x float>,
<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1) %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call ret <4 x float> %call
} }
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { define <4 x float> @__min_varying_float(<4 x float>,
<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1) %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call ret <4 x float> %call
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
}
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max ;; int32 min/max
@@ -191,23 +209,7 @@ define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly a
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max ;; svml stuff
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
}
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
include(`svml.m4') include(`svml.m4')
;; single precision ;; single precision
@@ -219,7 +221,7 @@ svml_declare(double,2,2)
svml_define_x(double,2,2,d,4) svml_define_x(double,2,2,d,4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions ;; mask handling
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
@@ -251,6 +253,55 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp ret i1 %cmp
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal ops / reductions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
%scalar = extractelement <4 x float> %v2, i32 0
ret float %scalar
}
define float @__reduce_min_float(<4 x float>) nounwind readnone alwaysinline {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<4 x float>) nounwind readnone alwaysinline {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
define double @__reduce_add_double(<4 x double>) nounwind readnone alwaysinline {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
@@ -266,6 +317,9 @@ define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
ret i16 %r16 ret i16 %r16
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops
define internal <4 x i16> @__add_varying_i16(<4 x i16>, define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline { <4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1 %r = add <4 x i16> %0, %1
@@ -281,24 +335,11 @@ define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16) reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
} }
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { ;; reduction functions
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone alwaysinline {
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
%scalar = extractelement <4 x float> %v2, i32 0
ret float %scalar
}
define float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef, %v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef> <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%m1 = add <4 x i32> %v1, %v %m1 = add <4 x i32> %v1, %v
@@ -308,44 +349,27 @@ define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
ret i32 %sum ret i32 %sum
} }
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone { define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32) reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
} }
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone { define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32) reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
} }
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone { define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32) reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
} }
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone { define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32) reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops
define double @__reduce_add_double(<4 x double>) nounwind readnone { ;; reduction functions
%v0 = shufflevector <4 x double> %0, <4 x double> undef, define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define double @__reduce_min_double(<4 x double>) nounwind readnone {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<4 x double>) nounwind readnone {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef, %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 0, i32 1> <2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef, %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
@@ -357,27 +381,50 @@ define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
ret i64 %m ret i64 %m
} }
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone { define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64) reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
} }
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64) reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
} }
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64) reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
} }
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
} }
reduce_equal(4) reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store ;; masked store
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store blend
masked_store_blend_8_16_by_4()
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone <4 x float>) nounwind readnone
@@ -444,29 +491,6 @@ define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
ret void ret void
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
masked_store_blend_8_16_by_4()
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter ;; gather/scatter

View File

@@ -2196,7 +2196,7 @@ bool CWriter::doInitialization(llvm::Module &M) {
#endif #endif
TAsm = new CBEMCAsmInfo(); TAsm = new CBEMCAsmInfo();
MRI = new llvm::MCRegisterInfo(); MRI = new llvm::MCRegisterInfo();
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
TCtx = new llvm::MCContext(TAsm, MRI, NULL); TCtx = new llvm::MCContext(TAsm, MRI, NULL);
#else #else
TCtx = new llvm::MCContext(*TAsm, *MRI, NULL); TCtx = new llvm::MCContext(*TAsm, *MRI, NULL);

View File

@@ -350,7 +350,7 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
AssertPos(currentPos, diSubprogramType.Verify()); AssertPos(currentPos, diSubprogramType.Verify());
} }
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
Assert(diSubprogramType.isCompositeType()); Assert(diSubprogramType.isCompositeType());
llvm::DICompositeType diSubprogramType_n = llvm::DICompositeType diSubprogramType_n =
static_cast<llvm::DICompositeType>(diSubprogramType); static_cast<llvm::DICompositeType>(diSubprogramType);

View File

@@ -2,7 +2,7 @@
EXAMPLE=ao EXAMPLE=ao
CPP_SRC=ao.cpp ao_serial.cpp CPP_SRC=ao.cpp ao_serial.cpp
ISPC_SRC=ao.ispc ISPC_SRC=ao.ispc
ISPC_IA_TARGETS=sse2,sse4,avx ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -16,8 +16,26 @@ ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)
ifeq ($(ARCH),x86) ifeq ($(ARCH),x86)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o)
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o) COMMA=,
ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS)))
#$(info multi-target detected: $(ISPC_IA_TARGETS))
ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o)
endif
ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o)
endif
ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o)
endif
ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o)
endif
ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o)
endif
endif
ISPC_TARGETS=$(ISPC_IA_TARGETS) ISPC_TARGETS=$(ISPC_IA_TARGETS)
ARCH_BIT:=$(shell getconf LONG_BIT) ARCH_BIT:=$(shell getconf LONG_BIT)
ifeq ($(ARCH_BIT),32) ifeq ($(ARCH_BIT),32)
@@ -66,9 +84,9 @@ objs/%.o: %.c dirs $(ISPC_HEADER)
objs/%.o: ../%.cpp dirs objs/%.o: ../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@ $(CXX) $< $(CXXFLAGS) -c -o $@
objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc dirs
$(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)

View File

@@ -2,7 +2,7 @@
EXAMPLE=deferred_shading EXAMPLE=deferred_shading
CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
ISPC_SRC=kernels.ispc ISPC_SRC=kernels.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
ISPC_FLAGS=--opt=fast-math ISPC_FLAGS=--opt=fast-math

View File

@@ -3,7 +3,7 @@ EXAMPLE=gmres
CPP_SRC=algorithm.cpp main.cpp matrix.cpp CPP_SRC=algorithm.cpp main.cpp matrix.cpp
CC_SRC=mmio.c CC_SRC=mmio.c
ISPC_SRC=matrix.ispc ISPC_SRC=matrix.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -2,7 +2,7 @@
EXAMPLE=mandelbrot EXAMPLE=mandelbrot
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
ISPC_SRC=mandelbrot.ispc ISPC_SRC=mandelbrot.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -2,7 +2,7 @@
EXAMPLE=mandelbrot_tasks EXAMPLE=mandelbrot_tasks
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
ISPC_SRC=mandelbrot_tasks.ispc ISPC_SRC=mandelbrot_tasks.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -2,7 +2,7 @@
EXAMPLE=noise EXAMPLE=noise
CPP_SRC=noise.cpp noise_serial.cpp CPP_SRC=noise.cpp noise_serial.cpp
ISPC_SRC=noise.ispc ISPC_SRC=noise.ispc
ISPC_IA_TARGETS=sse2,sse4,avx-x2 ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -2,7 +2,7 @@
EXAMPLE=options EXAMPLE=options
CPP_SRC=options.cpp options_serial.cpp CPP_SRC=options.cpp options_serial.cpp
ISPC_SRC=options.ispc ISPC_SRC=options.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -2,7 +2,7 @@
EXAMPLE=perbench EXAMPLE=perbench
CPP_SRC=perfbench.cpp perfbench_serial.cpp CPP_SRC=perfbench.cpp perfbench_serial.cpp
ISPC_SRC=perfbench.ispc ISPC_SRC=perfbench.ispc
ISPC_IA_TARGETS=sse2,sse4,avx ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -2,7 +2,7 @@
EXAMPLE=rt EXAMPLE=rt
CPP_SRC=rt.cpp rt_serial.cpp CPP_SRC=rt.cpp rt_serial.cpp
ISPC_SRC=rt.ispc ISPC_SRC=rt.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -2,7 +2,7 @@
EXAMPLE=sort EXAMPLE=sort
CPP_SRC=sort.cpp sort_serial.cpp CPP_SRC=sort.cpp sort_serial.cpp
ISPC_SRC=sort.ispc ISPC_SRC=sort.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
#ISPC_FLAGS=-DDEBUG #ISPC_FLAGS=-DDEBUG

View File

@@ -2,7 +2,7 @@
EXAMPLE=stencil EXAMPLE=stencil
CPP_SRC=stencil.cpp stencil_serial.cpp CPP_SRC=stencil.cpp stencil_serial.cpp
ISPC_SRC=stencil.ispc ISPC_SRC=stencil.ispc
ISPC_IA_TARGETS=avx ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -693,10 +693,20 @@ InitTaskSystem() {
} }
char name[32]; char name[32];
sprintf(name, "ispc_task.%d", (int)getpid()); bool success = false;
workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0); srand(time(NULL));
if (!workerSemaphore) { for (int i = 0; i < 10; i++) {
fprintf(stderr, "Error creating semaphore: %s\n", strerror(err)); sprintf(name, "ispc_task.%d.%d", (int)getpid(), (int)rand());
workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
if (workerSemaphore != SEM_FAILED) {
success = true;
break;
}
fprintf(stderr, "Failed to create %s\n", name);
}
if (!success) {
fprintf(stderr, "Error creating semaphore (%s): %s\n", name, strerror(errno));
exit(1); exit(1);
} }

View File

@@ -2,7 +2,7 @@
EXAMPLE=volume EXAMPLE=volume
CPP_SRC=volume.cpp volume_serial.cpp CPP_SRC=volume.cpp volume_serial.cpp
ISPC_SRC=volume.ispc ISPC_SRC=volume.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -287,24 +287,12 @@
./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 *
./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 *
./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 *
./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 *
./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 *
./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 *
./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O2 *
./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O2 *
./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 *
./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 *
./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 *
./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * ./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 *
@@ -462,3 +450,125 @@
.\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * .\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 *
.\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * .\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 *
.\tests\reduce-equal-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\reduce-equal-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-10.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-11.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-12.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-13.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-14.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-2.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-3.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-4.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-9.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-10.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-12.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-13.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-2.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-3.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-4.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-5.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-6.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-11.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-12.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-13.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-14.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-2.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-3.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-4.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/atomics-9.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-12.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-13.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-2.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-3.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-4.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-10.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-11.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-12.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-13.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-14.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-2.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-3.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-4.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-9.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-10.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-12.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-13.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-2.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-3.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-4.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-5.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-6.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-11.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-12.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-13.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-14.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-2.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-3.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-4.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-9.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-12.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-13.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-2.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-3.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-4.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *

View File

@@ -193,6 +193,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
m_tf_attributes(NULL), m_tf_attributes(NULL),
#endif #endif
m_nativeVectorWidth(-1), m_nativeVectorWidth(-1),
m_dataTypeWidth(-1),
m_vectorWidth(-1), m_vectorWidth(-1),
m_generatePIC(pic), m_generatePIC(pic),
m_maskingIsFree(false), m_maskingIsFree(false),
@@ -317,9 +318,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "sse2-i32x4")) { !strcasecmp(isa, "sse2-i32x4")) {
this->m_isa = Target::SSE2; this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",-sse4.1,-sse4.2" ",-sse4.1,-sse4.2"
#else #else
",-sse41,-sse42" ",-sse41,-sse42"
@@ -332,9 +334,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "sse2-i32x8")) { !strcasecmp(isa, "sse2-i32x8")) {
this->m_isa = Target::SSE2; this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",-sse4.1,-sse4.2" ",-sse4.1,-sse4.2"
#else #else
",-sse41,-sse42" ",-sse41,-sse42"
@@ -347,10 +350,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "sse4-i32x4")) { !strcasecmp(isa, "sse4-i32x4")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
// TODO: why not sse42 and popcnt? // TODO: why not sse42 and popcnt?
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+sse4.1,-sse4.2" ",+sse4.1,-sse4.2"
#else #else
",+sse41,-sse42" ",+sse41,-sse42"
@@ -364,9 +368,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "sse4-i32x8")) { !strcasecmp(isa, "sse4-i32x8")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+sse4.1,-sse4.2" ",+sse4.1,-sse4.2"
#else #else
",+sse41,-sse42" ",+sse41,-sse42"
@@ -378,9 +383,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
else if (!strcasecmp(isa, "sse4-i8x16")) { else if (!strcasecmp(isa, "sse4-i8x16")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 16; this->m_nativeVectorWidth = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+sse4.1,-sse4.2" ",+sse4.1,-sse4.2"
#else #else
",+sse41,-sse42" ",+sse41,-sse42"
@@ -392,9 +398,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
else if (!strcasecmp(isa, "sse4-i16x8")) { else if (!strcasecmp(isa, "sse4-i16x8")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+sse4.1,-sse4.2" ",+sse4.1,-sse4.2"
#else #else
",+sse41,-sse42" ",+sse41,-sse42"
@@ -466,11 +473,21 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
this->m_maskBitCount = 32; this->m_maskBitCount = 32;
} }
else if (!strcasecmp(isa, "avx1-i32x4")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false;
this->m_maskBitCount = 32;
}
else if (!strcasecmp(isa, "avx") || else if (!strcasecmp(isa, "avx") ||
!strcasecmp(isa, "avx1") || !strcasecmp(isa, "avx1") ||
!strcasecmp(isa, "avx1-i32x8")) { !strcasecmp(isa, "avx1-i32x8")) {
this->m_isa = Target::AVX; this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov"; this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
@@ -480,6 +497,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "avx1-i64x4")) { !strcasecmp(isa, "avx1-i64x4")) {
this->m_isa = Target::AVX; this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov"; this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
@@ -490,6 +508,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "avx1-i32x16")) { !strcasecmp(isa, "avx1-i32x16")) {
this->m_isa = Target::AVX; this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov"; this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
@@ -499,9 +518,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "avx1.1-i32x8")) { !strcasecmp(isa, "avx1.1-i32x8")) {
this->m_isa = Target::AVX11; this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c" this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+rdrnd" ",+rdrnd"
#else #else
",+rdrand" ",+rdrand"
@@ -519,9 +539,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "avx1.1-i32x16")) { !strcasecmp(isa, "avx1.1-i32x16")) {
this->m_isa = Target::AVX11; this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c" this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+rdrnd" ",+rdrnd"
#else #else
",+rdrand" ",+rdrand"
@@ -538,9 +559,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
else if (!strcasecmp(isa, "avx1.1-i64x4")) { else if (!strcasecmp(isa, "avx1.1-i64x4")) {
this->m_isa = Target::AVX11; this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c" this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+rdrnd" ",+rdrnd"
#else #else
",+rdrand" ",+rdrand"
@@ -558,9 +580,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "avx2-i32x8")) { !strcasecmp(isa, "avx2-i32x8")) {
this->m_isa = Target::AVX2; this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+rdrnd" ",+rdrnd"
#else #else
",+rdrand" ",+rdrand"
@@ -582,9 +605,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "avx2-i32x16")) { !strcasecmp(isa, "avx2-i32x16")) {
this->m_isa = Target::AVX2; this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 16; this->m_nativeVectorWidth = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+rdrnd" ",+rdrnd"
#else #else
",+rdrand" ",+rdrand"
@@ -605,9 +629,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
else if (!strcasecmp(isa, "avx2-i64x4")) { else if (!strcasecmp(isa, "avx2-i64x4")) {
this->m_isa = Target::AVX2; this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
",+rdrnd" ",+rdrnd"
#else #else
",+rdrand" ",+rdrand"
@@ -629,6 +654,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
else if (!strcasecmp(isa, "neon-i8x16")) { else if (!strcasecmp(isa, "neon-i8x16")) {
this->m_isa = Target::NEON8; this->m_isa = Target::NEON8;
this->m_nativeVectorWidth = 16; this->m_nativeVectorWidth = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+neon,+fp16"; this->m_attributes = "+neon,+fp16";
this->m_hasHalf = true; // ?? this->m_hasHalf = true; // ??
@@ -638,6 +664,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
else if (!strcasecmp(isa, "neon-i16x8")) { else if (!strcasecmp(isa, "neon-i16x8")) {
this->m_isa = Target::NEON16; this->m_isa = Target::NEON16;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+neon,+fp16"; this->m_attributes = "+neon,+fp16";
this->m_hasHalf = true; // ?? this->m_hasHalf = true; // ??
@@ -648,6 +675,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
!strcasecmp(isa, "neon-i32x4")) { !strcasecmp(isa, "neon-i32x4")) {
this->m_isa = Target::NEON32; this->m_isa = Target::NEON32;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+neon,+fp16"; this->m_attributes = "+neon,+fp16";
this->m_hasHalf = true; // ?? this->m_hasHalf = true; // ??
@@ -799,6 +827,7 @@ Target::SupportedTargets() {
#endif #endif
"sse2-i32x4, sse2-i32x8, " "sse2-i32x4, sse2-i32x8, "
"sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, "
"avx1-i32x4, "
"avx1-i32x8, avx1-i32x16, avx1-i64x4, " "avx1-i32x8, avx1-i32x16, avx1-i64x4, "
"avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 " "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 "
"avx2-i32x8, avx2-i32x16, avx2-i64x4, " "avx2-i32x8, avx2-i32x16, avx2-i64x4, "
@@ -840,6 +869,9 @@ Target::GetTripleString() const {
return triple.str(); return triple.str();
} }
// This function returns string representation of ISA for the purpose of
// mangling. And may return any unique string, preferably short, like
// sse4, avx and etc.
const char * const char *
Target::ISAToString(ISA isa) { Target::ISAToString(ISA isa) {
switch (isa) { switch (isa) {
@@ -877,6 +909,45 @@ Target::GetISAString() const {
} }
// This function returns string representation of default target corresponding
// to ISA. I.e. for SSE4 it's sse4-i32x4, for AVX11 it's avx1.1-i32x8. This
// string may be used to initialize Target.
const char *
Target::ISAToTargetString(ISA isa) {
switch (isa) {
#ifdef ISPC_ARM_ENABLED
case Target::NEON8:
return "neon-8";
case Target::NEON16:
return "neon-16";
case Target::NEON32:
return "neon-32";
#endif
case Target::SSE2:
return "sse2-i32x4";
case Target::SSE4:
return "sse4-i32x4";
case Target::AVX:
return "avx1-i32x8";
case Target::AVX11:
return "avx1.1-i32x8";
case Target::AVX2:
return "avx2-i32x8";
case Target::GENERIC:
return "generic-4";
default:
FATAL("Unhandled target in ISAToTargetString()");
}
return "";
}
const char *
Target::GetISATargetString() const {
return ISAToString(m_isa);
}
static bool static bool
lGenericTypeLayoutIndeterminate(llvm::Type *type) { lGenericTypeLayoutIndeterminate(llvm::Type *type) {
if (type->isPrimitiveType() || type->isIntegerTy()) if (type->isPrimitiveType() || type->isIntegerTy())

21
ispc.h
View File

@@ -40,8 +40,8 @@
#define ISPC_VERSION "1.5.1dev" #define ISPC_VERSION "1.5.1dev"
#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)
#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" #error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported"
#endif #endif
#if defined(_WIN32) || defined(_WIN64) #if defined(_WIN32) || defined(_WIN64)
@@ -214,9 +214,16 @@ public:
/** Convert ISA enum to string */ /** Convert ISA enum to string */
static const char *ISAToString(Target::ISA isa); static const char *ISAToString(Target::ISA isa);
/** Returns a string like "avx" encoding the target. */ /** Returns a string like "avx" encoding the target. Good for mangling. */
const char *GetISAString() const; const char *GetISAString() const;
/** Convert ISA enum to string */
static const char *ISAToTargetString(Target::ISA isa);
/** Returns a string like "avx1.1-i32x8" encoding the target.
This may be used for Target initialization. */
const char *GetISATargetString() const;
/** Returns the size of the given type */ /** Returns the size of the given type */
llvm::Value *SizeOf(llvm::Type *type, llvm::Value *SizeOf(llvm::Type *type,
llvm::BasicBlock *insertAtEnd); llvm::BasicBlock *insertAtEnd);
@@ -254,6 +261,8 @@ public:
int getNativeVectorWidth() const {return m_nativeVectorWidth;} int getNativeVectorWidth() const {return m_nativeVectorWidth;}
int getDataTypeWidth() const {return m_dataTypeWidth;}
int getVectorWidth() const {return m_vectorWidth;} int getVectorWidth() const {return m_vectorWidth;}
bool getGeneratePIC() const {return m_generatePIC;} bool getGeneratePIC() const {return m_generatePIC;}
@@ -321,10 +330,14 @@ private:
#endif #endif
/** Native vector width of the vector instruction set. Note that this /** Native vector width of the vector instruction set. Note that this
value is directly derived from the ISA Being used (e.g. it's 4 for value is directly derived from the ISA being used (e.g. it's 4 for
SSE, 8 for AVX, etc.) */ SSE, 8 for AVX, etc.) */
int m_nativeVectorWidth; int m_nativeVectorWidth;
/** Data type with in bits. Typically it's 32, but could be 8, 16 or 64.
For generic it's -1, which means undefined. */
int m_dataTypeWidth;
/** Actual vector width currently being compiled to. This may be an /** Actual vector width currently being compiled to. This may be an
integer multiple of the native vector width, for example if we're integer multiple of the native vector width, for example if we're
"doubling up" and compiling 8-wide on a 4-wide SSE system. */ "doubling up" and compiling 8-wide on a 4-wide SSE system. */

View File

@@ -0,0 +1,51 @@
From 13c33dd2931ae9d9c5c9f142677f025281fbefca Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Fri, 1 Nov 2013 11:08:08 -0700
Subject: [PATCH] Fix PR17764
- %ret = select %mask, %v1, %v2 is equivalent to
%ret = %mask ? %v1 : %v2
but VPBLENDVB %mask, %v1, %v2, %ret (operands are in Intel assembly
order) is equivalent to
%ret = %mask ? %v2 : %v1
---
lib/Target/X86/X86InstrSSE.td | 2 +-
test/CodeGen/X86/pr17764.ll | 10 ++++++++++
2 files changed, 11 insertions(+), 1 deletion(-)
create mode 100644 test/CodeGen/X86/pr17764.ll
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 7cae485..bac88f9 100644
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -6965,7 +6965,7 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
(v32i8 VR256:$src2))),
- (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>;
+ (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
(imm:$mask))),
(VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
diff --git a/test/CodeGen/X86/pr17764.ll b/test/CodeGen/X86/pr17764.ll
new file mode 100644
index 0000000..7a3fd6d
--- /dev/null
+++ test/CodeGen/X86/pr17764.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s
+
+define <16 x i16> @foo(<16 x i1> %mask, <16 x i16> %x, <16 x i16> %y) {
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y
+ ret <16 x i16> %ret
+}
+
+; CHECK: foo
+; CHECK: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; CHECK: ret
--
1.8.1.2

View File

@@ -70,6 +70,8 @@ lPrintVersion() {
"3.3" "3.3"
#elif defined(LLVM_3_4) #elif defined(LLVM_3_4)
"3.4" "3.4"
#elif defined(LLVM_3_5)
"3.5"
#else #else
#error "Unhandled LLVM version" #error "Unhandled LLVM version"
#endif #endif
@@ -164,7 +166,7 @@ devUsage(int ret) {
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n"); printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n"); printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n");
printf(" [--debug-phase=<value>]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n"); printf(" [--debug-phase=<value>]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n");
#ifdef LLVM_3_4 #if defined(LLVM_3_4) || defined(LLVM_3_5)
printf(" [--debug-ir=<value>]\t\tSet optimization phase to generate debugIR after it\n"); printf(" [--debug-ir=<value>]\t\tSet optimization phase to generate debugIR after it\n");
#endif #endif
printf(" [--off-phase=<value>]\t\tSwitch off optimization phases. --off-phase=first,210:220,300,305,310:last\n"); printf(" [--off-phase=<value>]\t\tSwitch off optimization phases. --off-phase=first,210:220,300,305,310:last\n");
@@ -556,7 +558,7 @@ int main(int Argc, char *Argv[]) {
"away or introduce the new ones.\n"); "away or introduce the new ones.\n");
g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase=")); g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase="));
} }
#ifdef LLVM_3_4 #if defined(LLVM_3_4) || defined(LLVM_3_5)
else if (strncmp(argv[i], "--debug-ir=", 11) == 0) { else if (strncmp(argv[i], "--debug-ir=", 11) == 0) {
g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir=")); g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir="));
} }

View File

@@ -2601,7 +2601,7 @@ Module::CompileAndOutput(const char *srcFile,
int i = 0; int i = 0;
const char *firstISA; const char *firstISA;
while (i < Target::NUM_ISAS && firstTargetMachine == NULL) { while (i < Target::NUM_ISAS && firstTargetMachine == NULL) {
firstISA = Target::ISAToString((Target::ISA) i); firstISA = Target::ISAToTargetString((Target::ISA) i);
firstTargetMachine = targetMachines[i++]; firstTargetMachine = targetMachines[i++];
} }
Assert(firstTargetMachine != NULL); Assert(firstTargetMachine != NULL);

View File

@@ -63,7 +63,7 @@
#include <llvm/IR/BasicBlock.h> #include <llvm/IR/BasicBlock.h>
#include <llvm/IR/Constants.h> #include <llvm/IR/Constants.h>
#endif #endif
#if defined (LLVM_3_4) #if defined (LLVM_3_4) || defined(LLVM_3_5)
#include <llvm/Transforms/Instrumentation.h> #include <llvm/Transforms/Instrumentation.h>
#endif #endif
#include <llvm/PassManager.h> #include <llvm/PassManager.h>
@@ -441,7 +441,7 @@ DebugPassManager::add(llvm::Pass * P, int stage = -1) {
number, P->getPassName()); number, P->getPassName());
PM.add(CreateDebugPass(buf)); PM.add(CreateDebugPass(buf));
} }
#ifdef LLVM_3_4 #if defined(LLVM_3_4) || defined(LLVM_3_5)
if (g->debugIR == number) { if (g->debugIR == number) {
// adding generating of LLVM IR debug after optimization // adding generating of LLVM IR debug after optimization
char buf[100]; char buf[100];

158
perf.py
View File

@@ -177,7 +177,10 @@ def geomean(par):
l = len(par) l = len(par)
for i in range(l): for i in range(l):
temp = temp * par[i] temp = temp * par[i]
temp = temp ** (1.0/l) if l != 0:
temp = temp ** (1.0/l)
else:
temp = 0
return round(temp, 2) return round(temp, 2)
#takes an answer struct and print it. #takes an answer struct and print it.
@@ -189,18 +192,30 @@ def geomean(par):
#test[4] - list of absolute results with tasks #test[4] - list of absolute results with tasks
#test[5] - list of absolute time without ISPC (serial) #test[5] - list of absolute time without ISPC (serial)
#test[1..4] may be empty #test[1..4] may be empty
def print_answer(answer): def print_answer(answer, target_number):
filelist = [] filelist = []
print_debug("--------------------------------------------------------------------------\n", s, perf_log) print_debug("--------------------------------------------------------------------------\n", s, perf_log)
print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " +
" ISPC time: ISPC + tasks time: serial:\n", s, perf_log) " ISPC time: ISPC + tasks time: serial:\n", s, perf_log)
filelist.append("test name,ISPC speedup,diff," + if target_number > 1:
"ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") if options.output == "":
options.output = "targets.csv"
filelist.append("test name,ISPC speedup" + "," * target_number + "ISPC + tasks speedup\n")
filelist.append("," + options.perf_target + "," + options.perf_target + "\n")
else:
filelist.append("test name,ISPC speedup,diff," +
"ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n")
max_t = [0,0,0,0,0] max_t = [0,0,0,0,0]
diff_t = [0,0,0,0,0] diff_t = [0,0,0,0,0]
geomean_t = [0,0,0,0,0] geomean_t = []
list_of_max = [[],[],[],[],[]] list_of_max = []
for i1 in range(target_number):
geomean_t.append([0,0,0,0,0])
list_of_max.append([[],[],[],[],[]])
list_of_compare = [[],[],[],[],[],[]] list_of_compare = [[],[],[],[],[],[]]
target_k = 0
temp_str_1 = ""
temp_str_2 = ""
for i in range(len(answer)): for i in range(len(answer)):
list_of_compare[0].append(answer[i][0]) list_of_compare[0].append(answer[i][0])
for t in range(1,6): for t in range(1,6):
@@ -215,7 +230,7 @@ def print_answer(answer):
mm = min(answer[i][t]) mm = min(answer[i][t])
list_of_compare[t].append(mm) list_of_compare[t].append(mm)
max_t[t-1] = '%.2f' % mm max_t[t-1] = '%.2f' % mm
list_of_max[t-1].append(mm) list_of_max[i % target_number][t-1].append(mm)
diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t]))
print_debug("%s:\n" % answer[i][0], s, perf_log) print_debug("%s:\n" % answer[i][0], s, perf_log)
print_debug("\t\tmax:\t%5s\t\t%10s\t|min:%10s\t%10s\t%10s\n" % print_debug("\t\tmax:\t%5s\t\t%10s\t|min:%10s\t%10s\t%10s\n" %
@@ -227,17 +242,37 @@ def print_answer(answer):
max_t[t] = "" max_t[t] = ""
if diff_t[t] == "n/a": if diff_t[t] == "n/a":
diff_t[t] = "" diff_t[t] = ""
filelist.append(answer[i][0] + "," + if target_number > 1:
if target_k == 0:
temp_str_1 = answer[i][0] + ","
temp_str_2 = ""
temp_str_1 += max_t[0] + ","
temp_str_2 += max_t[1] + ","
target_k = target_k + 1
if target_k == target_number:
filelist.append(temp_str_1 + temp_str_2[:-1] + "\n")
target_k = 0
else:
filelist.append(answer[i][0] + "," +
max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," +
max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," +
max_t[4] + "," + diff_t[4] + "\n") max_t[4] + "," + diff_t[4] + "\n")
for i in range(0,5): for i in range(0,5):
geomean_t[i] = geomean(list_of_max[i]) for i1 in range(target_number):
geomean_t[i1][i] = geomean(list_of_max[i1][i])
print_debug("---------------------------------------------------------------------------------\n", s, perf_log) print_debug("---------------------------------------------------------------------------------\n", s, perf_log)
print_debug("Geomean:\t\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % print_debug("Geomean:\t\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" %
(geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) (geomean_t[0][0], geomean_t[0][1], geomean_t[0][2], geomean_t[0][3], geomean_t[0][4]), s, perf_log)
filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) if target_number > 1:
+ ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") temp_str_1 = "Geomean,"
temp_str_2 = ""
for i in range(target_number):
temp_str_1 += str(geomean_t[i][0]) + ","
temp_str_2 += str(geomean_t[i][1]) + ","
filelist.append(temp_str_1 + temp_str_2[:-1] + "\n")
else:
filelist.append("Geomean," + str(geomean_t[0][0]) + ",," + str(geomean_t[0][1])
+ ",," + str(geomean_t[0][2]) + ",," + str(geomean_t[0][3]) + ",," + str(geomean_t[0][4]) + "\n")
print_file(filelist) print_file(filelist)
return list_of_compare return list_of_compare
@@ -297,6 +332,15 @@ def perf(options1, args):
if is_windows: if is_windows:
pwd1 = "..\\..\\" pwd1 = "..\\..\\"
if options.perf_target != "":
test_only_r = " sse2-i32x4 sse2-i32x8 sse4-i32x4 sse4-i32x8 sse4-i16x8 \
sse4-i8x16 avx1-i32x8 avx1-i32x16 avx1-i64x4 avx1.1-i32x8 \
avx1.1-i32x16 avx1.1-i64x4 avx2-i32x8 avx2-i32x16 avx2-i64x4 "
test_only = options.perf_target.split(" ")
for iterator in test_only:
if not (" " + iterator + " " in test_only_r):
error("unknow option for target: " + iterator, 1)
# check if cpu usage is low now # check if cpu usage is low now
cpu_percent = cpu_check() cpu_percent = cpu_check()
if cpu_percent > 20: if cpu_percent > 20:
@@ -391,6 +435,10 @@ def perf(options1, args):
# end of preparations # end of preparations
print_debug("Okey go go go!\n\n", s, perf_log) print_debug("Okey go go go!\n\n", s, perf_log)
# report command line
if __name__ == "__main__":
print_debug("Command line: %s\n" % " ".join(map(str, sys.argv)), s, perf_log)
# report used ispc
print_debug("Testing ispc: " + ispc_test + "\n", s, perf_log) print_debug("Testing ispc: " + ispc_test + "\n", s, perf_log)
#print compilers versions #print compilers versions
@@ -405,8 +453,6 @@ def perf(options1, args):
while i < length-2: while i < length-2:
# we read name of test # we read name of test
print_debug("%s" % lines[i], s, perf_log) print_debug("%s" % lines[i], s, perf_log)
test = [lines[i][:-1],[],[],[],[],[]]
test_ref = [lines[i][:-1],[],[],[],[],[]]
# read location of test # read location of test
folder = lines[i+1] folder = lines[i+1]
folder = folder[:-1] folder = folder[:-1]
@@ -419,38 +465,52 @@ def perf(options1, args):
# read parameters of test # read parameters of test
command = lines[i+2] command = lines[i+2]
command = command[:-1] command = command[:-1]
if is_windows == False: # handle conditional target argument
ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" target_str_temp = ""
ex_command = "./test " + command + " >> " + perf_temp + "_test" perf_targets = [""]
bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+" >> "+build_log+" 2>> "+build_log target_number = 1
bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+" >> "+build_log+" 2>> "+build_log if options.perf_target != "":
re_command = "make clean >> "+build_log perf_targets = options.perf_target.split(',')
else: target_str_temp = " ISPC_IA_TARGETS="
ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" target_number = len(perf_targets)
ex_command = "x64\\Release1\\test.exe " + command + " >> " + perf_temp + "_test" temp = 0
bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /p:ISPC_compiler=ispc_ref /t:rebuild >> " + build_log for target_i in range(target_number):
bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc /t:rebuild >> " + build_log test = [lines[i][:-1],[],[],[],[],[]]
re_command = "msbuild /t:clean >> " + build_log test_ref = [lines[i][:-1],[],[],[],[],[]]
commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] target_str = target_str_temp + perf_targets[target_i]
# parsing config parameters if is_windows == False:
next_line = lines[i+3] ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref"
if next_line[0] == "!": # we should take only one part of test output ex_command = "./test " + command + " >> " + perf_temp + "_test"
R = next_line.split(' ') bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+target_str+" >> "+build_log+" 2>> "+build_log
c1 = int(R[1]) #c1 is a number of string which we want to use in test output bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+target_str+" >> "+build_log+" 2>> "+build_log
c2 = int(R[2]) #c2 is total number of strings in test output re_command = "make clean >> "+build_log
i = i+1 else:
else: ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref"
c1 = 1 ex_command = "x64\\Release1\\test.exe " + command + " >> " + perf_temp + "_test"
c2 = 1 bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /p:ISPC_compiler=ispc_ref /t:rebuild >> " + build_log
next_line = lines[i+3] bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc /t:rebuild >> " + build_log
if next_line[0] == "^": #we should concatenate result of this test with previous one re_command = "msbuild /t:clean >> " + build_log
run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False) commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command]
i = i+1 # parsing config parameters
else: #we run this test and append it's result to answer structure next_line = lines[i+3]
run_test(commands, c1, c2, test, test_ref, True) if next_line[0] == "!": # we should take only one part of test output
answer.append(test) R = next_line.split(' ')
answer_ref.append(test_ref) c1 = int(R[1]) #c1 is a number of string which we want to use in test output
c2 = int(R[2]) #c2 is total number of strings in test output
temp = 1
else:
c1 = 1
c2 = 1
next_line = lines[i+3]
if next_line[0] == "^":
temp = 1
if next_line[0] == "^" and target_number == 1: #we should concatenate result of this test with previous one
run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False)
else: #we run this test and append it's result to answer structure
run_test(commands, c1, c2, test, test_ref, True)
answer.append(test)
answer_ref.append(test_ref)
i = i + temp
# preparing next loop iteration # preparing next loop iteration
os.chdir(pwd1) os.chdir(pwd1)
i+=4 i+=4
@@ -460,8 +520,10 @@ def perf(options1, args):
common.remove_if_exists(perf_temp+"_ref") common.remove_if_exists(perf_temp+"_ref")
#print collected answer #print collected answer
if target_number > 1:
s = True
print_debug("\n\nTEST COMPILER:\n", s, perf_log) print_debug("\n\nTEST COMPILER:\n", s, perf_log)
A = print_answer(answer) A = print_answer(answer, target_number)
if options.ref != "": if options.ref != "":
print_debug("\n\nREFERENCE COMPILER:\n", s, perf_log) print_debug("\n\nREFERENCE COMPILER:\n", s, perf_log)
B = print_answer(answer_ref) B = print_answer(answer_ref)
@@ -503,5 +565,7 @@ if __name__ == "__main__":
help='set reference compiler for compare', default="") help='set reference compiler for compare', default="")
parser.add_option('-f', '--file', dest='in_file', parser.add_option('-f', '--file', dest='in_file',
help='file to save perf output', default="") help='file to save perf output', default="")
parser.add_option('-t', '--target', dest='perf_target',
help='set ispc target for building benchmarks (both test and ref)', default="")
(options, args) = parser.parse_args() (options, args) = parser.parse_args()
perf(options, args) perf(options, args)

View File

@@ -4,7 +4,8 @@ export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) { export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex]; float a = aFOO[programIndex];
RET[programIndex] = (exp(-log(1/a)) - a) < 1e-7 ? 1 : 0; // calculation error 1e-6 is the same as in icc
RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0;
} }
export void result(uniform float RET[4]) { export void result(uniform float RET[4]) {

View File

@@ -2879,7 +2879,7 @@ FunctionType::GetDIType(llvm::DIDescriptor scope) const {
for (int i = 0; i < GetNumParameters(); ++i) { for (int i = 0; i < GetNumParameters(); ++i) {
const Type *t = GetParameterType(i); const Type *t = GetParameterType(i);
if (t == NULL) if (t == NULL)
#if defined(LLVM_3_4) #if defined(LLVM_3_4) || defined(LLVM_3_5)
return llvm::DICompositeType(); return llvm::DICompositeType();
#else #else
return llvm::DIType(); return llvm::DIType();