[AVX512]: try gemeric-16 like builtins

This commit is contained in:
Anton Mitrokhin
2015-05-06 14:59:02 +03:00
parent d01718aa91
commit 7628f2a6c9
3 changed files with 320 additions and 717 deletions

View File

@@ -29,265 +29,366 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128";
;; AVX target implementation.
;;
;; Please note that this file uses SSE intrinsics, but LLVM generates AVX
;; instructions, so it doesn't makes sense to change this implemenation.
define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')
ctlztz() include(`util.m4')
define_prefetches()
define_shuffles()
aossoa()
stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats ;; broadcast/rotate/shuffle
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone declare <WIDTH x float> @__smear_float(float) nounwind readnone
declare <WIDTH x double> @__smear_double(double) nounwind readnone
declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
define float @__round_uniform_float(float) nounwind readonly alwaysinline { declare <WIDTH x float> @__setzero_float() nounwind readnone
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 declare <WIDTH x double> @__setzero_double() nounwind readnone
; the roundss intrinsic is a total mess--docs say: declare <WIDTH x i8> @__setzero_i8() nounwind readnone
; declare <WIDTH x i16> @__setzero_i16() nounwind readnone
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c) declare <WIDTH x i32> @__setzero_i32() nounwind readnone
; declare <WIDTH x i64> @__setzero_i64() nounwind readnone
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
; on b0. The higher order 96 bits are copied directly from input parameter a. The
; return value is described by the following equations:
;
; r0 = RND(b0)
; r1 = a1
; r2 = a2
; r3 = a3
;
; It doesn't matter what we pass as a, since we only need the r0 value
; here. So we pass the same register for both. Further, only the 0th
; element of the b parameter matters
%xi = insertelement <4 x float> undef, float %0, i32 0
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define float @__floor_uniform_float(float) nounwind readonly alwaysinline { declare <WIDTH x float> @__undef_float() nounwind readnone
; see above for round_ss instrinsic discussion... declare <WIDTH x double> @__undef_double() nounwind readnone
%xi = insertelement <4 x float> undef, float %0, i32 0 declare <WIDTH x i8> @__undef_i8() nounwind readnone
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 declare <WIDTH x i16> @__undef_i16() nounwind readnone
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) declare <WIDTH x i32> @__undef_i32() nounwind readnone
%rs = extractelement <4 x float> %xr, i32 0 declare <WIDTH x i64> @__undef_i64() nounwind readnone
ret float %rs
}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
; see above for round_ss instrinsic discussion... declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
%xi = insertelement <4 x float> undef, float %0, i32 0 declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 declare <WIDTH x i16> @__broadcast_i16(<WIDTH x i16>, i32) nounwind readnone
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10) declare <WIDTH x i32> @__broadcast_i32(<WIDTH x i32>, i32) nounwind readnone
%rs = extractelement <4 x float> %xr, i32 0 declare <WIDTH x i64> @__broadcast_i64(<WIDTH x i64>, i32) nounwind readnone
ret float %rs
} declare <WIDTH x i8> @__rotate_i8(<WIDTH x i8>, i32) nounwind readnone
declare <WIDTH x i16> @__rotate_i16(<WIDTH x i16>, i32) nounwind readnone
declare <WIDTH x float> @__rotate_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
declare <WIDTH x i8> @__shift_i8(<WIDTH x i8>, i32) nounwind readnone
declare <WIDTH x i16> @__shift_i16(<WIDTH x i16>, i32) nounwind readnone
declare <WIDTH x float> @__shift_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x i32> @__shift_i32(<WIDTH x i32>, i32) nounwind readnone
declare <WIDTH x double> @__shift_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i64> @__shift_i64(<WIDTH x i64>, i32) nounwind readnone
declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i16> @__shuffle_i16(<WIDTH x i16>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i16> @__shuffle2_i16(<WIDTH x i16>, <WIDTH x i16>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x float> @__shuffle_float(<WIDTH x float>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x float> @__shuffle2_float(<WIDTH x float>, <WIDTH x float>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__shuffle_i32(<WIDTH x i32>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__shuffle2_i32(<WIDTH x i32>, <WIDTH x i32>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x double> @__shuffle_double(<WIDTH x double>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x double> @__shuffle2_double(<WIDTH x double>,
<WIDTH x double>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__shuffle_i64(<WIDTH x i64>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__shuffle2_i64(<WIDTH x i64>, <WIDTH x i64>,
<WIDTH x i32>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles ;; aos/soa
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone declare void @__soa_to_aos3_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
<WIDTH x float> %v2, float * noalias %p) nounwind
define double @__round_uniform_double(double) nounwind readonly alwaysinline { declare void @__aos_to_soa3_float(float * noalias %p, <WIDTH x float> * %out0,
%xi = insertelement <2 x double> undef, double %0, i32 0 <WIDTH x float> * %out1, <WIDTH x float> * %out2) nounwind
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8) declare void @__soa_to_aos4_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
%rs = extractelement <2 x double> %xr, i32 0 <WIDTH x float> %v2, <WIDTH x float> %v3,
ret double %rs float * noalias %p) nounwind
} declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias %out0,
<WIDTH x float> * noalias %out1,
define double @__floor_uniform_double(double) nounwind readonly alwaysinline { <WIDTH x float> * noalias %out2,
; see above for round_ss instrinsic discussion... <WIDTH x float> * noalias %out3) nounwind
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; half conversion routines
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { declare i16 @__float_to_half_uniform(float %v) nounwind readnone
; do the rcpss call declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration to improve precision, as above
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt ;; math
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone declare void @__fastmath() nounwind
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { ;; round/floor/ceil
; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0
; Newton-Raphson iteration to improve precision declare float @__round_uniform_float(float) nounwind readnone
; return 0.5 * is * (3. - (v * is) * is); declare float @__floor_uniform_float(float) nounwind readnone
%v_is = fmul float %0, %is declare float @__ceil_uniform_float(float) nounwind readnone
%v_is_is = fmul float %v_is, %is
%three_sub = fsub float 3., %v_is_is
%is_mul = fmul float %is, %three_sub
%half_scale = fmul float 0.5, %is_mul
ret float %half_scale
}
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone
declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
;; min/max
declare float @__max_uniform_float(float, float) nounwind readnone
declare float @__min_uniform_float(float, float) nounwind readnone
declare i32 @__min_uniform_int32(i32, i32) nounwind readnone
declare i32 @__max_uniform_int32(i32, i32) nounwind readnone
declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone
declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone
declare i64 @__min_uniform_int64(i64, i64) nounwind readnone
declare i64 @__max_uniform_int64(i64, i64) nounwind readnone
declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone
declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone
declare double @__min_uniform_double(double, double) nounwind readnone
declare double @__max_uniform_double(double, double) nounwind readnone
declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone
declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone
;; sqrt/rsqrt/rcp
declare float @__rsqrt_uniform_float(float) nounwind readnone
declare float @__rcp_uniform_float(float) nounwind readnone
declare float @__sqrt_uniform_float(float) nounwind readnone
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
declare double @__sqrt_uniform_double(double) nounwind readnone
declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone
;; bit ops
declare i32 @__popcnt_int32(i32) nounwind readnone
declare i64 @__popcnt_int64(i64) nounwind readnone
declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...
;; svml
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt ;; reductions
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0) declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone
ret float %ret
} declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
declare i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone
declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone
declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt ;; unaligned loads/loads+broadcasts
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline { declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
ret double %ret declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
<WIDTH x i1> %mask) nounwind
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load PTR_OP_ARGS(`<WIDTH x i8> ') %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
store <WIDTH x i8> %v1, <WIDTH x i8> * %0
ret void
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
;; fast math mode <WIDTH x i1>) nounwind alwaysinline {
%v = load PTR_OP_ARGS(`<WIDTH x i16> ') %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
ret void
}
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind <WIDTH x i1>) nounwind alwaysinline {
%v = load PTR_OP_ARGS(`<WIDTH x i32> ') %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
ret void
}
define void @__fastmath() nounwind alwaysinline { define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
%ptr = alloca i32 <WIDTH x i1>) nounwind alwaysinline {
%ptr8 = bitcast i32 * %ptr to i8 * %v = load PTR_OP_ARGS(`<WIDTH x float> ') %0
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8) %v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
%oldval = load PTR_OP_ARGS(`i32 ') %ptr store <WIDTH x float> %v1, <WIDTH x float> * %0
ret void
}
; turn on DAZ (64)/FTZ (32768) -> 32832 define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
%update = or i32 %oldval, 32832 <WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
store i32 %update, i32 *%ptr %v = load PTR_OP_ARGS(`<WIDTH x i64> ') %0
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8) %v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void
}
define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
<WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
%v = load PTR_OP_ARGS(`<WIDTH x double> ') %0
%v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
store <WIDTH x double> %v1, <WIDTH x double> * %0
ret void ret void
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max ;; gather/scatter
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline { define(`gather_scatter', `
%cmp = fcmp ogt float %1, %0 declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
%ret = select i1 %cmp, float %1, float %0 <WIDTH x i1>) nounwind readonly
ret float %ret declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
} <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline { declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
%cmp = fcmp ogt float %1, %0 <WIDTH x $1>, <WIDTH x i1>) nounwind
%ret = select i1 %cmp, float %0, float %1 declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
ret float %ret <WIDTH x $1>, <WIDTH x i1>) nounwind
} declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; gather_scatter(i8)
;; double precision min/max gather_scatter(i16)
gather_scatter(i32)
gather_scatter(float)
gather_scatter(i64)
gather_scatter(double)
define double @__min_uniform_double(double, double) nounwind readnone alwaysinline { declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
%cmp = fcmp ogt double %1, %0 <WIDTH x i1>) nounwind
%ret = select i1 %cmp, double %0, double %1 declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
ret double %ret <WIDTH x i1>) nounwind
} declare i32 @__packed_store_active2(i32 * nocapture, <WIDTH x i32> %vals,
<WIDTH x i1>) nounwind
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
%cmp = fcmp ogt double %1, %0
%ret = select i1 %cmp, double %1, double %0
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%cmp = icmp sgt i32 %1, %0
%ret = select i1 %cmp, i32 %0, i32 %1
ret i32 %ret
}
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%cmp = icmp sgt i32 %1, %0
%ret = select i1 %cmp, i32 %1, i32 %0
ret i32 %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max ;; prefetch
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%cmp = icmp ugt i32 %1, %0
%ret = select i1 %cmp, i32 %0, i32 %1
ret i32 %ret
}
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%cmp = icmp ugt i32 %1, %0
%ret = select i1 %cmp, i32 %1, i32 %0
ret i32 %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
declare void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins ;; int8/int16 builtins
define_avgs() define_avgs()
declare_nvptx() declare_nvptx()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()
transcendetals_decl()
trigonometry_decl()

View File

@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2013, Intel Corporation ;; Copyright (c) 2010-2014, Intel Corporation
;; All rights reserved. ;; All rights reserved.
;; ;;
;; Redistribution and use in source and binary forms, with or without ;; Redistribution and use in source and binary forms, with or without
@@ -29,510 +29,6 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`HAVE_GATHER', `1') define(`WIDTH',`16')
include(`target-avx512-common.ll')
include(`target-avx-x2.ll') saturation_arithmetic_novec()
rdrand_definition()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %r
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
declare void @llvm.trap() noreturn nounwind
; $1: type
; $2: var base name
define(`extract_4s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')
; $1: type
; $2: var base name
define(`extract_8s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
define(`assemble_8s', `
%$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
; $5: v3
; $6: v4
define(`assemble_4s', `
%$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
assemble_8s($1, $2, $2_1, $2_2)
')
gen_gather(i8)
gen_gather(i16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_8s(i32, offsets)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_8s(i32, ptrs)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(i32, offsets)
extract_8s(float, mask)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
define <16 x float> @__gather32_float(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(float, mask)
extract_8s(i32, ptrs)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather64_float(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <16 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather32_double(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather64_double(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}

View File

@@ -1126,7 +1126,9 @@ Target::SupportedTargets() {
"avx2-i32x8, avx2-i32x16, avx2-i64x4, " "avx2-i32x8, avx2-i32x16, avx2-i64x4, "
"generic-x1, generic-x4, generic-x8, generic-x16, " "generic-x1, generic-x4, generic-x8, generic-x16, "
"generic-x32, generic-x64, *-generic-x16, " "generic-x32, generic-x64, *-generic-x16, "
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) && !defined(LLVM_3_6)// LLVM 3.7+
"knl-avx512" "knl-avx512"
#endif
#ifdef ISPC_ARM_ENABLED #ifdef ISPC_ARM_ENABLED
", neon-i8x16, neon-i16x8, neon-i32x4" ", neon-i8x16, neon-i16x8, neon-i32x4"
#endif #endif
@@ -1195,8 +1197,10 @@ Target::ISAToString(ISA isa) {
return "avx11"; return "avx11";
case Target::AVX2: case Target::AVX2:
return "avx2"; return "avx2";
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) && !defined(LLVM_3_6)// LLVM 3.7+
case Target::KNL_AVX512: case Target::KNL_AVX512:
return "knl-avx512"; return "knl-avx512";
#endif
case Target::SKX: case Target::SKX:
return "skx"; return "skx";
case Target::GENERIC: case Target::GENERIC:
@@ -1241,8 +1245,10 @@ Target::ISAToTargetString(ISA isa) {
return "avx1.1-i32x8"; return "avx1.1-i32x8";
case Target::AVX2: case Target::AVX2:
return "avx2-i32x8"; return "avx2-i32x8";
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) && !defined(LLVM_3_6)// LLVM 3.7+
case Target::KNL_AVX512: case Target::KNL_AVX512:
return "knl-avx512"; return "knl-avx512";
#endif
case Target::SKX: case Target::SKX:
return "avx2"; return "avx2";
case Target::GENERIC: case Target::GENERIC: