;;  Copyright (c) 2010-2015, Intel Corporation
;;  All rights reserved.
;;
;;  Redistribution and use in source and binary forms, with or without
;;  modification, are permitted provided that the following conditions are
;;  met:
;;
;;    * Redistributions of source code must retain the above copyright
;;      notice, this list of conditions and the following disclaimer.
;;
;;    * Redistributions in binary form must reproduce the above copyright
;;      notice, this list of conditions and the following disclaimer in the
;;      documentation and/or other materials provided with the distribution.
;;
;;    * Neither the name of Intel Corporation nor the names of its
;;      contributors may be used to endorse or promote products derived from
;;      this software without specific prior written permission.
;;
;;
;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;;   PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')

include(`util.m4')

stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_definition()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle

define_shuffles()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; aos/soa

aossoa()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
  %r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
  %r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
  %r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %r
}

define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
  %r_0 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
  %r_1 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
  %r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %r
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                      i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                      i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

declare void @__fastmath() nounwind

;; round/floor/ceil

declare float @__round_uniform_float(float) nounwind readnone
declare float @__floor_uniform_float(float) nounwind readnone
declare float @__ceil_uniform_float(float) nounwind readnone

declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone

declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone

declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone

;; min/max

int64minmax()

declare float @__max_uniform_float(float, float) nounwind readnone
declare float @__min_uniform_float(float, float) nounwind readnone
declare i32 @__min_uniform_int32(i32, i32) nounwind readnone
declare i32 @__max_uniform_int32(i32, i32) nounwind readnone
declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone
declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone
declare double @__min_uniform_double(double, double) nounwind readnone
declare double @__max_uniform_double(double, double) nounwind readnone

declare <WIDTH x float> @__max_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__min_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x double> @__min_varying_double(<WIDTH x double>, <WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__max_varying_double(<WIDTH x double>, <WIDTH x double>) nounwind readnone

;; sqrt/rsqrt/rcp

declare float @__rsqrt_uniform_float(float) nounwind readnone
declare float @__rcp_uniform_float(float) nounwind readnone
declare float @__sqrt_uniform_float(float) nounwind readnone

declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone

declare double @__sqrt_uniform_double(double) nounwind readnone
declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone

;; bit ops

declare i32 @__popcnt_int32(i32) nounwind readnone
declare i64 @__popcnt_int64(i64) nounwind readnone

ctlztz()

; FIXME: need either to wire these up to the 16-wide SVML entrypoints,
; or use the macro to call the 8-wide ones twice with our 16-wide
; vectors...
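; As a hedged illustration of the second option above (kept as comments only,
; so nothing here is compiled in), a 16-wide wrapper could split the vector
; into two 8-wide halves, call an 8-wide SVML routine on each half, and stitch
; the results back together.  The @__svml_sinf8 entry point and the wrapper
; name below are assumptions for the sake of the sketch, not symbols this
; file already defines:
;
;   declare <8 x float> @__svml_sinf8(<8 x float>) nounwind readnone
;
;   define <16 x float> @__svml_sinf(<16 x float>) nounwind readnone alwaysinline {
;     ; lower and upper 8-wide halves of the 16-wide input
;     %lo = shufflevector <16 x float> %0, <16 x float> undef,
;             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;     %hi = shufflevector <16 x float> %0, <16 x float> undef,
;             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
;     %rlo = call <8 x float> @__svml_sinf8(<8 x float> %lo)
;     %rhi = call <8 x float> @__svml_sinf8(<8 x float> %hi)
;     ; concatenate the two 8-wide results into a 16-wide result
;     %r = shufflevector <8 x float> %rlo, <8 x float> %rhi,
;             <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
;                         i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
;     ret <16 x float> %r
;   }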
;; svml

include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions

define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %intmask = bitcast <WIDTH x MASK> %0 to i16
  %res = zext i16 %intmask to i64
  ret i64 %res
}

define i1 @__any(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %intmask = bitcast <WIDTH x MASK> %0 to i16
  %res = icmp ne i16 %intmask, 0
  ret i1 %res
}

define i1 @__all(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %intmask = bitcast <WIDTH x MASK> %0 to i16
  %res = icmp eq i16 %intmask, 65535
  ret i1 %res
}

define i1 @__none(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %intmask = bitcast <WIDTH x MASK> %0 to i16
  %res = icmp eq i16 %intmask, 0
  ret i1 %res
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8/16 ops

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
  ; psadbw against zero sums each group of eight bytes into a 64-bit lane
  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
                                              <16 x i8> zeroinitializer)
  %r0 = extractelement <2 x i64> %rv, i32 0
  %r1 = extractelement <2 x i64> %rv, i32 1
  %r = add i64 %r0, %r1
  %r16 = trunc i64 %r to i16
  ret i16 %r16
}

define internal <16 x i16> @__add_varying_i16(<16 x i16>,
                                              <16 x i16>) nounwind readnone alwaysinline {
  %r = add <16 x i16> %0, %1
  ret <16 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
  %r = add i16 %0, %1
  ret i16 %r
}

define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
  reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
  %va = shufflevector <16 x float> %0, <16 x float> undef,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vb = shufflevector <16 x float> %0, <16 x float> undef,
         <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %va, <8 x float> %vb)
  %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
  %v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
  %scalar1 = extractelement <8 x float> %v3, i32 0
  %scalar2 = extractelement <8 x float> %v3, i32 4
  %sum = fadd float %scalar1, %scalar2
  ret float %sum
}

define float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
  reduce16(float, @__min_varying_float, @__min_uniform_float)
}

define float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
  reduce16(float, @__max_varying_float, @__max_uniform_float)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops

define internal <16 x i32> @__add_varying_int32(<16 x i32>,
                                                <16 x i32>) nounwind readnone alwaysinline {
  %s = add <16 x i32> %0, %1
  ret <16 x i32> %s
}

define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %s = add i32 %0, %1
  ret i32 %s
}

define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
}

define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
}

define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops

declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
  %va = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %vb = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vc = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %vd = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %vab = fadd <4 x double> %va, %vb
  %vcd = fadd <4 x double> %vc, %vd

  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
  %final0 = extractelement <4 x double> %sum1, i32 0
  %final1 = extractelement <4 x double> %sum1, i32 2
  %sum = fadd double %final0, %final1
  ret double %sum
}

define double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
  reduce16(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
  reduce16(double, @__max_varying_double, @__max_uniform_double)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops

define internal <16 x i64> @__add_varying_int64(<16 x i64>,
                                                <16 x i64>) nounwind readnone alwaysinline {
  %s = add <16 x i64> %0, %1
  ret <16 x i64> %s
}

define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %s = add i64 %0, %1
  ret i64 %s
}

define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}

define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops

define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

masked_load(i8,  1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(i64, 8)

masked_load_float_double()

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)

define void @__masked_store_float(<WIDTH x float> * nocapture, <WIDTH x float>,
                                  <WIDTH x MASK>) nounwind alwaysinline {
  %ptr = bitcast <WIDTH x float> * %0 to <WIDTH x i32> *
  %val = bitcast <WIDTH x float> %1 to <WIDTH x i32>
  call void @__masked_store_i32(<WIDTH x i32> * %ptr, <WIDTH x i32> %val, <WIDTH x MASK> %2)
  ret void
}

define void @__masked_store_double(<WIDTH x double> * nocapture, <WIDTH x double>,
                                   <WIDTH x MASK>) nounwind alwaysinline {
  %ptr = bitcast <WIDTH x double> * %0 to <WIDTH x i64> *
  %val = bitcast <WIDTH x double> %1 to <WIDTH x i64>
  call void @__masked_store_i64(<WIDTH x i64> * %ptr, <WIDTH x i64> %val, <WIDTH x MASK> %2)
  ret void
}

define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
                                     <WIDTH x MASK>) nounwind alwaysinline {
  %v = load PTR_OP_ARGS(`<WIDTH x i8> ')  %0
  %v1 = select <WIDTH x MASK> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
  store <WIDTH x i8> %v1, <WIDTH x i8> * %0
  ret void
}
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
                                      <WIDTH x MASK>) nounwind alwaysinline {
  %v = load PTR_OP_ARGS(`<WIDTH x i16> ')  %0
  %v1 = select <WIDTH x MASK> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
  store <WIDTH x i16> %v1, <WIDTH x i16> * %0
  ret void
}

define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
                                      <WIDTH x MASK>) nounwind alwaysinline {
  %v = load PTR_OP_ARGS(`<WIDTH x i32> ')  %0
  %v1 = select <WIDTH x MASK> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
  store <WIDTH x i32> %v1, <WIDTH x i32> * %0
  ret void
}

define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
                                        <WIDTH x MASK>) nounwind alwaysinline {
  %v = load PTR_OP_ARGS(`<WIDTH x float> ')  %0
  %v1 = select <WIDTH x MASK> %2, <WIDTH x float> %1, <WIDTH x float> %v
  store <WIDTH x float> %v1, <WIDTH x float> * %0
  ret void
}

define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
                                      <WIDTH x MASK>) nounwind alwaysinline {
  %v = load PTR_OP_ARGS(`<WIDTH x i64> ')  %0
  %v1 = select <WIDTH x MASK> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
  ret void
}

define void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
                                         <WIDTH x MASK>) nounwind alwaysinline {
  %v = load PTR_OP_ARGS(`<WIDTH x double> ')  %0
  %v1 = select <WIDTH x MASK> %2, <WIDTH x double> %1, <WIDTH x double> %v
  store <WIDTH x double> %v1, <WIDTH x double> * %0
  ret void
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
                                                 <WIDTH x MASK>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
                                                 <WIDTH x MASK>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly

declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
                                          <WIDTH x $1>, <WIDTH x MASK>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
                                          <WIDTH x $1>, <WIDTH x MASK>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>, <WIDTH x MASK>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>, <WIDTH x MASK>) nounwind
')

gather_scatter(i8)
gather_scatter(i16)
gather_scatter(i32)
gather_scatter(float)
gather_scatter(i64)
gather_scatter(double)

packed_load_and_store()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch

define_prefetches()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()

declare_nvptx()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()