diff --git a/Makefile b/Makefile
index 09ec302d..b5bb3472 100644
--- a/Makefile
+++ b/Makefile
@@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
     type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
     opt.h stmt.h sym.h type.h util.h
-TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+TARGETS=avxh avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
     sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
     generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 ifneq ($(ARM_ENABLED), 0)
diff --git a/builtins.cpp b/builtins.cpp
index 886eec15..63c90337 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -920,6 +920,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     }
     case Target::AVX: {
         switch (g->target->getVectorWidth()) {
+        case 4:
+            if (runtime32) {
+                EXPORT_MODULE(builtins_bitcode_avxh_32bit);
+            }
+            else {
+                EXPORT_MODULE(builtins_bitcode_avxh_64bit);
+            }
+            break;
         case 8:
             if (runtime32) {
                 EXPORT_MODULE(builtins_bitcode_avx1_32bit);
diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll
new file mode 100644
index 00000000..d56a63b9
--- /dev/null
+++ b/builtins/target-avx-h.ll
@@ -0,0 +1,554 @@
+;; Copyright (c) 2010-2012, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 4-wide definitions
+
+define(`WIDTH',`4')
+define(`MASK',`i32')
+include(`util.m4')
+
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; float iv = __rcp_v(v);
+  ; return iv * (2. - v * iv);
+
+  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
+  ; do one N-R iteration
+  %v_iv = fmul <4 x float> %0, %call
+  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
+  %iv_mul = fmul <4 x float> %call, %two_minus
+  ret <4 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+;; avx intrinsic
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
+  ret <4 x double> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  ; float is = __rsqrt_v(v);
+  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
+  ; Newton-Raphson iteration to improve precision
+  ; return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <4 x float> %v, %is
+  %v_is_is = fmul <4 x float> %v_is, %is
+  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <4 x float> %is, %three_sub
+  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <4 x float> %half_scale
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+;; avx intrinsic
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
+  ret <4 x double> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones twice with our 8-wide
+; vectors...
+
+;;declare <4 x double> @__svml_sin4(<4 x double>)
+;;declare <4 x double> @__svml_cos4(<4 x double>)
+;;declare void @__svml_sincos4(<4 x double>, <4 x double> *, <4 x double> *)
+;;declare <4 x double> @__svml_tan4(<4 x double>)
+;;declare <4 x double> @__svml_atan4(<4 x double>)
+;;declare <4 x double> @__svml_atan24(<4 x double>, <4 x double>)
+;;declare <4 x double> @__svml_exp4(<4 x double>)
+;;declare <4 x double> @__svml_log4(<4 x double>)
+;;declare <4 x double> @__svml_pow4(<4 x double>, <4 x double>)
+declare <4 x float> @__svml_sin(<4 x float>)
+declare <4 x float> @__svml_cos(<4 x float>)
+declare void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *)
+declare <4 x float> @__svml_tan(<4 x float>)
+declare <4 x float> @__svml_atan(<4 x float>)
+declare <4 x float> @__svml_atan2(<4 x float>, <4 x float>)
+declare <4 x float> @__svml_exp(<4 x float>)
+declare <4 x float> @__svml_log(<4 x float>)
+declare <4 x float> @__svml_pow(<4 x float>, <4 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+;; sse intrinsics
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+;; sse intrinsic
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
+}
+
+define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 15
+  ret i1 %cmp
+}
+
+define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
+  %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
+  %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
+  %scalar = extractelement <4 x float> %v2, i32 0
+  ret float %scalar
+}
+
+define float @__reduce_min_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__min_varying_float, @__min_uniform_float)
+}
+
+define float @__reduce_max_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__max_varying_float, @__max_uniform_float)
+}
+
+reduce_equal(4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int8 ops
+
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline
+{
+  ; widen to 16 x i8, with zeros in the upper lanes, then use psadbw
+  %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
+                                              <16 x i8> zeroinitializer)
+  %r0 = extractelement <2 x i64> %rv, i32 0
+  %r1 = extractelement <2 x i64> %rv, i32 1
+  %r = add i64 %r0, %r1
+  %r16 = trunc i64 %r to i16
+  ret i16 %r16
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int16 ops
+
+define internal <4 x i16> @__add_varying_i16(<4 x i16>,
+                                             <4 x i16>) nounwind readnone alwaysinline {
+  %r = add <4 x i16> %0, %1
+  ret <4 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
+  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+define <4 x i32> @__add_varying_int32(<4 x i32>,
+                                      <4 x i32>) nounwind readnone alwaysinline {
+  %s = add <4 x i32> %0, %1
+  ret <4 x i32> %s
+}
+
+define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  %s = add i32 %0, %1
+  ret i32 %s
+}
+
+define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
+  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
+                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1 = shufflevector <4 x double> <double 0., double 0., double 0., double 0.>,
+                      <4 x double> undef,
+                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;;  %v1 = <4 x double> <double 0., double 0., double 0., double 0.>
+  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
+  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
+  %final0 = extractelement <4 x double> %sum1, i32 0
+  %final1 = extractelement <4 x double> %sum1, i32 2
+  %sum = fadd double %final0, %final1
+
+  ret double %sum
+}
+
+define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
+  reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
+  reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define <4 x i64> @__add_varying_int64(<4 x i64>,
+                                      <4 x i64>) nounwind readnone alwaysinline {
+  %s = add <4 x i64> %0, %1
+  ret <4 x i64> %s
+}
+
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %s = add i64 %0, %1
+  ret i64 %s
+}
+
+define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+; no masked load instruction for i8 and i16 types??
+masked_load(i8, 1)
+masked_load(i16, 2)
+
+;; avx intrinsics
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+
+define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+  %floatmask = bitcast <4 x i32> %mask to <4 x float>
+  %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
+  %retval = bitcast <4 x float> %floatval to <4 x i32>
+  ret <4 x i32> %retval
+}
+
+define <4 x i64> @__masked_load_i64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+  ; double up masks, bitcast to doubles
+  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
+           <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+
+  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
+
+  %vald = shufflevector <4 x double> %val0d, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %val = bitcast <4 x double> %vald to <4 x i64>
+  ret <4 x i64> %val
+}
+
+masked_load_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+gen_masked_store(i8)
+gen_masked_store(i16)
+
+; note that mask is the 2nd parameter, not the 3rd one!!
+;; avx intrinsics
+declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
+                                <4 x i32>) nounwind alwaysinline {
+  %ptr = bitcast <4 x i32> * %0 to i8 *
+  %val = bitcast <4 x i32> %1 to <4 x float>
+  %mask = bitcast <4 x i32> %2 to <4 x float>
+  call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
+  ret void
+}
+
+define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
+                                <4 x i32> %mask) nounwind alwaysinline {
+  %ptr = bitcast <4 x i64> * %0 to i8 *
+  %val = bitcast <4 x i64> %1 to <4 x double>
+
+  ; double up the mask so each 64-bit lane gets a full 64-bit mask value
+  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
+           <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+
+  %val0 = shufflevector <4 x double> %val, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+  ret void
+}
+
+masked_store_blend_8_16_by_4()
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+                                             <4 x float>) nounwind readnone
+
+define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
+                                      <4 x i32> %mask) nounwind alwaysinline {
+  %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
+  %oldValue = load <4 x i32>* %0, align 4
+  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
+  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
+  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
+                                                     <4 x float> %newAsFloat,
+                                                     <4 x float> %mask_as_float)
+  %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
+  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
+  ret void
+}
+
+;; avx intrinsic
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+
+define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+                                      <4 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <4 x i64>* %ptr, align 8
+  %mask = bitcast <4 x i32> %i32mask to <4 x float>
+
+  ; Do the 4x64-bit blend as one <8 x float> blend, where the <8 x float> values
+  ; are actually bitcast <4 x i64> values
+  ;
+  ; set up the four 64-bit values
+  %old01 = bitcast <4 x i64> %oldValue to <4 x i64>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01 = bitcast <4 x i64> %new to <4 x i64>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
+            <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
+                                                            <8 x float> %new01f,
+                                                            <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  %final = bitcast <4 x i64> %result01 to <4 x i64>
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
+  ret void
+}
+
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; scatter
+
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
+  ret <4 x double> %call
+}
diff --git a/builtins/target-avxh.ll b/builtins/target-avxh.ll
new file mode 100644
index 00000000..98c9111d
--- /dev/null
+++ b/builtins/target-avxh.ll
@@ -0,0 +1,81 @@
+;; Copyright (c) 2010-2011, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
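+
+;; (Descriptive comment added during review.) This file is the wrapper for the
+;; 4-wide "avxh" target: it pulls in the shared 4-wide definitions from
+;; target-avx-h.ll above and then supplies the per-target pieces that follow:
+;; rdrand declarations, signed/unsigned int32 min/max via the SSE4.1
+;; pmin/pmax intrinsics, half<->float conversion declarations, and the
+;; factored gather implementations.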
+
+include(`target-avx-h.ll')
+
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+ifelse(NO_HALF_DECLARES, `1', `', `
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
diff --git a/ispc.cpp b/ispc.cpp
index 6d4b063d..02c23568 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -446,6 +446,15 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
     }
+    else if (!strcasecmp(isa, "avxh")) {
+        fprintf(stderr, " ISA is avxh \n");
+        this->m_isa = Target::AVX;
+        this->m_nativeVectorWidth = 4;
+        this->m_vectorWidth = 4;
+        this->m_attributes = "+avx,+popcnt,+cmov";
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 32;
+    }
     else if (!strcasecmp(isa, "avx-x2") ||
              !strcasecmp(isa, "avx1-x2") ||
              !strcasecmp(isa, "avx1-i32x16")) {
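
Reviewer note (not part of the patch): the ISA string matched in ispc.cpp is the literal "avxh", so once the Makefile builds the avxh bitcode, the new 4-wide AVX variant should be selectable through ispc's existing --target= option. The __rcp_varying_float and __rsqrt_varying_float definitions in target-avx-h.ll follow the formulas in their comments, one Newton-Raphson step applied to the hardware estimate. A minimal scalar C++ sketch of that refinement math, for reference only (function names here are illustrative, not part of ispc):

    // One N-R step on a reciprocal estimate iv ~= 1/v,
    // mirroring __rcp_varying_float: iv' = iv * (2 - v * iv)
    static inline float refine_rcp(float v, float iv) {
        return iv * (2.0f - v * iv);
    }

    // One N-R step on a reciprocal-square-root estimate is ~= 1/sqrt(v),
    // mirroring __rsqrt_varying_float: is' = 0.5 * is * (3 - (v * is) * is)
    static inline float refine_rsqrt(float v, float is) {
        return 0.5f * is * (3.0f - (v * is) * is);
    }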