From 9c79d4d182ca14072583128e6b59d48a80b93102 Mon Sep 17 00:00:00 2001
From: egaburov
Date: Wed, 11 Sep 2013 12:58:02 +0200
Subject: [PATCH 01/14] added avxh with vectorWidth=4 support, use
 --target=avxh to enable it

---
 Makefile                 |   2 +-
 builtins.cpp             |   8 +
 builtins/target-avx-h.ll | 554 +++++++++++++++++++++++++++++++++++++++
 builtins/target-avxh.ll  |  81 ++++++
 ispc.cpp                 |   9 +
 5 files changed, 653 insertions(+), 1 deletion(-)
 create mode 100644 builtins/target-avx-h.ll
 create mode 100644 builtins/target-avxh.ll

diff --git a/Makefile b/Makefile
index 09ec302d..b5bb3472 100644
--- a/Makefile
+++ b/Makefile
@@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+TARGETS=avxh avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
 	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
 	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 ifneq ($(ARM_ENABLED), 0)
diff --git a/builtins.cpp b/builtins.cpp
index 886eec15..63c90337 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -920,6 +920,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     }
     case Target::AVX: {
         switch (g->target->getVectorWidth()) {
+        case 4:
+            if (runtime32) {
+                EXPORT_MODULE(builtins_bitcode_avxh_32bit);
+            }
+            else {
+                EXPORT_MODULE(builtins_bitcode_avxh_64bit);
+            }
+            break;
         case 8:
             if (runtime32) {
                 EXPORT_MODULE(builtins_bitcode_avx1_32bit);
diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll
new file mode 100644
index 00000000..d56a63b9
--- /dev/null
+++ b/builtins/target-avx-h.ll
@@ -0,0 +1,554 @@
+;; Copyright (c) 2010-2012, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;     * Redistributions of source code must retain the above copyright
+;;       notice, this list of conditions and the following disclaimer.
+;;
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;
+;;     * Neither the name of Intel Corporation nor the names of its
+;;       contributors may be used to endorse or promote products derived from
+;;       this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
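+
+;; A note on the overall strategy, as suggested by the intrinsics used
+;; below: this 4-wide target mixes instruction sets.  Plain 128-bit SSE
+;; operations handle the <4 x float> / <4 x i32> cases, while the 256-bit
+;; AVX (.256) intrinsics handle <4 x double> / <4 x i64>.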
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 4-wide definitions
+
+define(`WIDTH',`4')
+define(`MASK',`i32')
+include(`util.m4')
+
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; float iv = __rcp_v(v);
+  ; return iv * (2. - v * iv);
+
+  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
+  ; do one N-R iteration
+  %v_iv = fmul <4 x float> %0, %call
+  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
+  %iv_mul = fmul <4 x float> %call, %two_minus
+  ret <4 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+;; avx intrinsic
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
+  ret <4 x double> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  ; float is = __rsqrt_v(v);
+  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
+  ; Newton-Raphson iteration to improve precision
+  ; return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <4 x float> %v, %is
+  %v_is_is = fmul <4 x float> %v_is, %is
+  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <4 x float> %is, %three_sub
+  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <4 x float> %half_scale
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+;; avx intrinsic
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
+  ret <4 x double> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones twice with our 8-wide
+; vectors...
+
+;;declare <4 x double> @__svml_sin4(<4 x double>)
+;;declare <4 x double> @__svml_cos4(<4 x double>)
+;;declare void @__svml_sincos4(<4 x double>, <4 x double> *, <4 x double> *)
+;;declare <4 x double> @__svml_tan4(<4 x double>)
+;;declare <4 x double> @__svml_atan4(<4 x double>)
+;;declare <4 x double> @__svml_atan24(<4 x double>, <4 x double>)
+;;declare <4 x double> @__svml_exp4(<4 x double>)
+;;declare <4 x double> @__svml_log4(<4 x double>)
+;;declare <4 x double> @__svml_pow4(<4 x double>, <4 x double>)
+declare <4 x float> @__svml_sin(<4 x float>)
+declare <4 x float> @__svml_cos(<4 x float>)
+declare void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *)
+declare <4 x float> @__svml_tan(<4 x float>)
+declare <4 x float> @__svml_atan(<4 x float>)
+declare <4 x float> @__svml_atan2(<4 x float>, <4 x float>)
+declare <4 x float> @__svml_exp(<4 x float>)
+declare <4 x float> @__svml_log(<4 x float>)
+declare <4 x float> @__svml_pow(<4 x float>, <4 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+;; sse intrinsics
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+;; sse intrinsic
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
+}
+
+define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 15
+  ret i1 %cmp
+}
+
+define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
+  %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
+  %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
+  %scalar = extractelement <4 x float> %v2, i32 0
+  ret float %scalar
+}
+
+define float @__reduce_min_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__min_varying_float, @__min_uniform_float)
+}
+
+define float @__reduce_max_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__max_varying_float, @__max_uniform_float)
+}
+
+reduce_equal(4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int8 ops
+
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline
+{
+  ; widen to 16 x i8; indices 4..7 select lanes of the zero vector, so the
+  ; upper twelve elements are zero padding
+  %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
+                                              <16 x i8> zeroinitializer)
+  %r0 = extractelement <2 x i64> %rv, i32 0
+  %r1 = extractelement <2 x i64> %rv, i32 1
+  %r = add i64 %r0, %r1
+  %r16 = trunc i64 %r to i16
+  ret i16 %r16
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int16 ops
+
+define internal <4 x i16> @__add_varying_i16(<4 x i16>,
+                                             <4 x i16>) nounwind readnone alwaysinline {
+  %r = add <4 x i16> %0, %1
+  ret <4 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
+  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+define <4 x i32> @__add_varying_int32(<4 x i32>,
+                                      <4 x i32>) nounwind readnone alwaysinline {
+  %s = add <4 x i32> %0, %1
+  ret <4 x i32> %s
+}
+
+define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  %s = add i32 %0, %1
+  ret i32 %s
+}
+
+define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
+  ; hadd.pd.256 with a zero vector gives [a0+a1, 0, a2+a3, 0]; the second
+  ; hadd then leaves a0+a1 in lane 0 and a2+a3 in lane 2
+  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1 = shufflevector <4 x double> <double 0., double 0., double 0., double 0.>,
+          <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;;  %v1 = <4 x double> zeroinitializer
+  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
+  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
+  %final0 = extractelement <4 x double> %sum1, i32 0
+  %final1 = extractelement <4 x double> %sum1, i32 2
+  %sum = fadd double %final0, %final1
+
+  ret double %sum
+}
+
+define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
+  reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
+  reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define <4 x i64> @__add_varying_int64(<4 x i64>,
+                                      <4 x i64>) nounwind readnone alwaysinline {
+  %s = add <4 x i64> %0, %1
+  ret <4 x i64> %s
+}
+
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %s = add i64 %0, %1
+  ret i64 %s
+}
+
+define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+; no masked load instruction for i8 and i16 types??
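+;; (AVX's vmaskmov instructions only exist for 32- and 64-bit elements,
+;; so the i8/i16 cases below fall back on the generic masked_load()
+;; macro from util.m4 rather than a hardware masked load.)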
+masked_load(i8, 1)
+masked_load(i16, 2)
+
+;; avx intrinsics
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+
+define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+  %floatmask = bitcast <4 x i32> %mask to <4 x float>
+  %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
+  %retval = bitcast <4 x float> %floatval to <4 x i32>
+  ret <4 x i32> %retval
+}
+
+define <4 x i64> @__masked_load_i64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+  ; double up masks, bitcast to doubles, so each i64 lane sees a full
+  ; 64-bit mask
+  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+
+  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
+
+  %vald = shufflevector <4 x double> %val0d, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %val = bitcast <4 x double> %vald to <4 x i64>
+  ret <4 x i64> %val
+}
+
+masked_load_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+gen_masked_store(i8)
+gen_masked_store(i16)
+
+; note that mask is the 2nd parameter, not the 3rd one!!
+;; avx intrinsics
+declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
+                                <4 x i32>) nounwind alwaysinline {
+  %ptr = bitcast <4 x i32> * %0 to i8 *
+  %val = bitcast <4 x i32> %1 to <4 x float>
+  %mask = bitcast <4 x i32> %2 to <4 x float>
+  call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
+  ret void
+}
+
+define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
+                                <4 x i32> %mask) nounwind alwaysinline {
+  %ptr = bitcast <4 x i64> * %0 to i8 *
+  %val = bitcast <4 x i64> %1 to <4 x double>
+
+  ; double up the mask lanes, as in __masked_load_i64 above
+  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+
+  %val0 = shufflevector <4 x double> %val, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+  ret void
+}
+
+masked_store_blend_8_16_by_4()
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+                                             <4 x float>) nounwind readnone
+
+define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
+                                      <4 x i32> %mask) nounwind alwaysinline {
+  %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
+  %oldValue = load <4 x i32>* %0, align 4
+  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
+  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
+  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
+                                                     <4 x float> %newAsFloat,
+                                                     <4 x float> %mask_as_float)
+  %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
+  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
+  ret void
+}
+
+;; avx intrinsic
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+
+define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+                                      <4 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <4 x i64>* %ptr, align 8
+  %mask = bitcast <4 x i32> %i32mask to <4 x float>
+
+  ; Do the 4 x 64-bit blend as a single <8 x float> blend, where the
+  ; <8 x float> values are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01 = bitcast <4 x i64> %oldValue to <4 x i64>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01 = bitcast <4 x i64> %new to <4 x i64>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
+            <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
+                                                            <8 x float> %new01f,
+                                                            <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  %final = bitcast <4 x i64> %result01 to <4 x i64>
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
+  ret void
+}
+
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; scatter
+
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
+  ret <4 x double> %call
+}
diff --git a/builtins/target-avxh.ll b/builtins/target-avxh.ll
new file mode 100644
index 00000000..98c9111d
--- /dev/null
+++ b/builtins/target-avxh.ll
@@ -0,0 +1,81 @@
+;; Copyright (c) 2010-2011, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;     * Redistributions of source code must retain the above copyright
+;;       notice, this list of conditions and the following disclaimer.
+;;
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;
+;;     * Neither the name of Intel Corporation nor the names of its
+;;       contributors may be used to endorse or promote products derived from
+;;       this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
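+
+;; Top-level file for the "avxh" target.  It pulls in the shared 4-wide
+;; definitions from target-avx-h.ll above and adds the target-specific
+;; pieces: SSE4.1 integer min/max, the half-float conversion
+;; declarations, and factored gathers.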
+
+include(`target-avx-h.ll')
+
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+ifelse(NO_HALF_DECLARES, `1', `', `
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
diff --git a/ispc.cpp b/ispc.cpp
index 6d4b063d..02c23568 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -446,6 +446,15 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
     }
+    else if (!strcasecmp(isa, "avxh")) {
+        fprintf(stderr, " ISA is avxh \n");
+        this->m_isa = Target::AVX;
+        this->m_nativeVectorWidth = 4;
+        this->m_vectorWidth = 4;
+        this->m_attributes = "+avx,+popcnt,+cmov";
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 32;
+    }
     else if (!strcasecmp(isa, "avx-x2") ||
              !strcasecmp(isa, "avx1-x2") ||
              !strcasecmp(isa, "avx1-i32x16")) {

From 320c41ffcf223f4793c39c2f445ed0aed19d6270 Mon Sep 17 00:00:00 2001
From: egaburov
Date: Wed, 11 Sep 2013 15:16:50 +0200
Subject: [PATCH 02/14] added svml support. experimental. for some reason all
 symbols are visible..
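
With --math-lib=svml, the float math functions in stdlib.ispc are routed
to the __svml_* entry points generated from svml.m4, which in turn call
Intel's SVML (e.g. __svml_sinf4).  A minimal usage sketch (hypothetical
example, not part of this patch):

    // sines.ispc -- build: ispc --target=avxh --math-lib=svml sines.ispc -o sines.o
    export void sines(uniform float vin[], uniform float vout[], uniform int n) {
        foreach (i = 0 ... n) {
            vout[i] = sin(vin[i]);  // with svml selected this calls __svml_sinf()
        }
    }

The resulting object still has to be linked against the SVML library for
the __svml_* symbols to resolve.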
--- .gitignore | 4 ++ Makefile | 6 +- builtins.cpp | 13 ++++ builtins/target-avx-h.ll | 27 ++------ builtins/target-avx-x2.ll | 16 +---- builtins/target-avx.ll | 18 ++---- builtins/target-generic-1.ll | 45 +++++++++---- builtins/target-generic-common.ll | 16 ++--- builtins/target-neon-common.ll | 13 ++-- builtins/target-sse2-x2.ll | 36 +++++------ builtins/target-sse2.ll | 61 ++---------------- builtins/target-sse4-16.ll | 13 +--- builtins/target-sse4-8.ll | 12 +--- builtins/target-sse4-x2.ll | 36 +++++------ builtins/target-sse4.ll | 61 ++---------------- builtins/util.m4 | 6 ++ stdlib.ispc | 102 ++++++++++++++++++++++++------ 17 files changed, 216 insertions(+), 269 deletions(-) diff --git a/.gitignore b/.gitignore index 0469cf7d..3bec2ace 100644 --- a/.gitignore +++ b/.gitignore @@ -11,5 +11,9 @@ tests*/*run examples/*/*.png examples/*/*.ppm examples/*/objs/* +*.swp +.* +!.gitignore + diff --git a/Makefile b/Makefile index b5bb3472..43f41e09 100644 --- a/Makefile +++ b/Makefile @@ -246,15 +246,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ -objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ -objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ diff --git a/builtins.cpp b/builtins.cpp index 63c90337..139b8f04 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -582,7 +582,9 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_tan", "__stdlib_tanf", "__svml_sin", + "__svml_asin", "__svml_cos", + "__svml_acos", "__svml_sincos", "__svml_tan", "__svml_atan", @@ -590,6 +592,17 @@ lSetInternalFunctions(llvm::Module *module) { "__svml_exp", "__svml_log", "__svml_pow", + "__svml_sinf", + "__svml_asinf", + "__svml_cosf", + "__svml_acosf", + "__svml_sincosf", + "__svml_tanf", + "__svml_atanf", + "__svml_atan2f", + "__svml_expf", + "__svml_logf", + "__svml_powf", "__undef_uniform", "__undef_varying", "__vec4_add_float", diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll index d56a63b9..a06e5ab3 100644 --- a/builtins/target-avx-h.ll +++ b/builtins/target-avx-h.ll @@ -154,28 +154,11 @@ define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -;;declare <4 x double> @__svml_sin4(<4 x double>) -;;declare <4 x double> @__svml_cos4(<4 x double>) -;;declare void @__svml_sincos4(<4 x double>, <4 x double> *, <4 x double> *) -;;declare <4 x double> @__svml_tan4(<4 x double>) -;;declare <4 x double> @__svml_atan4(<4 x double>) -;;declare <4 x double> @__svml_atan24(<4 x double>, <4 x double>) -;;declare <4 x double> @__svml_exp4(<4 x double>) -;;declare <4 x double> @__svml_log4(<4 x double>) -;;declare <4 x double> @__svml_pow4(<4 x double>, <4 x double>) -declare <4 x float> @__svml_sin(<4 x float>) -declare <4 x float> @__svml_cos(<4 x float>) -declare void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) -declare <4 x float> @__svml_tan(<4 x float>) -declare <4 x float> @__svml_atan(<4 x float>) -declare <4 x float> @__svml_atan2(<4 x float>, <4 x float>) -declare <4 x float> @__svml_exp(<4 x float>) -declare <4 x float> @__svml_log(<4 x float>) -declare <4 x float> @__svml_pow(<4 x float>, <4 x float>) +include(`svml.m4') +svmlf_declare(4) +svmlf_define(4) +svmld_declare(4) +svmld_define(4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d9e0322b..d646720e 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -137,19 +137,9 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +include(`svml.m4') +svmlf_stubs(16) +svmld_stubs(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 90e2f3ac..1d33e3f9 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -137,19 +137,11 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
-
-declare <8 x float> @__svml_sin(<8 x float>)
-declare <8 x float> @__svml_cos(<8 x float>)
-declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
-declare <8 x float> @__svml_tan(<8 x float>)
-declare <8 x float> @__svml_atan(<8 x float>)
-declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
-declare <8 x float> @__svml_exp(<8 x float>)
-declare <8 x float> @__svml_log(<8 x float>)
-declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
+include(`svml.m4')
+svmlf_declare(8)
+svmlf_define(8)
+svmld_declare(4)
+svmld_stubs(8)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index 31ebcdd5..910565dd 100644
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone
 ;declare float @llvm.sqrt.f32(float %Val)
 declare double @llvm.sqrt.f64(double %Val)
 declare float @llvm.sin.f32(float %Val)
+declare float @llvm.asin.f32(float %Val)
 declare float @llvm.cos.f32(float %Val)
 declare float @llvm.sqrt.f32(float %Val)
 declare float @llvm.exp.f32(float %Val)
@@ -651,7 +652,18 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
 
-define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
+declare <1 x double> @__svml_sind(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_asind(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_cosd(<1 x double>) nounwind readnone alwaysinline
+declare void @__svml_sincosd(<1 x double>, <1 x double> *, <1 x double> *) nounwind readnone alwaysinline
+declare <1 x double> @__svml_tand(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_atand(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_atan2d(<1 x double>, <1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_expd(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_logd(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_powd(<1 x double>, <1 x double>) nounwind readnone alwaysinline
+
+define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
 ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
 ;ret <1 x float> %ret
 ;%r = extractelement <1 x float> %0, i32 0
@@ -662,7 +674,18 @@ define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
 
 }
 
-define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
+define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.asin.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float,@llvm.asin.f32)
+
+}
+
+define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
 ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
 ;ret <1 x float> %ret
 ;%r = extractelement <1 x float> %0, i32 0
@@ -673,18 +696,18 @@ define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
 
 }
 
-define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
+define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
 ; %s
= call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) ; store <1 x float> %s, <1 x float> * %1 ; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) + %sin = call <1 x float> @__svml_sinf(<1 x float> %0) + %cos = call <1 x float> @__svml_cosf(<1 x float> %0) store <1 x float> %sin, <1 x float> * %1 store <1 x float> %cos, <1 x float> * %2 ret void } -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -696,7 +719,7 @@ define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { ret <1 x float > %0 } -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline { ; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) ; ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -709,7 +732,7 @@ define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { } -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret ;%y = extractelement <1 x float> %0, i32 0 @@ -722,19 +745,19 @@ define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al ret <1 x float > %0 } -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.exp.f32) } -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.log.f32) } -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret %r = extractelement <1 x float> %0, i32 0 diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 2896c6b1..bc7db9ec 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - ; FIXME: need either to wire these up to the 8-wide SVML entrypoints, ; or, use the macro to call the 4-wide ones twice with our 8-wide ; vectors... 
-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
-declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
-declare <WIDTH x float> @__svml_log(<WIDTH x float>)
-declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+;; svml
+
+include(`svml.m4')
+svmlf_stubs(WIDTH)
+svmld_stubs(WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll
index 696b0748..92fc5ce3 100644
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -316,15 +316,10 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
 ;; yuck. We need declarations of these, even though we shouldnt ever
 ;; actually generate calls to them for the NEON target...
-
-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
-declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
-declare <WIDTH x float> @__svml_log(<WIDTH x float>)
-declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+
+include(`svml.m4')
+svmlf_stubs(WIDTH)
+svmld_stubs(WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index da22a66c..5688ebba 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -105,28 +105,28 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
 
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+include(`svml.m4')
+svmlf_declare(4)
+svmld_declare(2)
+svmld_stubs(8)
 
-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline {
   unary4to8(ret, float, @__svml_sinf4, %0)
   ret <8 x float> %ret
 }
 
+define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_asinf4, %0)
+  ret <8 x float> %ret
+}
+
-define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline {
   unary4to8(ret, float, @__svml_cosf4, %0)
   ret <8 x float> %ret
 }
 
-define void @__svml_sincos(<8 x float>, <8 x float> *,
+define void @__svml_sincosf(<8 x float>, <8 x float> *,
                            <8 x float> *) nounwind readnone alwaysinline {
   ; call svml_sincosf4 two times with the two 4-wide sub-vectors
   %a = shufflevector <8 x float> %0, <8 x float> undef,
@@ -155,33 +155,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *,
   ret void
 }
 
-define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline {
   unary4to8(ret, float, @__svml_tanf4, %0)
   ret <8 x float> %ret
 }
 
-define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone
alwaysinline { unary4to8(ret, float, @__svml_atanf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_atan2(<8 x float>, +define <8 x float> @__svml_atan2f(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_atan2f4, %0, %1) ret <8 x float> %ret } -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_expf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_logf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_pow(<8 x float>, +define <8 x float> @__svml_powf(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_powf4, %0, %1) ret <8 x float> %ret diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index a6b206b6..236cda33 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -496,62 +496,11 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} +include(`svml.m4') 
+svmlf_declare(4) +svmld_declare(2) +svmlf_define(4) +svmld_stubs(4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index d7f3833d..3fbbe534 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -209,16 +209,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ;; svml ; FIXME - -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) +include(`svml.m4') +svmlf_stubs(8) +svmld_stubs(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index fd4b74d7..e65077b7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -222,15 +222,9 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ; FIXME -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +include(`svml.m4') +svmlf_stubs(16) +svmld_stubs(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index a7faddb3..2a69b60a 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -105,28 +105,28 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +svmlf_declare(4) +svmld_declare(2) +svmld_stubs(8) -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_sinf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_asinf4, %0) + ret <8 x float> %ret +} + +define <8 x float> 
@__svml_cosf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_cosf4, %0) ret <8 x float> %ret } -define void @__svml_sincos(<8 x float>, <8 x float> *, +define void @__svml_sincosf(<8 x float>, <8 x float> *, <8 x float> *) nounwind readnone alwaysinline { ; call svml_sincosf4 two times with the two 4-wide sub-vectors %a = shufflevector <8 x float> %0, <8 x float> undef, @@ -155,33 +155,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *, ret void } -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_tanf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_atanf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_atan2(<8 x float>, +define <8 x float> @__svml_atan2f(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_atan2f4, %0, %1) ret <8 x float> %ret } -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_expf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_logf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_pow(<8 x float>, +define <8 x float> @__svml_powf(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_powf4, %0, %1) ret <8 x float> %ret diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index e05b865f..686b4f84 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -209,62 +209,11 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) 
nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
- ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
- ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
- ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
- ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
- ret <4 x float> %ret
-}
+include(`svml.m4')
+svmlf_declare(4)
+svmlf_define(4)
+svmld_declare(2)
+svmld_stubs(4)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/builtins/util.m4 b/builtins/util.m4
index 95e3844d..6c90c821 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -3160,6 +3160,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
 }
 
 declare double @sin(double) nounwind readnone
+declare double @asin(double) nounwind readnone
 declare double @cos(double) nounwind readnone
 declare void @sincos(double, double *, double *) nounwind readnone
 declare double @tan(double) nounwind readnone
@@ -3174,6 +3175,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline {
   ret double %r
 }
 
+define double @__stdlib_asin(double) nounwind readnone alwaysinline {
+  %r = call double @asin(double %0)
+  ret double %r
+}
+
 define double @__stdlib_cos(double) nounwind readnone alwaysinline {
   %r = call double @cos(double %0)
   ret double %r
diff --git a/stdlib.ispc b/stdlib.ispc
index e4f8844f..db9d7f36 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -2180,7 +2180,7 @@ static inline uniform float frexp(uniform float x, uniform int * uniform pw2) {
 __declspec(safe)
 static inline float sin(float x_full) {
     if (__math_lib == __math_lib_svml) {
-        return __svml_sin(x_full);
+        return __svml_sinf(x_full);
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
@@ -2313,8 +2313,10 @@ static inline float asin(float x) {
     bool isnan = (x > 1);
     float v;
 
-    if (__math_lib == __math_lib_svml ||
-        __math_lib == __math_lib_system) {
+    if (__math_lib == __math_lib_svml) {
+        return __svml_asinf(x);
+    }
+    else if (__math_lib == __math_lib_system) {
         float ret;
         foreach_active (i) {
             uniform float r = __stdlib_asinf(extract(x, i));
@@ -2417,7 +2419,7 @@ static inline uniform float asin(uniform float x) {
 __declspec(safe)
 static inline float cos(float x_full) {
     if (__math_lib == __math_lib_svml) {
-        return __svml_cos(x_full);
+        return __svml_cosf(x_full);
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
@@ -2545,18 +2547,28 @@ static inline float acos(float v) {
     return 1.57079637050628662109375 - asin(v);
 }
 
+__declspec(safe)
+static inline double acos(const double v) {
+    return 1.57079637050628662109375 - asin(v);
+}
+
 __declspec(safe)
 static inline uniform float acos(uniform float v) {
     return 1.57079637050628662109375 - asin(v);
 }
 
+__declspec(safe)
+static inline uniform double acos(const uniform double v) {
+    return 1.57079637050628662109375 - asin(v);
+}
+
 __declspec(safe)
 static inline void sincos(float x_full, varying float * uniform sin_result,
                           varying float * uniform cos_result) {
     if (__math_lib == __math_lib_svml) {
-
__svml_sincos(x_full, sin_result, cos_result); + __svml_sincosf(x_full, sin_result, cos_result); } else if (__math_lib == __math_lib_system) { foreach_active (i) { @@ -2688,7 +2700,7 @@ static inline void sincos(uniform float x_full, uniform float * uniform sin_resu __declspec(safe) static inline float tan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_tan(x_full); + return __svml_tanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2839,7 +2851,7 @@ static inline uniform float tan(uniform float x_full) { __declspec(safe) static inline float atan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_atan(x_full); + return __svml_atanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2934,7 +2946,7 @@ static inline uniform float atan(uniform float x_full) { __declspec(safe) static inline float atan2(float y, float x) { if (__math_lib == __math_lib_svml) { - return __svml_atan2(y, x); + return __svml_atan2f(y, x); } else if (__math_lib == __math_lib_system) { float ret; @@ -2997,7 +3009,7 @@ static inline float exp(float x_full) { return __exp_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_exp(x_full); + return __svml_expf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3204,7 +3216,7 @@ static inline float log(float x_full) { return __log_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_log(x_full); + return __svml_logf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3379,7 +3391,7 @@ static inline float pow(float a, float b) { return __pow_varying_float(a, b); } else if (__math_lib == __math_lib_svml) { - return __svml_pow(a, b); + return __svml_powf(a, b); } else if (__math_lib == __math_lib_system) { float ret; @@ -3469,7 +3481,11 @@ static inline uniform double frexp(uniform double x, uniform int * uniform pw2) __declspec(safe) static inline double sin(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_sind(x); + } + else if (__math_lib == __math_lib_ispc_fast) return sin((float)x); else { double ret; @@ -3490,8 +3506,30 @@ static inline uniform double sin(uniform double x) { } __declspec(safe) -static inline double cos(double x) { - if (__math_lib == __math_lib_ispc_fast) +static inline double asin(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_asind(x); + } + else if (__math_lib == __math_lib_ispc_fast) + return asin((float)x); + else { + double ret; + foreach_active (i) { + uniform double r = __stdlib_asin(extract(x, i)); + ret = insert(ret, i, r); + } + return ret; + } +} + +__declspec(safe) +static inline double cos(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_cosd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return cos((float)x); else { double ret; @@ -3514,7 +3552,11 @@ static inline uniform double cos(uniform double x) { __declspec(safe) static inline void sincos(double x, varying double * uniform sin_result, varying double * uniform cos_result) { - if (__math_lib == __math_lib_ispc_fast) { + if (__math_lib == __math_lib_svml) + { + __svml_sincosd(x, sin_result, cos_result); + } + else if (__math_lib == __math_lib_ispc_fast) { float sr, cr; sincos((float)x, &sr, &cr); *sin_result = sr; @@ -3545,7 +3587,11 @@ static inline void sincos(uniform double x, uniform double * uniform sin_result, __declspec(safe) static inline double tan(double x) { - if (__math_lib == 
__math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_tand(x); + } + else if (__math_lib == __math_lib_ispc_fast) return tan((float)x); else { double ret; @@ -3589,7 +3635,11 @@ static inline uniform double atan(uniform double x) { __declspec(safe) static inline double atan2(double y, double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_atan2d(y,x); + } + else if (__math_lib == __math_lib_ispc_fast) return atan2((float)y, (float)x); else { double ret; @@ -3611,7 +3661,11 @@ static inline uniform double atan2(uniform double y, uniform double x) { __declspec(safe) static inline double exp(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_expd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return exp((float)x); else { double ret; @@ -3633,7 +3687,11 @@ static inline uniform double exp(uniform double x) { __declspec(safe) static inline double log(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_logd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return log((float)x); else { double ret; @@ -3655,7 +3713,11 @@ static inline uniform double log(uniform double x) { __declspec(safe) static inline double pow(double a, double b) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_powd(a,b); + } + else if (__math_lib == __math_lib_ispc_fast) return pow((float)a, (float)b); else { double ret; From 7a326995735293a25fb44d5f7243521a57df719a Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 15:18:03 +0200 Subject: [PATCH 03/14] added svml.m4 --- builtins/svml.m4 | 176 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 builtins/svml.m4 diff --git a/builtins/svml.m4 b/builtins/svml.m4 new file mode 100644 index 00000000..cc3cd979 --- /dev/null +++ b/builtins/svml.m4 @@ -0,0 +1,176 @@ +;; svml + +;; stub +define(`svmlf_stubs',` + declare <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline + declare void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline + declare <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline +') + +define(`svmld_stubs',` + declare <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline + declare void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline + declare <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_atand(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_atan2d(<$1 x double>, 
<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_logd(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline +') + +;; single precision +define(`svmlf_declare',` + declare <$1 x float> @__svml_sinf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_asinf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_cosf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_sincosf$1(<$1 x float> *, <$1 x float>) nounwind readnone + declare <$1 x float> @__svml_tanf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_atanf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_atan2f$1(<$1 x float>, <$1 x float>) nounwind readnone + declare <$1 x float> @__svml_expf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_logf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_powf$1(<$1 x float>, <$1 x float>) nounwind readnone +'); + + + +define(`svmlf_define',` + define <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_sinf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + define <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_asinf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_cosf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline { + %s = call <$1 x float> @__svml_sincosf$1(<$1 x float> * %2, <$1 x float> %0) + store <$1 x float> %s, <$1 x float> * %1 + ret void + } + + define <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_tanf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_atanf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_atan2f$1(<$1 x float> %0, <$1 x float> %1) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_expf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_logf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_powf$1(<$1 x float> %0, <$1 x float> %1) + ret <$1 x float> %ret + } +') + +;; double precision +define(`svmld_declare',` + declare <$1 x double> @__svml_sin$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_asin$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_cos$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_sincos$1(<$1 x double> *, <$1 x double>) nounwind readnone + declare <$1 x double> @__svml_tan$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_atan$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_atan2$1(<$1 x 
double>, <$1 x double>) nounwind readnone + declare <$1 x double> @__svml_exp$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_log$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_pow$1(<$1 x double>, <$1 x double>) nounwind readnone +') + +define(`svmld_define',` + define <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_sin$1(<$1 x double> %0) + ret <$1 x double> %ret + } + define <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_asin$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + + define <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_cos$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline { + %s = call <$1 x double> @__svml_sincos$1(<$1 x double> * %2, <$1 x double> %0) + store <$1 x double> %s, <$1 x double> * %1 + ret void + } + + define <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_tan$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_atand(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_atan$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_atan2d(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_atan2$1(<$1 x double> %0, <$1 x double> %1) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_exp$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_logd(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_log$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_pow$1(<$1 x double> %0, <$1 x double> %1) + ret <$1 x double> %ret + } +') + +;; need to implement smvld for 2xvectorWidth ...:w + +define(`svmld2_define',` + define <$1 x double> @__svml_sinxx(<$1 x double>) nounwind readnone alwaysinline { + %v0 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> + %v1 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> + %ret0 = call <$2 x double> @__svml_sin$2(<$2 x double> %v0) + %ret1 = call <$2 x double> @__svml_sin$2(<$2 x double> %v1) + %ret = shufflevector <$2 x double> %ret0, <$2 x double> %ret1, <$1 x i32> + ret <$1 x double> %ret + } +') From 9cf8e8cbf3945df122bf0652326be1404634c0cb Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 15:23:45 +0200 Subject: [PATCH 04/14] builtins fix for double precision svml and __stdlib_asin --- builtins.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index 139b8f04..816d4d78 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -576,22 +576,23 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_pow", "__stdlib_powf", "__stdlib_sin", + "__stdlib_asin", "__stdlib_sincos", "__stdlib_sincosf", "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_asin", - "__svml_cos", - "__svml_acos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - 
"__svml_pow", + "__svml_sind", + "__svml_asind", + "__svml_cosd", + "__svml_acosd", + "__svml_sincosd", + "__svml_tand", + "__svml_atand", + "__svml_atan2d", + "__svml_expd", + "__svml_logd", + "__svml_powd", "__svml_sinf", "__svml_asinf", "__svml_cosf", From 19379db3b60a60f2f1862a54709115bcf11c7545 Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 16:48:56 +0200 Subject: [PATCH 05/14] svml cleanup --- builtins/svml.m4 | 209 +++++++++--------------------- builtins/target-avx-h.ll | 11 +- builtins/target-avx-x2.ll | 9 +- builtins/target-avx.ll | 11 +- builtins/target-generic-common.ll | 4 +- builtins/target-sse2-x2.ll | 8 +- builtins/target-sse2.ll | 12 +- builtins/target-sse4-16.ll | 4 +- builtins/target-sse4-8.ll | 4 +- builtins/target-sse4-x2.ll | 9 +- builtins/target-sse4.ll | 11 +- 11 files changed, 116 insertions(+), 176 deletions(-) diff --git a/builtins/svml.m4 b/builtins/svml.m4 index cc3cd979..9608dea6 100644 --- a/builtins/svml.m4 +++ b/builtins/svml.m4 @@ -1,176 +1,93 @@ ;; svml -;; stub -define(`svmlf_stubs',` - declare <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline - declare void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline - declare <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline +;; stubs +define(`svml_stubs',` + declare <$2 x $1> @__svml_sin$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_asin$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_cos$3(<$2 x $1>) nounwind readnone alwaysinline + declare void @__svml_sincos$3(<$2 x $1>, <$2 x $1> *, <$2 x $1> *) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_tan$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_atan$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_atan2$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_exp$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_log$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_pow$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline ') -define(`svmld_stubs',` - declare <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline - declare void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline - declare <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_atand(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_atan2d(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_logd(<$1 x double>) 
nounwind readnone alwaysinline - declare <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline -') - -;; single precision -define(`svmlf_declare',` - declare <$1 x float> @__svml_sinf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_asinf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_cosf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_sincosf$1(<$1 x float> *, <$1 x float>) nounwind readnone - declare <$1 x float> @__svml_tanf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_atanf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_atan2f$1(<$1 x float>, <$1 x float>) nounwind readnone - declare <$1 x float> @__svml_expf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_logf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_powf$1(<$1 x float>, <$1 x float>) nounwind readnone +;; decalre __svml calls +define(`svml_declare',` + declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone '); - - -define(`svmlf_define',` - define <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_sinf$1(<$1 x float> %0) - ret <$1 x float> %ret +;; define native __svml calls +define(`svml_define',` + define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_asinf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_cosf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline { - %s = call <$1 x float> @__svml_sincosf$1(<$1 x float> * %2, <$1 x float> %0) - store <$1 x float> %s, <$1 x float> * %1 + define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline { + %s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0) + store <$3 x $1> %s, <$3 x $1> * %1 ret void } - define <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_tanf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline { - %ret 
= call <$1 x float> @__svml_atanf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_atan2f$1(<$1 x float> %0, <$1 x float> %1) - ret <$1 x float> %ret + define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_expf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_logf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_powf$1(<$1 x float> %0, <$1 x float> %1) - ret <$1 x float> %ret + define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret } ') -;; double precision -define(`svmld_declare',` - declare <$1 x double> @__svml_sin$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_asin$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_cos$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_sincos$1(<$1 x double> *, <$1 x double>) nounwind readnone - declare <$1 x double> @__svml_tan$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_atan$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_atan2$1(<$1 x double>, <$1 x double>) nounwind readnone - declare <$1 x double> @__svml_exp$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_log$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_pow$1(<$1 x double>, <$1 x double>) nounwind readnone + +;; define x2 __svml calls +define(`svml_define_x2',` + svml_stubs($1,$3,$4) ') -define(`svmld_define',` - define <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_sin$1(<$1 x double> %0) - ret <$1 x double> %ret - } - define <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_asin$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - - define <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_cos$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline { - %s = call <$1 x double> @__svml_sincos$1(<$1 x double> * %2, <$1 x double> %0) - store <$1 x double> %s, <$1 x double> * %1 - ret void - } - - define <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_tan$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_atand(<$1 x double>) 
nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_atan$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_atan2d(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_atan2$1(<$1 x double> %0, <$1 x double> %1) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_exp$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_logd(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_log$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_pow$1(<$1 x double> %0, <$1 x double> %1) - ret <$1 x double> %ret - } -') - -;; need to implement smvld for 2xvectorWidth ...:w - -define(`svmld2_define',` - define <$1 x double> @__svml_sinxx(<$1 x double>) nounwind readnone alwaysinline { - %v0 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> - %v1 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> - %ret0 = call <$2 x double> @__svml_sin$2(<$2 x double> %v0) - %ret1 = call <$2 x double> @__svml_sin$2(<$2 x double> %v1) - %ret = shufflevector <$2 x double> %ret0, <$2 x double> %ret1, <$1 x i32> - ret <$1 x double> %ret - } +;; define x4 __svml calls +define(`svml_define_x4',` + svml_stubs($1,$3,$4) ') diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll index a06e5ab3..283eaddd 100644 --- a/builtins/target-avx-h.ll +++ b/builtins/target-avx-h.ll @@ -155,10 +155,13 @@ define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { ;; svml include(`svml.m4') -svmlf_declare(4) -svmlf_define(4) -svmld_declare(4) -svmld_define(4) +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,4,4) +svml_define(double,4,4,d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d646720e..f3f1590a 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -138,8 +138,13 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ;; svml include(`svml.m4') -svmlf_stubs(16) -svmld_stubs(16) +;; single precision +svml_declare(float,f8,8) +svml_define_x2(float,f8,8,f,16) + +;; double precision +svml_declare(double,4,4) +svml_define_x2(double,4,4,d,16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 1d33e3f9..7e7ab330 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -138,10 +138,13 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;; svml include(`svml.m4') -svmlf_declare(8) -svmlf_define(8) -svmld_declare(4) -svmld_stubs(8) +;; single precision +svml_declare(float,f8,8) +svml_define(float,f8,8,f) + +;; double precision +svml_declare(double,4,4) +svml_define_x2(double,4,4,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bc7db9ec..30a8b030 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -209,8 +209,8 @@ declare i64 @__count_leading_zeros_i64(i64) 
nounwind readnone ;; svml include(`svml.m4') -svmlf_stubs(WIDTH) -svmld_stubs(WIDTH) +svml_stubs(float, WIDTH, f) +svml_stubs(double, WIDTH, d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 5688ebba..9fa607a4 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -106,10 +106,12 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmld_declare(2) -svmld_stubs(8) +;; single precision +svml_declare(float,f4,4) +;; double precision +svml_declare(double,2,2) +svml_define_x4(double,2,2,d,8) define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_sinf4, %0) diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 236cda33..c858ccb6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -497,10 +497,14 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmld_declare(2) -svmlf_define(4) -svmld_stubs(4) +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,2,2) +svml_define_x2(double,2,2,d,4) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 3fbbe534..3f8cd339 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -210,8 +210,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ; FIXME include(`svml.m4') -svmlf_stubs(8) -svmld_stubs(8) +svml_stubs(float,8,f) +svml_stubs(double,8,d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index e65077b7..f43cd940 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -223,8 +223,8 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ; FIXME include(`svml.m4') -svmlf_stubs(16) -svmld_stubs(16) +svml_stubs(float,16,f) +svml_stubs(double,16,d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 2a69b60a..c45966e3 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -106,9 +106,12 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmld_declare(2) -svmld_stubs(8) +;; single precision +svml_declare(float,f4,4) + +;; double precision +svml_declare(double,2,2) +svml_define_x4(double,2,2,d,8) define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 686b4f84..eb82ab9a 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -210,10 +210,13 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmlf_define(4) -svmld_declare(2) -svmld_stubs(8) +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,2,2) +svml_define_x2(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; 
horizontal ops / reductions From efc20c211061585150abb02b4720316f0e45dad5 Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 17:07:54 +0200 Subject: [PATCH 06/14] added svml support to all sse/avx modes --- builtins/svml.m4 | 44 ++++++++++++++++++--- builtins/target-avx-x2.ll | 4 +- builtins/target-avx.ll | 2 +- builtins/target-sse2-x2.ll | 79 +------------------------------------ builtins/target-sse2.ll | 2 +- builtins/target-sse4-x2.ll | 80 +------------------------------------- builtins/target-sse4.ll | 2 +- 7 files changed, 47 insertions(+), 166 deletions(-) diff --git a/builtins/svml.m4 b/builtins/svml.m4 index 9608dea6..71a6a709 100644 --- a/builtins/svml.m4 +++ b/builtins/svml.m4 @@ -83,11 +83,43 @@ define(`svml_define',` ;; define x2 __svml calls -define(`svml_define_x2',` - svml_stubs($1,$3,$4) +define(`svml_define_x',` + define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_sin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_asin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_cos$2, %0) + ret <$5 x $1> %ret + } + declare void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline + define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_tan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_atan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_exp$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_log$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_pow$2, %0, %1) + ret <$5 x $1> %ret + } ') -;; define x4 __svml calls -define(`svml_define_x4',` - svml_stubs($1,$3,$4) -') diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index f3f1590a..f8fd5cd5 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -140,11 +140,11 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always include(`svml.m4') ;; single precision svml_declare(float,f8,8) -svml_define_x2(float,f8,8,f,16) +svml_define_x(float,f8,8,f,16) ;; double precision svml_declare(double,4,4) -svml_define_x2(double,4,4,d,16) +svml_define_x(double,4,4,d,16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 7e7ab330..196e5ea4 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -144,7 +144,7 @@ svml_define(float,f8,8,f) ;; double precision svml_declare(double,4,4) -svml_define_x2(double,4,4,d,8) +svml_define_x(double,4,4,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 9fa607a4..77bf1a9d 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -108,86 +108,11 @@ 
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin include(`svml.m4') ;; single precision svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) ;; double precision svml_declare(double,2,2) -svml_define_x4(double,2,2,d,8) - -define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_asinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincosf(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2f(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_powf(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +svml_define_x(double,2,2,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index c858ccb6..e42d4990 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -503,7 +503,7 @@ svml_define(float,f4,4,f) ;; double precision svml_declare(double,2,2) -svml_define_x2(double,2,2,d,4) +svml_define_x(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index c45966e3..842db53f 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -108,87 +108,11 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin include(`svml.m4') ;; single precision svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) ;; double precision svml_declare(double,2,2) -svml_define_x4(double,2,2,d,8) - - -define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, 
@__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_asinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincosf(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2f(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_powf(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +svml_define_x(double,2,2,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index eb82ab9a..88be6c59 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -216,7 +216,7 @@ svml_define(float,f4,4,f) ;; double precision svml_declare(double,2,2) -svml_define_x2(double,2,2,d,4) +svml_define_x(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions From 7364e06387e7cc02f1a144097754e03181602208 Mon Sep 17 00:00:00 2001 From: egaburov Date: Thu, 12 Sep 2013 12:02:42 +0200 Subject: [PATCH 07/14] added mask64 --- Makefile | 17 ++- builtins.cpp | 10 +- .../{target-avxh.ll => target-avx-i64x4.ll} | 2 +- ...arget-avx-h.ll => target-avx-i64x4base.ll} | 137 +++++++----------- builtins/util.m4 | 76 +++++++++- ispc.cpp | 5 +- llvmutil.cpp | 22 ++- parse.yy | 3 + stdlib.ispc | 3 + 9 files changed, 175 insertions(+), 100 deletions(-) rename builtins/{target-avxh.ll => target-avx-i64x4.ll} (98%) rename builtins/{target-avx-h.ll => target-avx-i64x4base.ll} (78%) diff --git a/Makefile b/Makefile index 43f41e09..92debe4f 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h 
llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avxh avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) @@ -160,7 +160,7 @@ BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -268,20 +268,25 @@ objs/builtins-c-64.cpp: builtins/builtins.c objs/stdlib_mask1_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask1 - @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask1 > $@ objs/stdlib_mask8_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask8 - @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask8 > $@ objs/stdlib_mask16_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask16 - @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask16 > $@ objs/stdlib_mask32_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask32 - @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask32 > $@ + +objs/stdlib_mask64_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask64 + @$(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask64 > $@ diff --git a/builtins.cpp b/builtins.cpp index 816d4d78..f8d4136e 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -302,6 +302,7 @@ lCheckModuleIntrinsics(llvm::Module *module) { // check the llvm.x86.* intrinsics for now... if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) { llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID(); + if (id == 0) fprintf(stderr, "FATAL: intrinsic is not found: %s \n", funcName.c_str()); Assert(id != 0); llvm::Type *intrinsicType = llvm::Intrinsic::getType(*g->ctx, id); @@ -936,10 +937,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod switch (g->target->getVectorWidth()) { case 4: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_avxh_32bit); + EXPORT_MODULE(builtins_bitcode_avx_i64x4_32bit); } else { - EXPORT_MODULE(builtins_bitcode_avxh_64bit); + EXPORT_MODULE(builtins_bitcode_avx_i64x4_64bit); } break; case 8: @@ -1105,7 +1106,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // serialized version of the stdlib.ispc file to get its // definitions added. 
extern char stdlib_mask1_code[], stdlib_mask8_code[]; - extern char stdlib_mask16_code[], stdlib_mask32_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[]; if (g->target->getISA() == Target::GENERIC && g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib yy_scan_string(stdlib_mask32_code); @@ -1124,6 +1125,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod case 32: yy_scan_string(stdlib_mask32_code); break; + case 64: + yy_scan_string(stdlib_mask64_code); + break; default: FATAL("Unhandled mask bit size for stdlib.ispc"); } diff --git a/builtins/target-avxh.ll b/builtins/target-avx-i64x4.ll similarity index 98% rename from builtins/target-avxh.ll rename to builtins/target-avx-i64x4.ll index 98c9111d..d7dbb6bd 100644 --- a/builtins/target-avxh.ll +++ b/builtins/target-avx-i64x4.ll @@ -29,7 +29,7 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -include(`target-avx-h.ll') +include(`target-avx-i64x4base.ll') rdrand_decls() diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-i64x4base.ll similarity index 78% rename from builtins/target-avx-h.ll rename to builtins/target-avx-i64x4base.ll index 283eaddd..05bf178d 100644 --- a/builtins/target-avx-h.ll +++ b/builtins/target-avx-i64x4base.ll @@ -33,7 +33,7 @@ ;; Basic 4-wide definitions define(`WIDTH',`4') -define(`MASK',`i32') +define(`MASK',`i64') include(`util.m4') stdlib_core() @@ -185,32 +185,32 @@ define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind reado ; horizontal ops ;; sse intrinsic -declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone +declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone -define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %v64 = zext i32 %v to i64 ret i64 %v64 } -define i1 @__any(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i1 @__any(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %cmp = icmp ne i32 %v, 0 ret i1 %cmp } -define i1 @__all(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i1 @__all(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %cmp = icmp eq i32 %v, 15 ret i1 %cmp } -define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i1 @__none(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %cmp = icmp eq i32 %v, 0 ret i1 %cmp } @@ 
-392,7 +392,8 @@ masked_load(i16, 2) declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask) declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask) -define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline { +define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline { + %mask = trunc <4 x i64> %mask64 to <4 x i32> %floatmask = bitcast <4 x i32> %mask to <4 x float> %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask) %retval = bitcast <4 x float> %floatval to <4 x i32> @@ -400,18 +401,11 @@ define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline } -define <4 x i64> @__masked_load_i64(i8 *, <4 x i32> %mask) nounwind alwaysinline { - ; double up masks, bitcast to doubles - %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef, - <8 x i32> - %mask0d = bitcast <8 x i32> %mask0 to <4 x double> - - %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d) - - %vald = shufflevector <4 x double> %val0d, <4 x double> undef, - <4 x i32> - %val = bitcast <4 x double> %vald to <4 x i64> - ret <4 x i64> %val +define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline { + %doublemask = bitcast <4 x i64> %mask to <4 x double> + %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask) + %retval = bitcast <4 x double> %doubleval to <4 x i64> + ret <4 x i64> %retval } masked_load_float_double() @@ -428,83 +422,62 @@ declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>) declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>) define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>, - <4 x i32>) nounwind alwaysinline { - %ptr = bitcast <4 x i32> * %0 to i8 * - %val = bitcast <4 x i32> %1 to <4 x float> - %mask = bitcast <4 x i32> %2 to <4 x float> + <4 x i64>) nounwind alwaysinline { + %mask32 = trunc <4 x i64> %2 to <4 x i32> + + %ptr = bitcast <4 x i32> * %0 to i8 * + %val = bitcast <4 x i32> %1 to <4 x float> + %mask = bitcast <4 x i32> %mask32 to <4 x float> call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val) ret void } define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>, - <4 x i32> %mask) nounwind alwaysinline { - %ptr = bitcast <4 x i64> * %0 to i8 * - %val = bitcast <4 x i64> %1 to <4 x double> - - %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef, - <8 x i32> - - %mask0d = bitcast <8 x i32> %mask0 to <4 x double> - - %val0 = shufflevector <4 x double> %val, <4 x double> undef, - <4 x i32> - - call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0) + <4 x i64>) nounwind alwaysinline { + %ptr = bitcast <4 x i64> * %0 to i8 * + %val = bitcast <4 x i64> %1 to <4 x double> + %mask = bitcast <4 x i64> %2 to <4 x double> + call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val) ret void } -masked_store_blend_8_16_by_4() +masked_store_blend_8_16_by_4_mask64() ;; sse intrinsic -declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, +declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>, - <4 x i32> %mask) nounwind alwaysinline { + <4 x i64>) nounwind alwaysinline { + %mask = trunc <4 x i64> %2 to <4 x i32> %mask_as_float = bitcast <4 x i32> %mask to <4 x float> - %oldValue 
= load <4 x i32>* %0, align 4 - %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> - %newAsFloat = bitcast <4 x i32> %1 to <4 x float> - %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat, - <4 x float> %newAsFloat, - <4 x float> %mask_as_float) + %oldValue = load <4 x i32>* %0, align 4 + %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> + %newAsFloat = bitcast <4 x i32> %1 to <4 x float> + %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat, + <4 x float> %newAsFloat, + <4 x float> %mask_as_float) %blendAsInt = bitcast <4 x float> %blend to <4 x i32> store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4 ret void } ;; avx intrinsic -declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, - <8 x float>) nounwind readnone +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, + <4 x double>) nounwind readnone -define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new, - <4 x i32> %i32mask) nounwind alwaysinline { - %oldValue = load <4 x i64>* %ptr, align 8 - %mask = bitcast <4 x i32> %i32mask to <4 x float> - - ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values - ; are actually bitcast <4 x i64> values - ; - ; set up the first four 64-bit values - %old01 = bitcast <4 x i64> %oldValue to <4 x i64> - %old01f = bitcast <4 x i64> %old01 to <8 x float> - %new01 = bitcast <4 x i64> %new to <4 x i64> - %new01f = bitcast <4 x i64> %new01 to <8 x float> - ; compute mask--note that the indices are all doubled-up - %mask01 = shufflevector <4 x float> %mask, <4 x float> undef, - <8 x i32> - ; and blend them - %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f, - <8 x float> %new01f, - <8 x float> %mask01) - %result01 = bitcast <8 x float> %result01f to <4 x i64> - - - %final = bitcast <4 x i64> %result01 to <4 x i64> - store <4 x i64> %final, <4 x i64> * %ptr, align 8 +define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>, + <4 x i64>) nounwind alwaysinline { + %mask_as_double = bitcast <4 x i64> %2 to <4 x double> + %oldValue = load <4 x i64>* %0, align 4 + %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double> + %newAsDouble = bitcast <4 x i64> %1 to <4 x double> + %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble, + <4 x double> %newAsDouble, + <4 x double> %mask_as_double) + %blendAsInt = bitcast <4 x double> %blend to <4 x i64> + store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4 ret void } diff --git a/builtins/util.m4 b/builtins/util.m4 index 6c90c821..68fa818b 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -947,6 +947,22 @@ define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { %r = sext <$1 x i32> %0 to <$1 x i64> ret <$1 x i64> %r } + +define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) { + ret <$1 x i64> %0 +} ') mask_converts(WIDTH) @@ -2689,9 +2705,13 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i32, `ret %0', - `%se = sext %0 to - ret %se') +;; 
ifelse(MASK,i32, `ret %0', +;; `%se = sext %0 to +;; ret %se') + ifelse(MASK,i32, `%se = bitcast %0 to ', + MASK,i64, `%se = trunc %0 to ', + `%se = sext %0 to ') + ret %se } @@ -3508,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, } ') +define(`masked_store_blend_8_16_by_4_mask64', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i64> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 %mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i64> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + define(`masked_store_blend_8_16_by_8', ` define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, <8 x i32>) nounwind alwaysinline { diff --git a/ispc.cpp b/ispc.cpp index 02c23568..046c64c4 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -446,14 +446,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avxh") ) { - fprintf(stderr, " ISA is avxh \n"); + else if (!strcasecmp(isa, "avx-i64x4") ) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; - this->m_maskBitCount = 32; + this->m_maskBitCount = 64; } else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2") || diff --git a/llvmutil.cpp b/llvmutil.cpp index 180c8676..64691498 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -132,6 +132,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); break; + case 64: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt64Ty(*ctx), target.getVectorWidth()); + break; default: FATAL("Unhandled mask width for initializing MaskType"); } @@ -183,6 +187,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff break; + case 64: + onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1, + true /*signed*/); // 0xffffffff + break; default: FATAL("Unhandled mask width for onMask"); } @@ -210,6 +218,10 @@ 
InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); break; + case 64: + offMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), 0, + true /*signed*/); + break; default: FATAL("Unhandled mask width for offMask"); } @@ -480,7 +492,10 @@ LLVMUInt64Vector(const uint64_t *ivec) { llvm::Constant * LLVMBoolVector(bool b) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, b ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) @@ -506,7 +521,10 @@ LLVMBoolVector(const bool *bvec) { std::vector vals; for (int i = 0; i < g->target->getVectorWidth(); ++i) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, bvec[i] ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) diff --git a/parse.yy b/parse.yy index 5fc01cb0..9a2b4fc3 100644 --- a/parse.yy +++ b/parse.yy @@ -2183,6 +2183,9 @@ static void lAddMaskToSymbolTable(SourcePos pos) { case 32: t = AtomicType::VaryingUInt32; break; + case 64: + t = AtomicType::VaryingUInt64; + break; default: FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); } diff --git a/stdlib.ispc b/stdlib.ispc index db9d7f36..6d7ee051 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -50,6 +50,9 @@ #elif (ISPC_MASK_BITS == 32) #define IntMaskType int32 #define UIntMaskType unsigned int32 +#elif (ISPC_MASK_BITS == 64) + #define IntMaskType int64 + #define UIntMaskType unsigned int64 #else #error Unknown value of ISPC_MASK_BITS #endif From 059d80cc11d0cf50d337fceb1ae04d0c3c365152 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 12 Sep 2013 17:18:12 +0200 Subject: [PATCH 08/14] included suggested changes, ./tests/launch-*.ispc still fails. something is mask64 related, not sure what. help... --- .gitignore | 3 - builtins/svml.m4 | 124 ++++++++++++++++++++++++++---- builtins/target-avx-i64x4.ll | 2 +- builtins/target-avx-i64x4base.ll | 2 +- builtins/target-generic-common.ll | 4 +- builtins/target-neon-common.ll | 4 +- builtins/target-sse4-16.ll | 4 +- builtins/target-sse4-8.ll | 4 +- llvmutil.cpp | 2 +- run_tests.py | 2 +- 10 files changed, 120 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 3bec2ace..88fb0197 100644 --- a/.gitignore +++ b/.gitignore @@ -12,8 +12,5 @@ examples/*/*.png examples/*/*.ppm examples/*/objs/* *.swp -.* -!.gitignore - diff --git a/builtins/svml.m4 b/builtins/svml.m4 index 71a6a709..0a587577 100644 --- a/builtins/svml.m4 +++ b/builtins/svml.m4 @@ -1,20 +1,61 @@ -;; svml +;; copyright stub :) +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. 
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;   * Redistributions of source code must retain the above copyright
+;;     notice, this list of conditions and the following disclaimer.
+;;
+;;   * Redistributions in binary form must reproduce the above copyright
+;;     notice, this list of conditions and the following disclaimer in the
+;;     documentation and/or other materials provided with the distribution.
+;;
+;;   * Neither the name of Intel Corporation nor the names of its
+;;     contributors may be used to endorse or promote products derived from
+;;     this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-;; stubs
+
+;; svml macro
+
+;; svml_stubs : stubs for svml calls
+;; $1 - type ("float" or "double")
+;; $2 - svml internal function suffix ("f" for float, "d" for double)
+;; $3 - vector width
 define(`svml_stubs',`
-  declare <$2 x $1> @__svml_sin$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_asin$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_cos$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare void @__svml_sincos$3(<$2 x $1>, <$2 x $1> *, <$2 x $1> *) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_tan$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_atan$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_atan2$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_exp$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_log$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_pow$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
 ')
-;; decalre __svml calls
+;; svml_declare : declaration of __svml_* intrinsics
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix
+;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
+;;      double: "2"(sse) "4"(avx) "8"(avx512)
+;; $3 - vector width
 define(`svml_declare',`
 declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
 declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
@@ -28,7 +69,13 @@ define(`svml_declare',`
 declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
 ');
-;; define native __svml calls
+;; definition of __svml_* internal functions
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix
+;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
+;;      double: "2"(sse) "4"(avx) "8"(avx512)
+;; $3 - vector width
+;; $4 - svml internal function suffix ("f" for float, "d" for double)
 define(`svml_define',`
 define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
   %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
@@ -82,7 +129,45 @@ define(`svml_define',`
 ')
-;; define x2 __svml calls
+;; svml_define_x : definition of __svml_* internal functions operating on an extended width
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix
+;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
+;;      double: "2"(sse) "4"(avx) "8"(avx512)
+;; $3 - vector width
+;; $4 - svml internal function suffix ("f" for float, "d" for double)
+;; $5 - extended width, must be at least twice the native vector width;
+;;      contingent on the existence of the unary$3to$5 and binary$3to$5 macros
+
+;; *todo*: in sincos, use the native __svml_sincos[f][2,4,8,16] call, e.g.
+;;define void @__svml_sincosf(<8 x float>, <8 x float> *,
+;;                            <8 x float> *) nounwind readnone alwaysinline {
+;;  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
+;;  %a = shufflevector <8 x float> %0, <8 x float> undef,
+;;         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;;  %b = shufflevector <8 x float> %0, <8 x float> undef,
+;;         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+;;
+;;  %cospa = alloca <4 x float>
+;;  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
+;;
+;;  %cospb = alloca <4 x float>
+;;  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
+;;
+;;  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
+;;           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;;  store <8 x float> %sin, <8 x float> * %1
+;;
+;;  %cosa = load <4 x float> * %cospa
+;;  %cosb = load <4 x float> * %cospb
+;;  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
+;;           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;;  store <8 x float> %cos, <8 x float> * %2
+;;
+;;  ret void
+;;}
 define(`svml_define_x',`
 define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
   unary$3to$5(ret, $1, @__svml_sin$2, %0)
   ret <$5 x $1> %ret
 }
@@ -96,7 +181,14 @@ define(`svml_define_x',`
   unary$3to$5(ret, $1, @__svml_cos$2, %0)
   ret <$5 x $1> %ret
 }
- declare void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
+ define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
+ {
+   %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
+   %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
+   store <$5 x $1> %s, <$5 x $1> * %1
+   store <$5 x $1> %c, <$5 x $1> * %2
+   ret void
+ }
 define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
   unary$3to$5(ret, $1, @__svml_tan$2, %0)
   ret <$5 x $1> %ret
diff --git a/builtins/target-avx-i64x4.ll b/builtins/target-avx-i64x4.ll
index d7dbb6bd..65490ea5 100644
--- a/builtins/target-avx-i64x4.ll
+++ b/builtins/target-avx-i64x4.ll
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
diff --git a/builtins/target-avx-i64x4base.ll b/builtins/target-avx-i64x4base.ll
index 05bf178d..e1832030 100644
--- a/builtins/target-avx-i64x4base.ll
+++ b/builtins/target-avx-i64x4base.ll
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2012, Intel Corporation
+;; Copyright (c) 2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 30a8b030..2a5d1b32 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -209,8 +209,8 @@ declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
 ;; svml
 include(`svml.m4')
-svml_stubs(float, WIDTH, f)
-svml_stubs(double, WIDTH, d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll
index 92fc5ce3..1c0b421f 100644
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -318,8 +318,8 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
 include(`svml.m4')
-svmlf_stubs(WIDTH)
-svmld_stubs(WIDTH)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
index 3f8cd339..72b81ff0 100644
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -210,8 +210,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
 ; FIXME
 include(`svml.m4')
-svml_stubs(float,8,f)
-svml_stubs(double,8,d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll
index f43cd940..69b355e3 100644
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -223,8 +223,8 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin
 ; FIXME
 include(`svml.m4')
-svml_stubs(float,16,f)
-svml_stubs(double,16,d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 64691498..275cf794 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -189,7 +189,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
         break;
     case 64:
         onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1,
-                                        true /*signed*/); // 0xffffffff
+                                        true /*signed*/); // 0xffffffffffffffffull
         break;
     default:
         FATAL("Unhandled mask width for onMask");
diff --git a/run_tests.py b/run_tests.py
index 9729930f..180205a0 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -75,7 +75,7 @@ if not os.path.exists(ispc_exe):
     sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe)
     sys.exit()
-ispc_exe += " " + options.ispc_flags
+ispc_exe += " -g " + options.ispc_flags
 if __name__ == '__main__':
     sys.stdout.write("ispc compiler: %s\n" % ispc_exe)

From 40af8d6ed564cc5970786459587ecdc487a1fc44 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Thu, 12 Sep 2013 20:25:44 +0200
Subject: [PATCH 09/14] fixed segfault in tests/launch-*.ispc. nativeVectorWidth
 in avx-i64x4 was set to 4.
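The one-line change in the hunk below raises m_nativeVectorWidth back to 8; the comment it adds notes that the native width is counted in floats, so a 4-wide 64-bit-element target still reports 8. A standalone C++ sketch of that relationship, illustrative only and not ispc source, where the only assumed fact is the 256-bit AVX ymm register:

    #include <cstdio>

    int main() {
        const int registerBits = 256;               // AVX ymm register
        const int nativeWidth  = registerBits / 32; // 8 float lanes  -> m_nativeVectorWidth
        const int vectorWidth  = registerBits / 64; // 4 i64 mask lanes -> m_vectorWidth
        printf("native=%d, programmed=%d\n", nativeWidth, vectorWidth);
        return 0;
    }
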
Fixed --- ispc.cpp | 2 +- run_tests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index 046c64c4..1a99154b 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -448,7 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } else if (!strcasecmp(isa, "avx-i64x4") ) { this->m_isa = Target::AVX; - this->m_nativeVectorWidth = 4; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; diff --git a/run_tests.py b/run_tests.py index 180205a0..9729930f 100755 --- a/run_tests.py +++ b/run_tests.py @@ -75,7 +75,7 @@ if not os.path.exists(ispc_exe): sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe) sys.exit() -ispc_exe += " -g " + options.ispc_flags +ispc_exe += " " + options.ispc_flags if __name__ == '__main__': sys.stdout.write("ispc compiler: %s\n" % ispc_exe) From 715b82826634644eec8f95f40e53d16b8a587ca3 Mon Sep 17 00:00:00 2001 From: egaburov Date: Fri, 13 Sep 2013 09:25:52 +0200 Subject: [PATCH 10/14] fixed float constants to be read as doubles --- lex.ll | 4 ++-- parse.yy | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lex.ll b/lex.ll index 8baa627a..129f0cd5 100644 --- a/lex.ll +++ b/lex.ll @@ -440,13 +440,13 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA {FLOAT_NUMBER} { RT; - yylval.floatVal = (float)atof(yytext); + yylval.floatVal = atof(yytext); return TOKEN_FLOAT_CONSTANT; } {HEX_FLOAT_NUMBER} { RT; - yylval.floatVal = (float)lParseHexFloat(yytext); + yylval.floatVal = lParseHexFloat(yytext); return TOKEN_FLOAT_CONSTANT; } diff --git a/parse.yy b/parse.yy index 9a2b4fc3..b55d49e0 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,7 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; + double floatVal; std::string *stringVal; const char *constCharPtr; @@ -326,8 +326,8 @@ primary_expression (uint64_t)yylval.intVal, @1); } | TOKEN_FLOAT_CONSTANT { - $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - (float)yylval.floatVal, @1); + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.floatVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); From a97eb7b7cb217fb8f583314612527171488b0f79 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 13 Sep 2013 09:32:59 +0200 Subject: [PATCH 11/14] added clamp in double precision --- stdlib.ispc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/stdlib.ispc b/stdlib.ispc index 6d7ee051..0d5c4efd 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1559,6 +1559,18 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } +// double + +__declspec(safe,cost2) +static inline double clamp(double v, double low, double high) { + return min(max(v, low), high); +} + +__declspec(safe,cost2) +static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { + return min(max(v, low), high); +} + // int8 __declspec(safe,cost2) From a9913c83377614dde2ac782e298f437e45dcbd84 Mon Sep 17 00:00:00 2001 From: egaburov Date: Fri, 13 Sep 2013 10:26:15 +0200 Subject: [PATCH 12/14] changed lexer/parser to be able to read float constants, if they have "f"-suffix --- lex.ll | 23 ++++++++++++++++++++--- parse.yy | 11 ++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/lex.ll b/lex.ll index 
129f0cd5..7a3db71a 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, @@ -152,6 +152,7 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; @@ -266,6 +267,7 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; @@ -341,6 +343,8 @@ inline int ispcRand() { WHITESPACE [ \t\r]+ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. +DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*?)|(\.[0-9]+)))([eE][-+]?[0-9]+)??) +HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+?) FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) 
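Taken together, the DOUBLE_NUMBER/HEX_DOUBLE_NUMBER patterns above and the rule block in the next hunk let the suffix decide the type: DOUBLE_NUMBER deliberately lacks the trailing [fF] that FLOAT_NUMBER accepts, and since flex prefers the longest match (and, on equal length, the earlier rule), "1.5" now lexes as a double constant while "1.5f" stays a float. A hypothetical standalone C++ sketch of that classification, not the ispc lexer itself; it leans on C99 hex-float support in atof:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Mirrors the regex split: a trailing 'f'/'F' selects the float token.
    static bool isFloatLiteral(const char *text) {
        size_t n = strlen(text);
        return n > 0 && (text[n - 1] == 'f' || text[n - 1] == 'F');
    }

    int main() {
        const char *samples[] = { "1.5", "1.5f", "0x1.8p1", "0x1.8p1F" };
        for (const char *s : samples) {
            if (isFloatLiteral(s))
                printf("%-9s -> float  %.9g\n", s, (double)(float)atof(s));
            else
                printf("%-9s -> double %.17g\n", s, atof(s));
        }
        return 0;
    }
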
@@ -438,15 +442,28 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA } + +{DOUBLE_NUMBER} { + RT; + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + +{HEX_DOUBLE_NUMBER} { + RT; + yylval.doubleVal = lParseHexFloat(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + {FLOAT_NUMBER} { RT; - yylval.floatVal = atof(yytext); + yylval.floatVal = (float)atof(yytext); return TOKEN_FLOAT_CONSTANT; } {HEX_FLOAT_NUMBER} { RT; - yylval.floatVal = lParseHexFloat(yytext); + yylval.floatVal = (float)lParseHexFloat(yytext); return TOKEN_FLOAT_CONSTANT; } diff --git a/parse.yy b/parse.yy index b55d49e0..933a3455 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,8 @@ struct ForeachDimension { %union { uint64_t intVal; - double floatVal; + float floatVal; + double doubleVal; std::string *stringVal; const char *constCharPtr; @@ -185,7 +186,7 @@ struct ForeachDimension { %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -326,9 +327,13 @@ primary_expression (uint64_t)yylval.intVal, @1); } | TOKEN_FLOAT_CONSTANT { - $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), yylval.floatVal, @1); } + | TOKEN_DOUBLE_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.doubleVal, @1); + } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); } From 9861375f0c1235ea25f68211f3a82f6dcd91874c Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 13 Sep 2013 15:07:14 +0200 Subject: [PATCH 13/14] renamed avx-i64x4 -> avx1-i64x4 --- Makefile | 2 +- builtins.cpp | 4 ++-- builtins/{target-avx-i64x4.ll => target-avx1-i64x4.ll} | 2 +- .../{target-avx-i64x4base.ll => target-avx1-i64x4base.ll} | 0 ispc.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename builtins/{target-avx-i64x4.ll => target-avx1-i64x4.ll} (98%) rename builtins/{target-avx-i64x4base.ll => target-avx1-i64x4base.ll} (100%) diff --git a/Makefile b/Makefile index 92debe4f..097da238 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avx-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) diff --git a/builtins.cpp b/builtins.cpp index f8d4136e..43f68833 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -937,10 +937,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod switch (g->target->getVectorWidth()) { case 4: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_avx_i64x4_32bit); + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); } else { - EXPORT_MODULE(builtins_bitcode_avx_i64x4_64bit); + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); } break; case 8: diff --git 
a/builtins/target-avx-i64x4.ll b/builtins/target-avx1-i64x4.ll similarity index 98% rename from builtins/target-avx-i64x4.ll rename to builtins/target-avx1-i64x4.ll index 65490ea5..d183f1ce 100644 --- a/builtins/target-avx-i64x4.ll +++ b/builtins/target-avx1-i64x4.ll @@ -29,7 +29,7 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -include(`target-avx-i64x4base.ll') +include(`target-avx1-i64x4base.ll') rdrand_decls() diff --git a/builtins/target-avx-i64x4base.ll b/builtins/target-avx1-i64x4base.ll similarity index 100% rename from builtins/target-avx-i64x4base.ll rename to builtins/target-avx1-i64x4base.ll diff --git a/ispc.cpp b/ispc.cpp index 1a99154b..26ca0b39 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -446,7 +446,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx-i64x4") ) { + else if (!strcasecmp(isa, "avx1-i64x4") ) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_vectorWidth = 4; From 36886971e337c555b1b339b862653111f9cf9506 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 13 Sep 2013 16:02:53 +0200 Subject: [PATCH 14/14] revert lex.ll parse.yy stdlib.ispc to state when all constants are floats --- lex.ll | 19 +------------------ parse.yy | 11 +++-------- stdlib.ispc | 12 ------------ 3 files changed, 4 insertions(+), 38 deletions(-) diff --git a/lex.ll b/lex.ll index 7a3db71a..8baa627a 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, @@ -152,7 +152,6 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; - tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; @@ -267,7 +266,6 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; - tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; @@ -343,8 +341,6 @@ inline int ispcRand() { WHITESPACE [ \t\r]+ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. -DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*?)|(\.[0-9]+)))([eE][-+]?[0-9]+)??) -HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+?) FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) 
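With the DOUBLE_NUMBER definitions above removed, every floating-point literal is again rounded to float before use (the rule in the next hunk goes back to (float)atof). A short plain-C++ illustration, not ispc code, of the double-precision accuracy the revert trades away:

    #include <cstdio>

    int main() {
        double viaFloat = (float)0.1; // literal squeezed through float first
        double direct   = 0.1;        // full double-precision literal
        printf("%.17g\n%.17g\n", viaFloat, direct); // values diverge after ~8 digits
        return 0;
    }
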
@@ -442,19 +438,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA } - -{DOUBLE_NUMBER} { - RT; - yylval.doubleVal = atof(yytext); - return TOKEN_DOUBLE_CONSTANT; -} - -{HEX_DOUBLE_NUMBER} { - RT; - yylval.doubleVal = lParseHexFloat(yytext); - return TOKEN_DOUBLE_CONSTANT; -} - {FLOAT_NUMBER} { RT; yylval.floatVal = (float)atof(yytext); diff --git a/parse.yy b/parse.yy index 933a3455..9a2b4fc3 100644 --- a/parse.yy +++ b/parse.yy @@ -149,8 +149,7 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; - double doubleVal; + float floatVal; std::string *stringVal; const char *constCharPtr; @@ -186,7 +185,7 @@ struct ForeachDimension { %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -328,11 +327,7 @@ primary_expression } | TOKEN_FLOAT_CONSTANT { $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - yylval.floatVal, @1); - } - | TOKEN_DOUBLE_CONSTANT { - $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), - yylval.doubleVal, @1); + (float)yylval.floatVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); diff --git a/stdlib.ispc b/stdlib.ispc index 0d5c4efd..6d7ee051 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1559,18 +1559,6 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } -// double - -__declspec(safe,cost2) -static inline double clamp(double v, double low, double high) { - return min(max(v, low), high); -} - -__declspec(safe,cost2) -static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { - return min(max(v, low), high); -} - // int8 __declspec(safe,cost2)
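The double clamp overloads removed above are nothing more than min and max composed, which is why re-adding them once double constants are supported again is mechanical. For reference, a hedged C++ analogue of that composition (hand-rolled here since this code predates std::clamp):

    #include <algorithm>
    #include <cstdio>

    // clamp(v, low, high) == min(max(v, low), high), exactly as in stdlib.ispc.
    template <typename T>
    static T clamp(T v, T low, T high) {
        return std::min(std::max(v, low), high);
    }

    int main() {
        printf("%g %g\n", clamp(2.5, 0.0, 1.0), clamp(-0.5, 0.0, 1.0)); // 1 0
        return 0;
    }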