From 7e9b4c0924a884d05182d6d6416dd24e697d9a96 Mon Sep 17 00:00:00 2001 From: egaburov Date: Tue, 15 Oct 2013 10:02:10 +0200 Subject: [PATCH] added avx2-i64x4 and avx1.1-i64x4 targets --- Makefile | 2 +- builtins.cpp | 16 ++ builtins/target-avx11-i64x4.ll | 126 +++++++++++ builtins/target-avx2-i64x4.ll | 369 +++++++++++++++++++++++++++++++++ ispc.cpp | 46 +++- 5 files changed, 556 insertions(+), 3 deletions(-) create mode 100644 builtins/target-avx11-i64x4.ll create mode 100644 builtins/target-avx2-i64x4.ll diff --git a/Makefile b/Makefile index 10d51bd5..9d39baa4 100644 --- a/Makefile +++ b/Makefile @@ -140,7 +140,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) diff --git a/builtins.cpp b/builtins.cpp index 43f68833..af9649b7 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -966,6 +966,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX11: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx11_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx11_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx11_32bit); @@ -989,6 +997,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX2: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx2_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx2_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx2_32bit); diff --git a/builtins/target-avx11-i64x4.ll b/builtins/target-avx11-i64x4.ll new file mode 100644 index 00000000..aae612bb --- /dev/null +++ b/builtins/target-avx11-i64x4.ll @@ -0,0 +1,126 @@ +;; Copyright (c) 2012, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include(`target-avx1-i64x4base.ll') + +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float/half conversions + +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', `
+
+define(`expand_4to8', `
+  %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+')
+define(`extract_4from8', `
+  %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone {
+  expand_4to8(i16, v4, v)
+  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  extract_4from8(float, r, ret)
+  ret <4 x float> %ret
+}
+
+define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone {
+  expand_4to8(float, v4, v)
+  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  extract_4from8(i16, r, ret)
+  ret <4 x i16> %ret
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+')
diff --git a/builtins/target-avx2-i64x4.ll b/builtins/target-avx2-i64x4.ll
new file mode 100644
index 00000000..cdd10386
--- /dev/null
+++ b/builtins/target-avx2-i64x4.ll
@@ -0,0 +1,369 @@
+;; Copyright (c) 2010-2012, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `',
+       LLVM_VERSION, `LLVM_3_1', `',
+       `define(`HAVE_GATHER', `1')')
+
+include(`target-avx1-i64x4base.ll')
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
+       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
+       `rdrand_definition()')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+;; declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+;; declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readonly
+
+define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+;; declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readonly
+;; declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readonly
+
+define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+;; nothing to define...
+', `
+
+define(`expand_4to8', `
+  %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+')
+define(`extract_4from8', `
+  %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone {
+  expand_4to8(i16, v4, v)
+  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  extract_4from8(float, r, ret)
+  ret <4 x float> %ret
+}
+
+define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone {
+  expand_4to8(float, v4, v)
+  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  extract_4from8(i16, r, ret)
+  ret <4 x i16> %ret
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+declare void @llvm.trap() noreturn nounwind
+
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + +gen_gather(i8) +gen_gather(i16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + +declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <4 x i32> @__gather_base_offsets32_i32(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x i32> %vecmask, i8 %scale8) + ret <4 x i32> %v +} + + +define <4 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x i32> %vecmask, i8 %scale8) + + ret <4 x i32> %v +} + + +define <4 x i32> @__gather32_i32(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * null, + <4 x i32> %ptrs, <4 x i32> %vecmask, i8 1) + + ret <4 x i32> %v +} + + +define <4 x i32> @__gather64_i32(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs, <4 x i32> %vecmask, i8 1) + + ret <4 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %target, i8 * %ptr, + <4 x i32> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <4 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x float> %mask, i8 %scale8) + + ret <4 x float> %v +} + + +define <4 x float> @__gather_base_offsets64_float(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x float> %mask, i8 %scale8) + + ret <4 x float> %v +} + + +define <4 x float> @__gather32_float(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 
to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * null, + <4 x i32> %ptrs, <4 x float> %mask, i8 1) + + ret <4 x float> %v +} + + +define <4 x float> @__gather64_float(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs, <4 x float> %mask, i8 1) + + ret <4 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <4 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x i64> %vecmask, i8 %scale8) + + ret <4 x i64> %v +} + + +define <4 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x i64> %vecmask, i8 %scale8) + + ret <4 x i64> %v +} + + +define <4 x i64> @__gather32_i64(<4 x i32> %ptrs, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + + %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs, <4 x i64> %vecmask, i8 1) + ret <4 x i64> %v +} + + +define <4 x i64> @__gather64_i64(<4 x i64> %ptrs, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs, <4 x i64> %vecmask, i8 1) + ret <4 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <4 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x double> %vecmask, i8 %scale8) + ret <4 x double> %v +} + +define <4 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x double> %vecmask, i8 %scale8) + + ret <4 x double> %v +} + +define <4 x double> @__gather32_double(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly 
alwaysinline {
+  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>
+
+  %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
+                      <4 x i32> %ptrs, <4 x double> %vecmask, i8 1)
+
+  ret <4 x double> %v
+}
+
+define <4 x double> @__gather64_double(<4 x i64> %ptrs,
+                     <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>
+
+  %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
+                      <4 x i64> %ptrs, <4 x double> %vecmask, i8 1)
+
+  ret <4 x double> %v
+}
+
+')
diff --git a/ispc.cpp b/ispc.cpp
index 41adffe4..db4c161a 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -507,6 +507,25 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
 #if !defined(LLVM_3_1)
         // LLVM 3.2+ only
         this->m_hasRand = true;
+#endif
+    }
+    else if (!strcasecmp(isa, "avx1.1-i64x4")) {
+        this->m_isa = Target::AVX11;
+        this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
+        this->m_vectorWidth = 4;
+        this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
+#if defined(LLVM_3_4)
+            ",+rdrnd"
+#else
+            ",+rdrand"
+#endif
+            ;
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 64;
+        this->m_hasHalf = true;
+#if !defined(LLVM_3_1)
+        // LLVM 3.2+ only
+        this->m_hasRand = true;
 #endif
     }
     else if (!strcasecmp(isa, "avx2") ||
@@ -555,6 +574,29 @@
         // LLVM 3.2+ only
         this->m_hasRand = true;
         this->m_hasGather = true;
+#endif
+    }
+    else if (!strcasecmp(isa, "avx2-i64x4")) {
+        this->m_isa = Target::AVX2;
+        this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
+        this->m_vectorWidth = 4;
+        this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
+#if defined(LLVM_3_4)
+            ",+rdrnd"
+#else
+            ",+rdrand"
+#endif
+#ifndef LLVM_3_1
+            ",+fma"
+#endif // !LLVM_3_1
+            ;
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 64;
+        this->m_hasHalf = true;
+#if !defined(LLVM_3_1)
+        // LLVM 3.2+ only
+        this->m_hasRand = true;
+        this->m_hasGather = true;
 #endif
     }
 #ifdef ISPC_ARM_ENABLED
@@ -715,8 +757,8 @@ Target::SupportedTargets() {
         "sse2-i32x4, sse2-i32x8, "
         "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, "
         "avx1-i32x8, avx1-i32x16, avx1-i64x4, "
-        "avx1.1-i32x8, avx1.1-i32x16, "
-        "avx2-i32x8, avx2-i32x16, "
+        "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4, "
+        "avx2-i32x8, avx2-i32x16, avx2-i64x4, "
         "generic-x1, generic-x4, generic-x8, generic-x16, "
        "generic-x32, generic-x64";
 }
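
Usage sketch (not part of the patch): once ispc is rebuilt with these bitcode files, the new 4-wide, 64-bit-mask code paths are selected through the existing --target flag, using the names this patch adds to Target::SupportedTargets(). The kernel and output file names below are illustrative only.

    $ ispc --target=avx2-i64x4 kernel.ispc -o kernel_avx2.o -h kernel_avx2.h
    $ ispc --target=avx1.1-i64x4 kernel.ispc -o kernel_avx11.o -h kernel_avx11.h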