From e11022c95a67f8158c332f62cf9660429b54a9e4 Mon Sep 17 00:00:00 2001
From: Andrey Shishpanov
Date: Mon, 27 Jul 2015 17:18:25 +0300
Subject: [PATCH] replaced gathers for i32

---
 builtins/target-avx512-common.ll | 157 ++++++++++++++++++++++++++++++-
 1 file changed, 156 insertions(+), 1 deletion(-)

diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll
index 28cd0287..f6e00c8e 100644
--- a/builtins/target-avx512-common.ll
+++ b/builtins/target-avx512-common.ll
@@ -995,7 +995,162 @@ define void @__scatter_base_offsets64_$1(i8* %ptr, i32 %scale, %of
 gen_gather(i8)
 gen_gather(i16)
-gen_gather(i32)
+
+;; Define the utility function to do the gather operation for a single element
+;; of type i32
+define <16 x i32> @__gather_elt32_i32(i8 * %ptr, <16 x i32> %offsets, i32 %offset_scale,
+                                      <16 x i32> %offset_delta, <16 x i32> %ret,
+                                      i32 %lane) nounwind readonly alwaysinline {
+  ; compute address for this one from the base
+  %offset32 = extractelement <16 x i32> %offsets, i32 %lane
+  ; the order and details of the next 4 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %offset64 = sext i32 %offset32 to i64
+  %scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %scale64
+  %ptroffset = getelementptr PTR_OP_ARGS(`i8') %ptr, i64 %offset
+
+  %delta = extractelement <16 x i32> %offset_delta, i32 %lane
+  %delta64 = sext i32 %delta to i64
+  %finalptr = getelementptr PTR_OP_ARGS(`i8') %ptroffset, i64 %delta64
+
+  ; load value and insert into returned value
+  %ptrcast = bitcast i8 * %finalptr to i32 *
+  %val = load PTR_OP_ARGS(`i32 ') %ptrcast
+  %updatedret = insertelement <16 x i32> %ret, i32 %val, i32 %lane
+  ret <16 x i32> %updatedret
+}
+
+define <16 x i32> @__gather_elt64_i32(i8 * %ptr, <16 x i64> %offsets, i32 %offset_scale,
+                                      <16 x i64> %offset_delta, <16 x i32> %ret,
+                                      i32 %lane) nounwind readonly alwaysinline {
+  ; compute address for this one from the base
+  %offset64 = extractelement <16 x i64> %offsets, i32 %lane
+  ; the order and details of the next 3 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %offset_scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %offset_scale64
+  %ptroffset = getelementptr PTR_OP_ARGS(`i8') %ptr, i64 %offset
+
+  %delta64 = extractelement <16 x i64> %offset_delta, i32 %lane
+  %finalptr = getelementptr PTR_OP_ARGS(`i8') %ptroffset, i64 %delta64
+
+  ; load value and insert into returned value
+  %ptrcast = bitcast i8 * %finalptr to i32 *
+  %val = load PTR_OP_ARGS(`i32 ') %ptrcast
+  %updatedret = insertelement <16 x i32> %ret, i32 %val, i32 %lane
+  ret <16 x i32> %updatedret
+}
+
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, i16, i32)
+define <16 x i32> @__gather_factored_base_offsets32_i32(i8 * %ptr, <16 x i32> %offsets, i32 %offset_scale,
+                                                        <16 x i32> %offset_delta,
+                                                        <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %mask = bitcast <16 x i1> %vecmask to i16
+  %scaleVecPtr = alloca <16 x i32>
+  store <16 x i32> zeroinitializer, <16 x i32> * %scaleVecPtr
+  %scaleVecZero = load PTR_OP_ARGS(`<16 x i32>') %scaleVecPtr
+  %scaleVec1 = insertelement <16 x i32> %scaleVecZero, i32 %offset_scale, i32 0
+  %scaleVec = shufflevector <16 x i32> %scaleVec1, <16 x i32> undef, <16 x i32> zeroinitializer
+  %offsetsScaled = mul <16 x i32> %offsets, %scaleVec
+  %offsetFinal = add <16 x i32> %offsetsScaled, %offset_delta
+  %res = call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> zeroinitializer, i8* %ptr, <16 x i32> %offsetFinal, i16 %mask, i32 1)
+  ret <16 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32>, i8*, <8 x i64>, i8, i32)
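+
+;; No AVX-512 gather intrinsic takes 16 i64 indices and returns 16 i32 values,
+;; so the 64-bit-offset variants below are built from two 8-wide gathers
+;; (llvm.x86.avx512.gather.qpi.512, i.e. vpgatherqd).  The <16 x i1> execution
+;; mask is widened to <16 x i32>, split into two <8 x i32> halves, truncated
+;; back to <8 x i1>, and bitcast to the two i8 masks the intrinsic expects.
+;; The <16 x i64> offsets are split the same way by bitcasting to <32 x i32>:
+;; on a little-endian target, i64 lane k occupies i32 lanes 2k and 2k+1, so
+;; i32 lanes 0-15 hold the low eight i64s and lanes 16-31 hold the high eight.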
+define <16 x i32> @__gather_factored_base_offsets64_i32(i8 * %ptr, <16 x i64> %offsets, i32 %offset_scale,
+                                                        <16 x i64> %offset_delta,
+                                                        <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %extVecMask = zext <16 x i1> %vecmask to <16 x i32>
+  %maskVec1 = shufflevector <16 x i32> %extVecMask, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %maskVec2 = shufflevector <16 x i32> %extVecMask, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %maskVec1Small = trunc <8 x i32> %maskVec1 to <8 x i1>
+  %maskVec2Small = trunc <8 x i32> %maskVec2 to <8 x i1>
+  %mask1 = bitcast <8 x i1> %maskVec1Small to i8
+  %mask2 = bitcast <8 x i1> %maskVec2Small to i8
+  %scaleVecPtr = alloca <16 x i32>
+  store <16 x i32> zeroinitializer, <16 x i32> * %scaleVecPtr
+  %scaleVecZero = load PTR_OP_ARGS(`<16 x i32>') %scaleVecPtr
+  %scaleVec0 = insertelement <16 x i32> %scaleVecZero, i32 %offset_scale, i32 0
+  %scaleVec = shufflevector <16 x i32> %scaleVec0, <16 x i32> undef, <16 x i32> zeroinitializer
+  %scaleVec_64 = zext <16 x i32> %scaleVec to <16 x i64>
+  %offsetsScaled = mul <16 x i64> %offsets, %scaleVec_64
+  %offsetsFinal = add <16 x i64> %offsetsScaled, %offset_delta
+  %ext = bitcast <16 x i64> %offsetsFinal to <32 x i32>
+  %reduced1 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %offsetsFinal1 = bitcast <16 x i32> %reduced1 to <8 x i64>
+  %reduced2 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %offsetsFinal2 = bitcast <16 x i32> %reduced2 to <8 x i64>
+  %res1 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> zeroinitializer, i8* %ptr, <8 x i64> %offsetsFinal1, i8 %mask1, i32 1)
+  %res2 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> zeroinitializer, i8* %ptr, <8 x i64> %offsetsFinal2, i8 %mask2, i32 1)
+  %res = shufflevector <8 x i32> %res1, <8 x i32> %res2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32> %res
+}
+
+; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
+define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
+                                  <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %mask = bitcast <16 x i1> %vecmask to i16
+  %ret = call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> zeroinitializer, i8* zeroinitializer, <16 x i32> %ptrs, i16 %mask, i32 1)
+  ret <16 x i32> %ret
+}
+
+; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
+define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
+                                  <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %extVecMask = zext <16 x i1> %vecmask to <16 x i32>
+  %maskVec1 = shufflevector <16 x i32> %extVecMask, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %maskVec2 = shufflevector <16 x i32> %extVecMask, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %maskVec1Small = trunc <8 x i32> %maskVec1 to <8 x i1>
+  %maskVec2Small = trunc <8 x i32> %maskVec2 to <8 x i1>
+  %mask1 = bitcast <8 x i1> %maskVec1Small to i8
+  %mask2 = bitcast <8 x i1> %maskVec2Small to i8
+  %ext = bitcast <16 x i64> %ptrs to <32 x i32>
+  %reduced1 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %offsetsFinal1 = bitcast <16 x i32> %reduced1 to <8 x i64>
+  %reduced2 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %offsetsFinal2 = bitcast <16 x i32> %reduced2 to <8 x i64>
+  %res1 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> zeroinitializer, i8* zeroinitializer, <8 x i64> %offsetsFinal1, i8 %mask1, i32 1)
+  %res2 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> zeroinitializer, i8* zeroinitializer, <8 x i64> %offsetsFinal2, i8 %mask2, i32 1)
+  %res = shufflevector <8 x i32> %res1, <8 x i32> %res2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32> %res
+}
+
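+;; The non-factored __gather_base_offsets variants below do not pre-multiply
+;; the offsets; they forward %offset_scale directly as the intrinsic's scale
+;; operand.  The hardware only encodes scales of 1, 2, 4, and 8, so this
+;; assumes the compiler always calls these functions with one of those values
+;; as a constant that folds to an immediate once the call is inlined.  The
+;; zeroinitializer pass-through source means masked-off lanes return 0.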
+define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %offset_scale, <16 x i32> %offsets,
+                                               <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %src = alloca <16 x i32>
+  store <16 x i32> zeroinitializer, <16 x i32> * %src
+  %vecSrc = load PTR_OP_ARGS(`<16 x i32>') %src
+  %scalarMask = bitcast <16 x i1> %vecmask to i16
+  %res = call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> %vecSrc, i8* %ptr, <16 x i32> %offsets,
+                                                         i16 %scalarMask, i32 %offset_scale)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr, i32 %offset_scale, <16 x i64> %offsets,
+                                               <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %src1 = alloca <8 x i32>
+  %src2 = alloca <8 x i32>
+  store <8 x i32> zeroinitializer, <8 x i32> * %src1
+  store <8 x i32> zeroinitializer, <8 x i32> * %src2
+  %vecSrc1 = load PTR_OP_ARGS(`<8 x i32>') %src1
+  %vecSrc2 = load PTR_OP_ARGS(`<8 x i32>') %src2
+  %scalarMask = bitcast <16 x i1> %vecmask to i16
+  %scalarMask1 = trunc i16 %scalarMask to i8
+  %scalarMask2Tmp = lshr i16 %scalarMask, 8
+  %scalarMask2 = trunc i16 %scalarMask2Tmp to i8
+
+  %ext = bitcast <16 x i64> %offsets to <32 x i32>
+  %reduced1 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %offsets1 = bitcast <16 x i32> %reduced1 to <8 x i64>
+  %reduced2 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %offsets2 = bitcast <16 x i32> %reduced2 to <8 x i64>
+  %res1 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> %vecSrc1, i8* %ptr, <8 x i64> %offsets1, i8 %scalarMask1, i32 %offset_scale)
+  %res2 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> %vecSrc2, i8* %ptr, <8 x i64> %offsets2, i8 %scalarMask2, i32 %offset_scale)
+  %res = shufflevector <8 x i32> %res1, <8 x i32> %res2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32> %res
+}
+
 gen_gather(i64)
 gen_gather(float)
 gen_gather(double)