diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll
index f6e00c8e..38256794 100644
--- a/builtins/target-avx512-common.ll
+++ b/builtins/target-avx512-common.ll
@@ -996,165 +996,85 @@ define void @__scatter_base_offsets64_$1(i8* %ptr, i32 %scale, %of
 gen_gather(i8)
 gen_gather(i16)
-;; Define the utility function to do the gather operation for a single element
-;; of the type
-define <16 x i32> @__gather_elt32_i32(i8 * %ptr, <16 x i32> %offsets, i32 %offset_scale,
-                                      <16 x i32> %offset_delta, <16 x i32> %ret,
-                                      i32 %lane) nounwind readonly alwaysinline {
-  ; compute address for this one from the base
-  %offset32 = extractelement <16 x i32> %offsets, i32 %lane
-  ; the order and details of the next 4 lines are important--they match LLVMs
-  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
-  %offset64 = sext i32 %offset32 to i64
-  %scale64 = sext i32 %offset_scale to i64
-  %offset = mul i64 %offset64, %scale64
-  %ptroffset = getelementptr PTR_OP_ARGS(`i8') %ptr, i64 %offset
-
-  %delta = extractelement <16 x i32> %offset_delta, i32 %lane
-  %delta64 = sext i32 %delta to i64
-  %finalptr = getelementptr PTR_OP_ARGS(`i8') %ptroffset, i64 %delta64
-
-  ; load value and insert into returned value
-  %ptrcast = bitcast i8 * %finalptr to i32 *
-  %val = load PTR_OP_ARGS(`i32 ') %ptrcast
-  %updatedret = insertelement <16 x i32> %ret, i32 %val, i32 %lane
-  ret <16 x i32> %updatedret
-}
-
-define <16 x i32> @__gather_elt64_i32(i8 * %ptr, <16 x i64> %offsets, i32 %offset_scale,
-                                      <16 x i64> %offset_delta, <16 x i32> %ret,
-                                      i32 %lane) nounwind readonly alwaysinline {
-  ; compute address for this one from the base
-  %offset64 = extractelement <16 x i64> %offsets, i32 %lane
-  ; the order and details of the next 4 lines are important--they match LLVMs
-  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
-  %offset_scale64 = sext i32 %offset_scale to i64
-  %offset = mul i64 %offset64, %offset_scale64
-  %ptroffset = getelementptr PTR_OP_ARGS(`i8') %ptr, i64 %offset
-
-  %delta64 = extractelement <16 x i64> %offset_delta, i32 %lane
-  %finalptr = getelementptr PTR_OP_ARGS(`i8') %ptroffset, i64 %delta64
-
-  ; load value and insert into returned value
-  %ptrcast = bitcast i8 * %finalptr to i32 *
-  %val = load PTR_OP_ARGS(`i32 ') %ptrcast
-  %updatedret = insertelement <16 x i32> %ret, i32 %val, i32 %lane
-  ret <16 x i32> %updatedret
-}
-
+;; gather - i32
 declare <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, i16, i32)
-define <16 x i32> @__gather_factored_base_offsets32_i32(i8 * %ptr, <16 x i32> %offsets, i32 %offset_scale,
-                                                         <16 x i32> %offset_delta,
-                                                         <16 x i1> %vecmask) nounwind readonly alwaysinline {
+define <16 x i32>
+@__gather_base_offsets32_i32(i8 * %ptr, i32 %offset_scale, <16 x i32> %offsets, <16 x i1> %vecmask) nounwind readonly alwaysinline {
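+  ; the hardware gather applies %offset_scale and the mask directly, so the offsets need no manual scaling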
   %mask = bitcast <16 x i1> %vecmask to i16
-  %scaleVecPtr = alloca <16 x i32>
-  store <16 x i32> zeroinitializer , <16 x i32> * %scaleVecPtr
-  %scaleVecZero = load PTR_OP_ARGS(`<16 x i32>') %scaleVecPtr
-  %scaleVec1 = insertelement <16 x i32> %scaleVecZero, i32 %offset_scale, i32 0
-  %scaleVec = shufflevector <16 x i32> %scaleVec1, <16 x i32> undef, <16 x i32> zeroinitializer
-  %offsetsScaled = mul <16 x i32> %offsets, %scaleVec
-  %offsetFinal = add <16 x i32> %offsetsScaled, %offset_delta
-  %res = call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> zeroinitializer, i8* %ptr, <16 x i32>%offsetFinal, i16 %mask, i32 1)
+  %res = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> undef, i8* %ptr, <16 x i32> %offsets, i16 %mask, i32 %offset_scale)
   ret <16 x i32> %res
 }
+
 declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
-define <16 x i32> @__gather_factored_base_offsets64_i32(i8 * %ptr, <16 x i64> %offsets, i32 %offset_scale,
-                                                         <16 x i64> %offset_delta,
-                                                         <16 x i1> %vecmask) nounwind readonly alwaysinline {
-  %extVecMask = zext <16 x i1> %vecmask to <16 x i32>
-  %maskVec1 = shufflevector <16 x i32> %extVecMask, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %maskVec2 = shufflevector <16 x i32> %extVecMask, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %maskVec1Small = trunc <8 x i32> %maskVec1 to <8 x i1>
-  %maskVec2Small = trunc <8 x i32> %maskVec2 to <8 x i1>
-  %mask1 = bitcast <8 x i1> %maskVec1Small to i8
-  %mask2 = bitcast <8 x i1> %maskVec2Small to i8
-  %scaleVecPtr = alloca <16 x i32>
-  store <16 x i32> zeroinitializer , <16 x i32> * %scaleVecPtr
-  %scaleVecZero = load PTR_OP_ARGS(`<16 x i32>') %scaleVecPtr
-  %scaleVec0 = insertelement <16 x i32> %scaleVecZero, i32 %offset_scale, i32 0
-  %scaleVec = shufflevector <16 x i32> %scaleVec0, <16 x i32> undef, <16 x i32> zeroinitializer
-  %scaleVec_64 = zext <16 x i32> %scaleVec to <16 x i64>
-  %offsetsScaled = mul <16 x i64> %offsets, %scaleVec_64
-  %offsetsFinal = add <16 x i64> %offsetsScaled, %offset_delta
-  %ext = bitcast <16 x i64> %offsetsFinal to <32 x i32>
-  %reduced1 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %offsetsFinal1 = bitcast <16 x i32> %reduced1 to <8 x i64>
-  %reduced2 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %offsetsFinal2 = bitcast <16 x i32> %reduced2 to <8 x i64>
-  %res1 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> zeroinitializer, i8* %ptr, <8 x i64> %offsetsFinal1, i8 %mask1, i32 1)
-  %res2 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> zeroinitializer, i8* %ptr, <8 x i64> %offsetsFinal2, i8 %mask2, i32 1)
-  %res = shufflevector <8 x i32> %res1, <8 x i32> %res2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i32> %res
-}
-; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
-define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
-                                  <16 x i1> %vecmask) nounwind readonly alwaysinline {
-  %mask = bitcast <16 x i1> %vecmask to i16
-  %ret = call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> zeroinitializer, i8* zeroinitializer, <16 x i32>%ptrs, i16 %mask, i32 1)
-  ret <16 x i32> %ret
-}
-
-; fully general 64-bit gather, takes array of pointers encoded as vector of i32s
-define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
-                                  <16 x i1> %vecmask) nounwind readonly alwaysinline {
-  %extVecMask = zext <16 x i1> %vecmask to <16 x i32>
-  %maskVec1 = shufflevector <16 x i32> %extVecMask, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %maskVec2 = shufflevector <16 x i32> %extVecMask, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %maskVec1Small = trunc <8 x i32> %maskVec1 to <8 x i1>
-  %maskVec2Small = trunc <8 x i32> %maskVec2 to <8 x i1>
-  %mask1 = bitcast <8 x i1> %maskVec1Small to i8
-  %mask2 = bitcast <8 x i1> %maskVec2Small to i8
-  %ext = bitcast <16 x i64> %ptrs to <32 x i32>
-  %reduced1 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %offsetsFinal1 = bitcast <16 x i32> %reduced1 to <8 x i64>
-  %reduced2 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %offsetsFinal2 = bitcast <16 x i32> %reduced2 to <8 x i64>
-  %res1 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> zeroinitializer, i8* zeroinitializer, <8 x i64> %offsetsFinal1, i8 %mask1, i32 1)
-  %res2 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> zeroinitializer, i8* zeroinitializer, <8 x i64> %offsetsFinal2, i8 %mask2, i32 1)
-  %res = shufflevector <8 x i32> %res1, <8 x i32> %res2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i32> %res
-}
-
-
-
-define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %offset_scale, <16 x i32> %offsets, <16 x i1> %vecmask) nounwind readonly alwaysinline {
-  %src = alloca <16 x i32>
-  store <16 x i32> zeroinitializer, <16 x i32> * %src
-  %vecSrc = load <16 x i32>, <16 x i32> * %src
-  %scalarMask = bitcast <16 x i1> %vecmask to i16
-  %res = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %vecSrc, i8* %ptr, <16 x i32> %offsets, i16
-                                                          %scalarMask, i32 %offset_scale)
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr, i32 %offset_scale, <16 x i64> %offsets, <16 x i1> %vecmask) nounwind readonly alwaysinline {
-  %src1 = alloca <8 x i32>
-  %src2 = alloca <8 x i32>
-  store <8 x i32> zeroinitializer, <8 x i32> * %src1
-  store <8 x i32> zeroinitializer, <8 x i32> * %src2
-  %vecSrc1 = load <8 x i32>, <8 x i32> * %src1
-  %vecSrc2 = load <8 x i32>, <8 x i32> * %src2
+define <16 x i32>
+@__gather_base_offsets64_i32(i8 * %ptr, i32 %offset_scale, <16 x i64> %offsets, <16 x i1> %vecmask) nounwind readonly alwaysinline {
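+  ; split the 16-wide gather into two 8-wide ones: low/high halves of the mask and of the 64-bit offsets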
   %scalarMask = bitcast <16 x i1> %vecmask to i16
   %scalarMask1 = trunc i16 %scalarMask to i8
   %scalarMask2Tmp = lshr i16 %scalarMask, 8
   %scalarMask2 = trunc i16 %scalarMask2Tmp to i8
-
-  %ext = bitcast <16 x i64> %offsets to <32 x i32>
-  %reduced1 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %offsets1 = bitcast <16 x i32> %reduced1 to <8 x i64>
-  %reduced2 = shufflevector <32 x i32> %ext, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %offsets2 = bitcast <16 x i32> %reduced2 to <8 x i64>
-  %res1 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %vecSrc1, i8* %ptr, <8 x i64> %offsets1, i8 %scalarMask1, i32 %offset_scale)
-  %res2 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %vecSrc1, i8* %ptr, <8 x i64> %offsets1, i8 %scalarMask1, i32 %offset_scale)
+  %offsets_lo = shufflevector <16 x i64> %offsets, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %offsets_hi = shufflevector <16 x i64> %offsets, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %res1 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> undef, i8* %ptr, <8 x i64> %offsets_lo, i8 %scalarMask1, i32 %offset_scale)
+  %res2 = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> undef, i8* %ptr, <8 x i64> %offsets_hi, i8 %scalarMask2, i32 %offset_scale)
   %res = shufflevector <8 x i32> %res1, <8 x i32> %res2 , <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i32> %res
 }
+define <16 x i32>
+@__gather32_i32(<16 x i32> %ptrs, <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %res = call <16 x i32> @__gather_base_offsets32_i32(i8 * zeroinitializer, i32 1, <16 x i32> %ptrs, <16 x i1> %vecmask)
+  ret <16 x i32> %res
+}
+define <16 x i32>
+@__gather64_i32(<16 x i64> %ptrs, <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %res = call <16 x i32> @__gather_base_offsets64_i32(i8 * zeroinitializer, i32 1, <16 x i64> %ptrs, <16 x i1> %vecmask)
+  ret <16 x i32> %res
+}
+
+
+;; gather - i64
 gen_gather(i64)
-gen_gather(float)
+
+
+;; gather - float
+declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
+define <16 x float>
+@__gather_base_offsets32_float(i8 * %ptr, i32 %offset_scale, <16 x i32> %offsets, <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %mask = bitcast <16 x i1> %vecmask to i16
+  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> undef, i8* %ptr, <16 x i32>%offsets, i16 %mask, i32 %offset_scale)
+  ret <16 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
+define <16 x float>
+@__gather_base_offsets64_float(i8 * %ptr, i32 %offset_scale, <16 x i64> %offsets, <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %mask = bitcast <16 x i1> %vecmask to i16
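+  ; same low/high split as in __gather_base_offsets64_i32, using the qps gather for float results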
+  %mask_shifted = lshr i16 %mask, 8
+  %mask_lo = trunc i16 %mask to i8
+  %mask_hi = trunc i16 %mask_shifted to i8
+  %offsets_lo = shufflevector <16 x i64> %offsets, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %offsets_hi = shufflevector <16 x i64> %offsets, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %res_lo = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> undef, i8* %ptr, <8 x i64> %offsets_lo, i8 %mask_lo, i32 %offset_scale)
+  %res_hi = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> undef, i8* %ptr, <8 x i64> %offsets_hi, i8 %mask_hi, i32 %offset_scale)
+  %res = shufflevector <8 x float> %res_lo, <8 x float> %res_hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %res
+}
+
+define <16 x float> @__gather32_float(<16 x i32> %ptrs, <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %res = call <16 x float> @__gather_base_offsets32_float(i8 * zeroinitializer, i32 1, <16 x i32> %ptrs, <16 x i1> %vecmask)
+  ret <16 x float> %res
+}
+
+define <16 x float> @__gather64_float(<16 x i64> %ptrs, <16 x i1> %vecmask) nounwind readonly alwaysinline {
+  %res = call <16 x float> @__gather_base_offsets64_float(i8 * zeroinitializer, i32 1, <16 x i64> %ptrs, <16 x i1> %vecmask)
+  ret <16 x float> %res
+}
+
+;; gather - double
 gen_gather(double)
+
 scatterbo32_64(i8)
 scatterbo32_64(i16)
 scatterbo32_64(i32)