diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 533def68..6e280ba6 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -289,18 +289,18 @@ declare void @__masked_store_blend_64(* nocapture, , define(`gather_scatter', ` declare @__gather_base_offsets32_$1(i8 * nocapture, , - i32, ) nounwind readonly + i32, , ) nounwind readonly declare @__gather_base_offsets64_$1(i8 * nocapture, , - i32, ) nounwind readonly + i32, , ) nounwind readonly declare @__gather32_$1(, ) nounwind readonly declare @__gather64_$1(, ) nounwind readonly declare void @__scatter_base_offsets32_$1(i8* nocapture, , - i32, , ) nounwind + i32, , , ) nounwind declare void @__scatter_base_offsets64_$1(i8* nocapture, , - i32, , ) nounwind + i32, , , ) nounwind declare void @__scatter32_$1(, , ) nounwind declare void @__scatter64_$1(, , diff --git a/builtins/util.m4 b/builtins/util.m4 index 64e3a130..36882491 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1565,17 +1565,15 @@ declare void @__pseudo_masked_store_64( * nocapture, , ; these represent gathers from a common base pointer with offsets. The ; offset_scale factor scales the offsets before they are added to the base ; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.) -; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling -; available in x86 addressing calculations... +; Then, the offset delta_value (guaranteed to be a compile-time constant value), +; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns +; that use the free 2/4/8 scaling available in x86 addressing calculations, and +; offset_delta feeds into the free offset calculation. ; -; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base, -; int{32,64} offsets, int32 offset_scale, mask) +; varying int{8,16,32,64} +; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base, +; int{32,64} offsets, uniform int32 offset_scale, +; int{32,64} offset_delta, mask) ; ; Then, the GSImprovementsPass optimizations finds these and either ; converts them to native gather functions or converts them to vector @@ -1591,22 +1589,22 @@ declare @__pseudo_gather64_16(, ) nounw declare @__pseudo_gather64_32(, ) nounwind readonly declare @__pseudo_gather64_64(, ) nounwind readonly -declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, +declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, , + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, +declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, +declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, +declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_16(i8 *, , i32, +declare @__pseudo_gather_base_offsets64_16(i8 *, 
, i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, +declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, +declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, , ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined @@ -1621,13 +1619,9 @@ declare @__pseudo_gather_base_offsets64_64(i8 *, , i3 ; transforms them to scatters like: ; ; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base, -; varying int32 offsets, int32 offset_scale, varying int8 values, mask) -; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base, -; varying int32 offsets, int32 offset_scale, varying int16 values, mask) -; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base, -; varying int32 offsets, int32 offset_scale, varying int32 values, mask) -; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base, -; varying int32 offsets, int32 offset_scale, varying int64 values, mask) +; varying int32 offsets, uniform int32 offset_scale, +; varying int{32,64} offset_delta, varying int8 values, mask) +; (and similarly for 16/32/64 bit values) ; ; And the GSImprovementsPass in turn converts these to actual native ; scatters or masked stores. @@ -1642,22 +1636,22 @@ declare void @__pseudo_scatter64_16(, , declare void @__pseudo_scatter64_32(, , ) nounwind declare void @__pseudo_scatter64_64(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, , , ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2701,7 +2695,8 @@ define(`gen_gather', ` ;; Define the utility function to do the gather operation for a single element ;; of the type define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline { + <$1 x i32> %offset_delta, <$1 x $2> %ret, + i32 %lane) nounwind readonly alwaysinline { ; compute address for this one from the base %offset32 = extractelement <$1 x i32> %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs @@ -2711,15 +2706,20 @@ define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_ %offset = mul i64 %offset64, %scale64 
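  ; The effective address for this lane is
  ;   %ptr + %offset_scale * %offsets[%lane] + %offset_delta[%lane].
  ; The scaled term is applied by the getelementptr just below and the
  ; per-lane delta by a second getelementptr after it; the sext/mul/gep
  ; sequence is kept in exactly this shape so that LLVM's pattern matching
  ; can fold the scale and a constant delta into a single x86 address,
  ; e.g. base + 4*index + 16 (the 16 is just an illustrative constant).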
%ptroffset = getelementptr i8 * %ptr, i64 %offset + %delta = extractelement <$1 x i32> %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + ; load value and insert into returned value - %ptrcast = bitcast i8 * %ptroffset to $2 * + %ptrcast = bitcast i8 * %finalptr to $2 * %val = load $2 *%ptrcast %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane ret <$1 x $2> %updatedret } define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline { + <$1 x i64> %offset_delta, <$1 x $2> %ret, + i32 %lane) nounwind readonly alwaysinline { ; compute address for this one from the base %offset64 = extractelement <$1 x i64> %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs @@ -2728,8 +2728,11 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_ %offset = mul i64 %offset64, %offset_scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset + %delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + ; load value and insert into returned value - %ptrcast = bitcast i8 * %ptroffset to $2 * + %ptrcast = bitcast i8 * %finalptr to $2 * %val = load $2 *%ptrcast %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane ret <$1 x $2> %updatedret @@ -2737,6 +2740,7 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale, + <$1 x i32> %offset_delta, <$1 x i32> %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always @@ -2749,16 +2753,25 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 <$1 x i32> %vecmask) %newOffsets = load <$1 x i32> * %offsetsPtr + %deltaPtr = alloca <$1 x i32> + store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr + call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta, + <$1 x i32> %vecmask) + %newDelta = load <$1 x i32> * %deltaPtr + %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets, - i32 %offset_scale, <$1 x $2> undef, i32 0) + i32 %offset_scale, <$1 x i32> %offset_delta, + <$1 x $2> undef, i32 0) forloop(lane, 1, eval($1-1), `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, - <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE) + <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %offset_delta, + <$1 x $2> %retPREV, i32 LANE) ', `LANE', lane), `PREV', eval(lane-1))') ret <$1 x $2> %ret`'eval($1-1) } define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale, + <$1 x i64> %offset_delta, <$1 x i32> %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always @@ -2771,11 +2784,19 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 <$1 x i32> %vecmask) %newOffsets = load <$1 x i64> * %offsetsPtr + %deltaPtr = alloca <$1 x i64> + store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr + call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta, + <$1 x i32> %vecmask) + 
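  ; As with %offsets above, this alloca/store/blend/load sequence makes a
  ; copy of %offset_delta in which the lanes that are off in %vecmask are
  ; forced to zero, so inactive lanes end up addressing the start of the
  ; array (which the comment above requires to be safe to read) rather
  ; than arbitrary memory.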
%newDelta = load <$1 x i64> * %deltaPtr + %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets, - i32 %offset_scale, <$1 x $2> undef, i32 0) + i32 %offset_scale, <$1 x i64> %newDelta, + <$1 x $2> undef, i32 0) forloop(lane, 1, eval($1-1), `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, - <$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE) + <$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta, + <$1 x $2> %retPREV, i32 LANE) ', `LANE', lane), `PREV', eval(lane-1))') ret <$1 x $2> %ret`'eval($1-1) } @@ -2826,7 +2847,8 @@ define(`gen_scatter', ` ;; Define the function that descripes the work to do to scatter a single ;; value define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x $2> %values, i32 %lane) nounwind alwaysinline { + <$1 x i32> %offset_delta, <$1 x $2> %values, + i32 %lane) nounwind alwaysinline { %offset32 = extractelement <$1 x i32> %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations @@ -2835,42 +2857,52 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal %offset = mul i64 %offset64, %scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset - %ptrcast = bitcast i8 * %ptroffset to $2 * + %delta = extractelement <$1 x i32> %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $2 * %storeval = extractelement <$1 x $2> %values, i32 %lane store $2 %storeval, $2 * %ptrcast ret void } define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x $2> %values, i32 %lane) nounwind alwaysinline { + <$1 x i64> %offset_delta, <$1 x $2> %values, + i32 %lane) nounwind alwaysinline { %offset64 = extractelement <$1 x i64> %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset - %ptrcast = bitcast i8 * %ptroffset to $2 * + %delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $2 * %storeval = extractelement <$1 x $2> %values, i32 %lane store $2 %storeval, $2 * %ptrcast ret void } define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline { + <$1 x i32> %offset_delta, <$1 x $2> %values, + <$1 x i32> %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... per_lane($1, <$1 x i32> %mask, ` call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x $2> %values, i32 LANE)') + <$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)') ret void } define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline { + <$1 x i64> %offset_delta, <$1 x $2> %values, + <$1 x i32> %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
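;; (per_lane emits a check of each lane of %mask and, for the active lanes,
;; the call below with LANE substituted; __scatter_elt64_$2 then stores
;; %values[LANE] to %base + %offset_scale * %offsets[LANE] + %offset_delta[LANE].)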
per_lane($1, <$1 x i32> %mask, ` call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x $2> %values, i32 LANE)') + <$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)') ret void } diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index ffeb4680..7418f5d6 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1060,13 +1060,15 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val, // offsets * offsetScale is in bytes (for all of these) #define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\ - __vec16_i1 mask) { \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ + uint32_t scale, OTYPE constOffset, \ + __vec16_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ + constOffset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ @@ -1104,13 +1106,15 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64) // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\ +#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ + uint32_t scale, OTYPE constOffset, \ VTYPE val, __vec16_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ + constOffset.v[i]); \ *ptr = val.v[i]; \ } \ } diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 2dc48b06..7a3af6ad 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -51,8 +51,8 @@ #define FORCEINLINE __attribute__((always_inline)) inline #endif -//CO#undef FORCEINLINE -//CO#define FORCEINLINE +#undef FORCEINLINE +#define FORCEINLINE typedef float __vec1_f; typedef double __vec1_d; @@ -2612,52 +2612,54 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val, template static FORCEINLINE RetVec -lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, - __vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) { +lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets, + uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... 
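    // The two __select calls below force the offsets and constOffset of
    // inactive lanes to zero so that all four lanes can be loaded
    // unconditionally; an inactive lane then just reads from the base
    // pointer itself, which this trick assumes is safe.  Illustrative
    // scalar equivalent for lane i:
    //   off  = mask[i] ? offsets[i]     : 0;
    //   coff = mask[i] ? constOffset[i] : 0;
    //   r[i] = *(RetScalar *)(p + scale * off + coff);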
offsets = __select(mask, offsets, __smear_i32(0)); - int offset = scale * _mm_extract_epi32(offsets.v, 0); + constOffset = __select(mask, constOffset, __smear_i32(0)); + + int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 1); + offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 2); + offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 3); + offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 1); + int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 2); + int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 3); + int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -2665,54 +2667,57 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, return RetVec(r[0], r[1], r[2], r[3]); } + template static FORCEINLINE RetVec lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... 
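    // Same zero-the-inactive-lanes trick as the 32-bit version above, but
    // here the four 64-bit offsets (and constOffset values) are stored in
    // two __m128i halves (v[0] and v[1]), so lanes are extracted with
    // _mm_extract_epi64 on each half.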
offsets = __select(mask, offsets, __smear_i64(0)); - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + constOffset = __select(mask, constOffset, __smear_i64(0)); + + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[0], 1); + offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 0); + offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 1); + offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -2723,80 +2728,89 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, static FORCEINLINE __vec4_i8 __gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, - mask); + constOffset, mask); } static FORCEINLINE __vec4_i8 __gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, - mask); + constOffset, mask); } static FORCEINLINE __vec4_i16 __gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, - mask); + constOffset, mask); } static FORCEINLINE __vec4_i16 __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, - mask); + constOffset, mask); } static FORCEINLINE __vec4_i32 -__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i1 mask) { +__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, + __vec4_i32 constOffset, __vec4_i1 
mask) { __m128i r = _mm_set_epi32(0, 0, 0, 0); #if 1 // "Fast gather"... offsets = __select(mask, offsets, __smear_i32(0)); + constOffset = __select(mask, constOffset, __smear_i32(0)); - int offset = scale * _mm_extract_epi32(offsets.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0) + + _mm_extract_epi32(constOffset.v, 0); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 0); - offset = scale * _mm_extract_epi32(offsets.v, 1); + offset = scale * _mm_extract_epi32(offsets.v, 1) + + _mm_extract_epi32(constOffset.v, 1); ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 1); - offset = scale * _mm_extract_epi32(offsets.v, 2); + offset = scale * _mm_extract_epi32(offsets.v, 2) + + _mm_extract_epi32(constOffset.v, 2); ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 2); - offset = scale * _mm_extract_epi32(offsets.v, 3); + offset = scale * _mm_extract_epi32(offsets.v, 3) + + _mm_extract_epi32(constOffset.v, 3); ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 3); #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0) + + _mm_extract_epi32(constOffset.v, 0); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 1); + int offset = scale * _mm_extract_epi32(offsets.v, 1) + + _mm_extract_epi32(constOffset.v, 1); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 2); + int offset = scale * _mm_extract_epi32(offsets.v, 2) + + _mm_extract_epi32(constOffset.v, 2); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 2); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 3); + int offset = scale * _mm_extract_epi32(offsets.v, 3) + + _mm_extract_epi32(constOffset.v, 3); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 3); } @@ -2806,23 +2820,23 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, static FORCEINLINE __vec4_i32 __gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, - mask); + delta, mask); } static FORCEINLINE __vec4_i64 __gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, - mask); + delta, mask); } static FORCEINLINE __vec4_i64 __gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, - mask); + delta, mask); } template @@ -2969,217 +2983,108 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) { // scatter -static FORCEINLINE void -__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i8 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0)); - *ptr = _mm_extract_epi8(val.v, 0); - } - - m = 
_mm_extract_ps(mask.v, 1); - if (m != 0) { - int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1)); - *ptr = _mm_extract_epi8(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2)); - *ptr = _mm_extract_epi8(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3)); - *ptr = _mm_extract_epi8(val.v, 3); - } +#define SCATTER32_64(SUFFIX, TYPE, EXTRACT) \ +static FORCEINLINE void \ +__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ + uint32_t scale, __vec4_i32 constOffset, \ + __vec4_##SUFFIX val, __vec4_i1 mask) { \ + uint32_t m = _mm_extract_ps(mask.v, 0); \ + if (m != 0) { \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \ + _mm_extract_epi32(constOffset.v, 0)); \ + *ptr = EXTRACT(val.v, 0); \ + } \ + m = _mm_extract_ps(mask.v, 1); \ + if (m != 0) { \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \ + _mm_extract_epi32(constOffset.v, 1)); \ + *ptr = EXTRACT(val.v, 1); \ + } \ + m = _mm_extract_ps(mask.v, 2); \ + if (m != 0) { \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \ + _mm_extract_epi32(constOffset.v, 2)); \ + *ptr = EXTRACT(val.v, 2); \ + } \ + m = _mm_extract_ps(mask.v, 3); \ + if (m != 0) { \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \ + _mm_extract_epi32(constOffset.v, 3)); \ + *ptr = EXTRACT(val.v, 3); \ + } \ +} \ +static FORCEINLINE void \ +__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ + uint32_t scale, __vec4_i64 constOffset, \ + __vec4_##SUFFIX val, __vec4_i1 mask) { \ + uint32_t m = _mm_extract_ps(mask.v, 0); \ + if (m != 0) { \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \ + _mm_extract_epi64(constOffset.v[0], 0); \ + TYPE *ptr = (TYPE *)(p + offset); \ + *ptr = EXTRACT(val.v, 0); \ + } \ + m = _mm_extract_ps(mask.v, 1); \ + if (m != 0) { \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \ + _mm_extract_epi64(constOffset.v[0], 1); \ + TYPE *ptr = (TYPE *)(p + offset); \ + *ptr = EXTRACT(val.v, 1); \ + } \ + m = _mm_extract_ps(mask.v, 2); \ + if (m != 0) { \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \ + _mm_extract_epi64(constOffset.v[1], 0); \ + TYPE *ptr = (TYPE *)(p + offset); \ + *ptr = EXTRACT(val.v, 2); \ + } \ + m = _mm_extract_ps(mask.v, 3); \ + if (m != 0) { \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \ + _mm_extract_epi64(constOffset.v[1], 1); \ + TYPE *ptr = (TYPE *)(p + offset); \ + *ptr = EXTRACT(val.v, 3); \ + } \ } -static FORCEINLINE void -__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i8 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); - uint8_t *ptr = (uint8_t *)(p + offset); - *ptr = _mm_extract_epi8(val.v, 0); - } - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); - uint8_t *ptr = (uint8_t *)(p + offset); - *ptr = _mm_extract_epi8(val.v, 1); - } +SCATTER32_64(i8, int8_t, _mm_extract_epi8) +SCATTER32_64(i16, int16_t, _mm_extract_epi16) +SCATTER32_64(i32, int32_t, _mm_extract_epi32) - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); - uint8_t *ptr = (uint8_t *)(p + offset); - *ptr = 
_mm_extract_epi8(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); - uint8_t *ptr = (uint8_t *)(p + offset); - *ptr = _mm_extract_epi8(val.v, 3); - } -} - -static FORCEINLINE void -__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i16 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0)); - *ptr = _mm_extract_epi16(val.v, 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1)); - *ptr = _mm_extract_epi16(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2)); - *ptr = _mm_extract_epi16(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3)); - *ptr = _mm_extract_epi16(val.v, 3); - } -} - -static FORCEINLINE void -__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i16 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); - uint16_t *ptr = (uint16_t *)(p + offset); - *ptr = _mm_extract_epi16(val.v, 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); - uint16_t *ptr = (uint16_t *)(p + offset); - *ptr = _mm_extract_epi16(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); - uint16_t *ptr = (uint16_t *)(p + offset); - *ptr = _mm_extract_epi16(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); - uint16_t *ptr = (uint16_t *)(p + offset); - *ptr = _mm_extract_epi16(val.v, 3); - } -} - -static FORCEINLINE void -__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int32_t *ptr = (int32_t *)(b + scale * - _mm_extract_epi32(offsets.v, 0)); - *ptr = _mm_extract_epi32(val.v, 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int32_t *ptr = (int32_t *)(b + scale * - _mm_extract_epi32(offsets.v, 1)); - *ptr = _mm_extract_epi32(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int32_t *ptr = (int32_t *)(b + scale * - _mm_extract_epi32(offsets.v, 2)); - *ptr = _mm_extract_epi32(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int32_t *ptr = (int32_t *)(b + scale * - _mm_extract_epi32(offsets.v, 3)); - *ptr = _mm_extract_epi32(val.v, 3); - } -} - -static FORCEINLINE void -__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i32 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); - uint32_t *ptr = (uint32_t *)(p + offset); - *ptr = _mm_extract_epi32(val.v, 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); - uint32_t *ptr = (uint32_t *)(p + offset); - *ptr = _mm_extract_epi32(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); - uint32_t *ptr = (uint32_t 
*)(p + offset); - *ptr = _mm_extract_epi32(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); - uint32_t *ptr = (uint32_t *)(p + offset); - *ptr = _mm_extract_epi32(val.v, 3); - } -} static FORCEINLINE void __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i64 val, __vec4_i1 mask) { + uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val, + __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) + + _mm_extract_epi32(constOffset.v, 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) + + _mm_extract_epi32(constOffset.v, 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 2); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) + + _mm_extract_epi32(constOffset.v, 2); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 0); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 3); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) + + _mm_extract_epi32(constOffset.v, 3); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 1); } @@ -3187,31 +3092,36 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, static FORCEINLINE void __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 val, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 constOffset, + __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + + _mm_extract_epi64(constOffset.v[0], 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + + _mm_extract_epi64(constOffset.v[0], 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + + _mm_extract_epi64(constOffset.v[1], 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 0); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + + _mm_extract_epi64(constOffset.v[1], 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 1); } diff --git a/opt.cpp b/opt.cpp index c105947b..f6eab8c6 100644 --- a/opt.cpp +++ b/opt.cpp @@ -205,6 +205,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, } +#if 0 static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2, llvm::Value *arg3, const char *name, @@ -218,7 +219,7 @@ lCallInst(llvm::Function *func, 
llvm::Value *arg0, llvm::Value *arg1, name, insertBefore); #endif } - +#endif static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, @@ -234,6 +235,21 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, #endif } +static llvm::Instruction * +lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, + llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4, + llvm::Value *arg5, const char *name, + llvm::Instruction *insertBefore = NULL) { + llvm::Value *args[6] = { arg0, arg1, arg2, arg3, arg4, arg5 }; +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) + llvm::ArrayRef newArgArray(&args[0], &args[6]); + return llvm::CallInst::Create(func, newArgArray, name, insertBefore); +#else + return llvm::CallInst::Create(func, &newArgs[0], &newArgs[6], + name, insertBefore); +#endif +} + /////////////////////////////////////////////////////////////////////////// void @@ -302,10 +318,13 @@ Optimize(llvm::Module *module, int optLevel) { // Early optimizations to try to reduce the total amount of code to // work with if we can - optPM.add(CreateDetectGSBaseOffsetsPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(CreateDetectGSBaseOffsetsPass()); if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); optPM.add(CreateVSelMovmskOptPass()); @@ -314,11 +333,7 @@ Optimize(llvm::Module *module, int optLevel) { } optPM.add(llvm::createDeadInstEliminationPass()); - optPM.add(llvm::createConstantPropagationPass()); - optPM.add(llvm::createDeadInstEliminationPass()); - // On to more serious optimizations - optPM.add(llvm::createCFGSimplificationPass()); if (runSROA) optPM.add(llvm::createScalarReplAggregatesPass()); optPM.add(llvm::createInstructionCombiningPass()); @@ -1173,6 +1188,166 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets, } +static llvm::Value * +lGetZeroOffsetVector(llvm::Value *origVec) { + if (origVec->getType() == LLVMTypes::Int32VectorType) + return LLVMInt32Vector((int32_t)0); + else + return LLVMInt64Vector((int64_t)0); +} + + +#if 0 +static void +lPrint(llvm::Value *v, int indent = 0) { + if (llvm::isa(v)) + return; + + fprintf(stderr, "%*c", indent, ' '); + v->dump(); + + llvm::Instruction *inst = llvm::dyn_cast(v); + if (inst != NULL) { + for (int i = 0; i < (int)inst->getNumOperands(); ++i) { + llvm::Value *op = inst->getOperand(i); + if (llvm::isa(op) == false) + lPrint(op, indent+4); + } + } +} +#endif + + +/** Given a vector expression in vec, separate it into a compile-time + constant component and a variable component, returning the two parts in + *constOffset and *variableOffset. (It should be the case that the sum + of these two is exactly equal to the original vector.) + + This routine only handles some (important) patterns; in some cases it + will fail and return components that are actually compile-time + constants in *variableOffset. + + Finally, if there aren't any constant (or, respectivaly, variable) + components, the corresponding return value may be set to NULL. 
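    As an illustrative example, if vec is computed as
    sext(varying_index) * splat(8) + splat(16) (varying_index being some
    non-constant vector), the routine returns the sext-and-multiply part in
    *variableOffset and splat(16) in *constOffset.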
+ */ +static void +lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, + llvm::Value **variableOffset, + llvm::Instruction *insertBefore) { + if (llvm::isa(vec) || + llvm::isa(vec)) { + *constOffset = vec; + *variableOffset = NULL; + return; + } + + llvm::SExtInst *sext = llvm::dyn_cast(vec); + if (sext != NULL) { + // Check the sext target. + llvm::Value *co, *vo; + lExtractConstantOffset(sext->getOperand(0), &co, &vo, insertBefore); + + // make new sext instructions for the two parts + if (co == NULL) + *constOffset = NULL; + else + *constOffset = new llvm::SExtInst(co, sext->getType(), + "const_offset_sext", insertBefore); + if (vo == NULL) + *variableOffset = NULL; + else + *variableOffset = new llvm::SExtInst(vo, sext->getType(), + "variable_offset_sext", + insertBefore); + return; + } + + // FIXME? handle bitcasts / type casts here + + llvm::BinaryOperator *bop = llvm::dyn_cast(vec); + if (bop != NULL) { + llvm::Value *op0 = bop->getOperand(0); + llvm::Value *op1 = bop->getOperand(1); + llvm::Value *c0, *v0, *c1, *v1; + + if (bop->getOpcode() == llvm::Instruction::Add) { + lExtractConstantOffset(op0, &c0, &v0, insertBefore); + lExtractConstantOffset(op1, &c1, &v1, insertBefore); + + if (c0 == NULL) + *constOffset = c1; + else if (c1 == NULL) + *constOffset = c0; + else + *constOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1, + "const_op", insertBefore); + + if (v0 == NULL) + *variableOffset = v1; + else if (v1 == NULL) + *variableOffset = v0; + else + *variableOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1, + "variable_op", insertBefore); + return; + } + else if (bop->getOpcode() == llvm::Instruction::Mul) { + lExtractConstantOffset(op0, &c0, &v0, insertBefore); + lExtractConstantOffset(op1, &c1, &v1, insertBefore); + + // Given the product of constant and variable terms, we have: + // (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1) + // Note that the first term is a constant and the last three are + // variable. + if (c0 != NULL && c1 != NULL) + *constOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1, + "const_mul", insertBefore); + else + *constOffset = NULL; + + llvm::Value *va = NULL, *vb = NULL, *vc = NULL; + if (v0 != NULL && c1 != NULL) + va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1, + "va_mul", insertBefore); + if (c0 != NULL && v1 != NULL) + vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1, + "vb_mul", insertBefore); + if (v0 != NULL && v1 != NULL) + vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1, + "vc_mul", insertBefore); + + + llvm::Value *vab = NULL; + if (va != NULL && vb != NULL) + vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb, + "vab_add", insertBefore); + else if (va != NULL) + vab = va; + else + vab = vb; + + if (vab != NULL && vc != NULL) + *variableOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc, + "vabc_add", insertBefore); + else if (vab != NULL) + *variableOffset = vab; + else + *variableOffset = vc; + + return; + } + } + + // Nothing matched, just return what we have as a variable component + *constOffset = NULL; + *variableOffset = vec; +} + + /* Returns true if the given value is a constant vector of integers with the value 2, 4, 8 in all of the elements. (Returns the splatted value in *splat, if so). 
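   (Illustratively, a <4 x i32> constant of <8, 8, 8, 8> qualifies and sets
   *splat to 8, while <3, 3, 3, 3> or <2, 4, 2, 4> does not.)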
*/ @@ -1277,6 +1452,123 @@ lExtractOffsetVector248Scale(llvm::Value **vec) { return LLVMInt32(1); } +#if 0 +static llvm::Value * +lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) { + fprintf(stderr, " lextract: "); + (*vec)->dump(); + fprintf(stderr, "\n"); + + if (llvm::isa(*vec) || + llvm::isa(*vec)) + return NULL; + + llvm::SExtInst *sext = llvm::dyn_cast(*vec); + if (sext != NULL) { + llvm::Value *sextOp = sext->getOperand(0); + // Check the sext target. + llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore); + if (unif == NULL) + return NULL; + + // make a new sext instruction so that we end up with the right + // type + *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext); + return unif; + } + + std::vector phis; + if (LLVMVectorValuesAllEqual(*vec, g->target.vectorWidth, phis)) { + // FIXME: we may want to redo all of the expression here, in scalar + // form (if at all possible), for code quality... + llvm::Value *unif = + llvm::ExtractElementInst::Create(*vec, LLVMInt32(0), + "first_uniform", insertBefore); + *vec = NULL; + return unif; + } + + llvm::BinaryOperator *bop = llvm::dyn_cast(*vec); + if (bop == NULL) + return NULL; + + llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1); + if (bop->getOpcode() == llvm::Instruction::Add) { + llvm::Value *s0 = lExtractUniforms(&op0, insertBefore); + llvm::Value *s1 = lExtractUniforms(&op1, insertBefore); + if (s0 == NULL && s1 == NULL) + return NULL; + + if (op0 == NULL) + *vec = op1; + else if (op1 == NULL) + *vec = op0; + else + *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add, + op0, op1, "new_add", insertBefore); + + if (s0 == NULL) + return s1; + else if (s1 == NULL) + return s0; + else + return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1, + "add_unif", insertBefore); + } +#if 0 + else if (bop->getOpcode() == llvm::Instruction::Mul) { + // Check each operand for being one of the scale factors we care about. + int splat; + if (lIs248Splat(op0, &splat)) { + *vec = op1; + return LLVMInt32(splat); + } + else if (lIs248Splat(op1, &splat)) { + *vec = op0; + return LLVMInt32(splat); + } + else + return LLVMInt32(1); + } +#endif + else + return NULL; +} + + +static void +lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector, + llvm::Value *offsetScale, + llvm::Instruction *insertBefore) { +#if 1 + (*basePtr)->dump(); + printf("\n"); + (*offsetVector)->dump(); + printf("\n"); + offsetScale->dump(); + printf("-----\n"); +#endif + + llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore); + if (uniformDelta == NULL) + return; + + llvm::Value *index[1] = { uniformDelta }; +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) + llvm::ArrayRef arrayRef(&index[0], &index[1]); + *basePtr = llvm::GetElementPtrInst::Create(*basePtr, arrayRef, "new_base", + insertBefore); +#else + *basePtr = llvm::GetElementPtrInst::Create(*basePtr, &index[0], + &index[1], "new_base", + insertBefore); +#endif + + // this should only happen if we have only uniforms, but that in turn + // shouldn't be a gather/scatter! + Assert(*offsetVector != NULL); +} +#endif struct GSInfo { GSInfo(const char *pgFuncName, const char *pgboFuncName, @@ -1367,7 +1659,24 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) { // to the next instruction... 
continue; - llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector); + // Try to decompose the offset vector into a compile time constant + // component and a varying component. The constant component is + // passed as a separate parameter to the gather/scatter functions, + // which in turn allows their implementations to end up emitting + // x86 instructions with constant offsets encoded in them. + llvm::Value *constOffset, *variableOffset; + lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, + callInst); + if (constOffset == NULL) + constOffset = lGetZeroOffsetVector(offsetVector); + if (variableOffset == NULL) + variableOffset = lGetZeroOffsetVector(offsetVector); + + // See if the varying component is scaled by 2, 4, or 8. If so, + // extract that scale factor and rewrite variableOffset to remove + // it. (This also is pulled out so that we can match the scales by + // 2/4/8 offered by x86 addressing operators.) + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); // Cast the base pointer to a void *, since that's what the // __pseudo_*_base_offsets_* functions want. @@ -1386,11 +1695,15 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) { // walk past the sext to get the i32 offset values and then // call out to the corresponding 32-bit gather/scatter // function. - llvm::SExtInst *sext = llvm::dyn_cast(offsetVector); + llvm::SExtInst *sext = llvm::dyn_cast(variableOffset); if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) { - offsetVector = sext->getOperand(0); + variableOffset = sext->getOperand(0); gatherScatterFunc = info->baseOffsets32Func; + if (constOffset->getType() != LLVMTypes::Int32VectorType) + constOffset = + new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType, + "trunc_const_offset", callInst); } } @@ -1403,8 +1716,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) { // the instruction isn't inserted into a basic block and that // way we can then call ReplaceInstWithInst(). llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, offsetVector, offsetScale, - mask, "newgather", NULL); + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, mask, "newgather", NULL); lCopyMetadata(newCall, callInst); llvm::ReplaceInstWithInst(callInst, newCall); } @@ -1416,8 +1729,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) { // base+offsets instruction. See above for why passing NULL // for the Instruction * is intended. 
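            // The argument order here matches the updated
            // __pseudo_scatter_base_offsets* declarations: base pointer,
            // varying offsets, offset_scale, constant offset vector, the
            // values to store, and finally the mask.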
llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, offsetVector, offsetScale, - storeValue, mask, "", NULL); + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, storeValue, mask, "", NULL); lCopyMetadata(newCall, callInst); llvm::ReplaceInstWithInst(callInst, newCall); } @@ -2016,6 +2329,26 @@ struct GatherImpInfo { }; +static llvm::Value * +lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, + llvm::Instruction *insertBefore) { + llvm::Value *firstOffset = + llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset", + insertBefore); + + llvm::Value *offsetIndex[1] = { firstOffset }; +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) + llvm::ArrayRef arrayRef(&offsetIndex[0], &offsetIndex[1]); + return + llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", insertBefore); +#else + return + llvm::GetElementPtrInst::Create(base, &offsetIndex[0], &offsetIndex[1], + "ptr", insertBefore); +#endif +} + + struct ScatterImpInfo { ScatterImpInfo(const char *pName, const char *msName, LLVM_TYPE_CONST llvm::Type *vpt, int a) @@ -2109,45 +2442,42 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { Assert(ok); llvm::Value *base = callInst->getArgOperand(0); - llvm::Value *offsets = callInst->getArgOperand(1); + llvm::Value *varyingOffsets = callInst->getArgOperand(1); llvm::Value *offsetScale = callInst->getArgOperand(2); - llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(3) : NULL; - llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4); + llvm::Value *constOffsets = callInst->getArgOperand(3); + llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(4) : NULL; + llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5); + // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets llvm::ConstantInt *offsetScaleInt = llvm::dyn_cast(offsetScale); Assert(offsetScaleInt != NULL); + uint64_t scaleValue = offsetScaleInt->getZExtValue(); - if (offsets->getType() == LLVMTypes::Int64VectorType) - // offsetScale is an i32, so sext it so that if we use it in a - // multiply below, it has the same type as the i64 offset used - // as the other operand... - offsetScale = new llvm::SExtInst(offsetScale, LLVMTypes::Int64Type, - "offset_sext", callInst); + std::vector scales; + for (int i = 0; i < g->target.vectorWidth; ++i) { + if (varyingOffsets->getType() == LLVMTypes::Int64VectorType) + scales.push_back(LLVMInt64(scaleValue)); + else + scales.push_back(LLVMInt32(scaleValue)); + } + llvm::Constant *offsetScaleVec = llvm::ConstantVector::get(scales); + + llvm::Value *scaledVarying = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + varyingOffsets, "scaled_varying", callInst); + llvm::Value *fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, + constOffsets, "varying+const_offsets", + callInst); { std::vector seenPhis; - if (LLVMVectorValuesAllEqual(offsets, g->target.vectorWidth, seenPhis)) { + if (LLVMVectorValuesAllEqual(fullOffsets, g->target.vectorWidth, seenPhis)) { // If all the offsets are equal, then compute the single // pointer they all represent based on the first one of them // (arbitrarily). - - // FIXME: the code from here to where ptr is computed is highly - // redundant with the case for a vector linear below. 
- - llvm::Value *firstOffset = - llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset", - callInst); - llvm::Value *indices[1] = { firstOffset }; -#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) - llvm::ArrayRef arrayRef(&indices[0], &indices[1]); - llvm::Value *ptr = - llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", callInst); -#else - llvm::Value *ptr = - llvm::GetElementPtrInst::Create(base, &indices[0], &indices[1], - "ptr", callInst); -#endif + llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst); lCopyMetadata(ptr, callInst); if (gatherInfo != NULL) { @@ -2175,9 +2505,11 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::ExtractElementInst::Create(storeValue, LLVMInt32(0), "rvalue_first", callInst); lCopyMetadata(first, callInst); + ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0), "ptr2rvalue_type", callInst); lCopyMetadata(ptr, callInst); + llvm::Instruction *sinst = new llvm::StoreInst(first, ptr, false, scatterInfo->align); lCopyMetadata(sinst, callInst); @@ -2190,34 +2522,15 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { } int step = gatherInfo ? gatherInfo->align : scatterInfo->align; - step /= (int)offsetScaleInt->getZExtValue(); std::vector seenPhis; - if (step > 0 && lVectorIsLinear(offsets, g->target.vectorWidth, + if (step > 0 && lVectorIsLinear(fullOffsets, g->target.vectorWidth, step, seenPhis)) { // We have a linear sequence of memory locations being accessed // starting with the location given by the offset from // offsetElements[0], with stride of 4 or 8 bytes (for 32 bit // and 64 bit gather/scatters, respectively.) - - // Get the base pointer using the first guy's offset. - llvm::Value *firstOffset = - llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset", - callInst); - llvm::Value *scaledOffset = - llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset, - offsetScale, "scaled_offset", callInst); - - llvm::Value *indices[1] = { scaledOffset }; -#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) - llvm::ArrayRef arrayRef(&indices[0], &indices[1]); - llvm::Value *ptr = - llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", callInst); -#else - llvm::Value *ptr = - llvm::GetElementPtrInst::Create(base, &indices[0], &indices[1], - "ptr", callInst); -#endif + llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst); lCopyMetadata(ptr, callInst); if (gatherInfo != NULL) {