Extract constant offsets from gather/scatter base+offsets offset vectors.

When we're able to turn a general gather/scatter into the "base + offsets"
form, we now try to extract out any constant components of the offsets and
then pass them as a separate parameter to the gather/scatter function
implementation.

We then emit the code for the addressing calculation carefully, so that it
matches the patterns LLVM uses to detect this case; as a result, the constant
offsets are often encoded directly in the instruction's addressing calculation,
saving the arithmetic instructions that would otherwise compute them.
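
As a rough illustration of the addressing-mode benefit (a made-up scalar
example, not code from this commit): when the constant part of an offset is
known, the compiler can fold it into the displacement field of a single x86
"base + index*scale + disp" address rather than emitting separate adds for it.

#include <cstdint>

// Hypothetical scalar equivalent of one gather lane: the constant element
// offset (+3 elements, i.e. +12 bytes for int32) can become the displacement
// of the load's addressing mode, e.g. mov eax, [rdi + rsi*4 + 12].
static inline int32_t load_lane(const int32_t *base, int64_t index) {
    return base[index + 3];   // base + index*4 + 12: one address calculation, no extra add
}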

Improves the performance of the stencil workload by ~15%; other workloads are unchanged.
Matt Pharr
2012-01-24 14:41:15 -08:00
commit a5b7fca7e0 (parent 7be2c399b1)
5 changed files with 614 additions and 355 deletions


@@ -289,18 +289,18 @@ declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,


@@ -1565,17 +1565,15 @@ declare void @__pseudo_masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>,
; these represent gathers from a common base pointer with offsets. The
; offset_scale factor scales the offsets before they are added to the base
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling
; available in x86 addressing calculations...
; Then, the offset_delta value (guaranteed to be a compile-time constant)
; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns
; that use the free 2/4/8 scaling available in x86 addressing calculations, and
; offset_delta feeds into the free offset calculation.
;
; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int{8,16,32,64}
; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; Then, the GSImprovementsPass optimization finds these and either
; converts them to native gather functions or converts them to vector
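
A minimal scalar sketch of the per-lane semantics these declarations now imply
(illustrative function and parameter names, 32-bit offsets and values only; the
real masked vector implementations appear in the target files later in this diff):

#include <cstdint>

// Per active lane: address = base + offset_scale * offsets[i] + offset_delta[i],
// all in bytes; offset_delta carries the constant component extracted by the
// new optimization.
static void gather_base_offsets_ref(const uint8_t *base, const int32_t offsets[],
                                    int32_t offset_scale, const int32_t offset_delta[],
                                    const bool mask[], int32_t result[], int width) {
    for (int i = 0; i < width; ++i)
        if (mask[i])
            result[i] = *(const int32_t *)(base + offset_scale * offsets[i] +
                                           offset_delta[i]);
}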
@@ -1591,22 +1589,22 @@ declare <WIDTH x i16> @__pseudo_gather64_16(<WIDTH x i64>, <WIDTH x MASK>) nounw
declare <WIDTH x i32> @__pseudo_gather64_32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather64_64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
; Similarly to the pseudo-gathers defined above, we also declare undefined
@@ -1621,13 +1619,9 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i3
; transforms them to scatters like:
;
; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base,
; varying int32 offsets, int32 offset_scale, varying int8 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base,
; varying int32 offsets, int32 offset_scale, varying int16 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base,
; varying int32 offsets, int32 offset_scale, varying int32 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base,
; varying int32 offsets, int32 offset_scale, varying int64 values, mask)
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; And the GSImprovementsPass in turn converts these to actual native
; scatters or masked stores.
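
The store-side counterpart of the gather sketch above (again illustrative
only): each active lane writes values[i] to base + offset_scale * offsets[i] +
offset_delta[i].

#include <cstdint>

// Scalar sketch of the __pseudo_scatter_base_offsets semantics after this change.
static void scatter_base_offsets_ref(uint8_t *base, const int32_t offsets[],
                                     int32_t offset_scale, const int32_t offset_delta[],
                                     const int32_t values[], const bool mask[], int width) {
    for (int i = 0; i < width; ++i)
        if (mask[i])
            *(int32_t *)(base + offset_scale * offsets[i] + offset_delta[i]) = values[i];
}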
@@ -1642,22 +1636,22 @@ declare void @__pseudo_scatter64_16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>
declare void @__pseudo_scatter64_32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2701,7 +2695,8 @@ define(`gen_gather', `
;; Define the utility function to do the gather operation for a single element
;; of the type
define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
@@ -2711,15 +2706,20 @@ define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
}
define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
@@ -2728,8 +2728,11 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
%offset = mul i64 %offset64, %offset_scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
@@ -2737,6 +2740,7 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x i32> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2749,16 +2753,25 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i32> * %offsetsPtr
%deltaPtr = alloca <$1 x i32>
store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr
call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i32> * %deltaPtr
%ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i32> %newDelta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %newDelta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
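
The alloca/masked-store-blend sequence above zeroes the offsets and deltas of
inactive lanes so that every lane can be loaded unconditionally; the caveat in
the comment is that element 0 of the source array must always be safe to read.
A scalar sketch of that trick (illustrative only):

#include <cstdint>

// "Fast gather" trick: inactive lanes read from base + 0 rather than being
// skipped, so no per-lane branching is needed.  Assumes base[0] is readable.
static void gather_blend_ref(const uint8_t *base, const int32_t offsets[],
                             const int32_t delta[], const bool mask[],
                             int32_t offset_scale, int32_t result[], int width) {
    for (int i = 0; i < width; ++i) {
        int32_t off = mask[i] ? offsets[i] : 0;
        int32_t d   = mask[i] ? delta[i]   : 0;
        result[i] = *(const int32_t *)(base + offset_scale * off + d);
    }
}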
define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x i64> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2771,11 +2784,19 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i64> * %offsetsPtr
%deltaPtr = alloca <$1 x i64>
store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr
call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i64> * %deltaPtr
%ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
@@ -2826,7 +2847,8 @@ define(`gen_scatter', `
;; Define the function that describes the work to do to scatter a single
;; value
define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
@@ -2835,42 +2857,52 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}
define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
%scale64 = sext i32 %offset_scale to i64
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *
%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}
define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}
define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}


@@ -1060,13 +1060,15 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
__vec16_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1104,13 +1106,15 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
*ptr = val.v[i]; \
} \
}


@@ -51,8 +51,8 @@
#define FORCEINLINE __attribute__((always_inline)) inline
#endif
//CO#undef FORCEINLINE
//CO#define FORCEINLINE
#undef FORCEINLINE
#define FORCEINLINE
typedef float __vec1_f;
typedef double __vec1_d;
@@ -2612,52 +2612,54 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val,
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
__vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) {
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __smear_i32(0));
int offset = scale * _mm_extract_epi32(offsets.v, 0);
constOffset = __select(mask, constOffset, __smear_i32(0));
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 1);
offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 2);
offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 3);
offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1);
int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2);
int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3);
int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -2665,54 +2667,57 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
return RetVec(r[0], r[1], r[2], r[3]);
}
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __smear_i64(0));
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
constOffset = __select(mask, constOffset, __smear_i64(0));
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[0], 1);
offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 0);
offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 1);
offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -2723,80 +2728,89 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
static FORCEINLINE __vec4_i8
__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i8
__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i16
__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i16
__gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i32
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
__vec4_i32 constOffset, __vec4_i1 mask) {
__m128i r = _mm_set_epi32(0, 0, 0, 0);
#if 1
// "Fast gather"...
offsets = __select(mask, offsets, __smear_i32(0));
constOffset = __select(mask, constOffset, __smear_i32(0));
int offset = scale * _mm_extract_epi32(offsets.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 0);
offset = scale * _mm_extract_epi32(offsets.v, 1);
offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 1);
offset = scale * _mm_extract_epi32(offsets.v, 2);
offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 2);
offset = scale * _mm_extract_epi32(offsets.v, 3);
offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 3);
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1);
int offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2);
int offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3);
int offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 3);
}
@@ -2806,23 +2820,23 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
static FORCEINLINE __vec4_i32
__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
mask);
delta, mask);
}
static FORCEINLINE __vec4_i64
__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
mask);
delta, mask);
}
static FORCEINLINE __vec4_i64
__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
mask);
delta, mask);
}
template<typename RetVec, typename RetScalar>
@@ -2969,217 +2983,108 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) {
// scatter
static FORCEINLINE void
__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
*ptr = _mm_extract_epi8(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
*ptr = _mm_extract_epi8(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
*ptr = _mm_extract_epi8(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
*ptr = _mm_extract_epi8(val.v, 3);
}
#define SCATTER32_64(SUFFIX, TYPE, EXTRACT) \
static FORCEINLINE void \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
uint32_t scale, __vec4_i32 constOffset, \
__vec4_##SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \
_mm_extract_epi32(constOffset.v, 0)); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \
_mm_extract_epi32(constOffset.v, 1)); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \
_mm_extract_epi32(constOffset.v, 2)); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \
_mm_extract_epi32(constOffset.v, 3)); \
*ptr = EXTRACT(val.v, 3); \
} \
} \
static FORCEINLINE void \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
uint32_t scale, __vec4_i64 constOffset, \
__vec4_##SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \
_mm_extract_epi64(constOffset.v[0], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \
_mm_extract_epi64(constOffset.v[0], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \
_mm_extract_epi64(constOffset.v[1], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \
_mm_extract_epi64(constOffset.v[1], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 3); \
} \
}
static FORCEINLINE void
__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 1);
}
SCATTER32_64(i8, int8_t, _mm_extract_epi8)
SCATTER32_64(i16, int16_t, _mm_extract_epi16)
SCATTER32_64(i32, int32_t, _mm_extract_epi32)
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
*ptr = _mm_extract_epi16(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
*ptr = _mm_extract_epi16(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
*ptr = _mm_extract_epi16(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
*ptr = _mm_extract_epi16(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 0));
*ptr = _mm_extract_epi32(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 1));
*ptr = _mm_extract_epi32(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 2));
*ptr = _mm_extract_epi32(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 3));
*ptr = _mm_extract_epi32(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val,
__vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
@@ -3187,31 +3092,36 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
static FORCEINLINE void
__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset,
__vec4_i64 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
_mm_extract_epi64(constOffset.v[0], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +
_mm_extract_epi64(constOffset.v[0], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +
_mm_extract_epi64(constOffset.v[1], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +
_mm_extract_epi64(constOffset.v[1], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}

opt.cpp

@@ -205,6 +205,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
}
#if 0
static llvm::Instruction *
lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
llvm::Value *arg2, llvm::Value *arg3, const char *name,
@@ -218,7 +219,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
name, insertBefore);
#endif
}
#endif
static llvm::Instruction *
lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
@@ -234,6 +235,21 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
#endif
}
static llvm::Instruction *
lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
llvm::Value *arg5, const char *name,
llvm::Instruction *insertBefore = NULL) {
llvm::Value *args[6] = { arg0, arg1, arg2, arg3, arg4, arg5 };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
#else
return llvm::CallInst::Create(func, &args[0], &args[6],
name, insertBefore);
#endif
}
///////////////////////////////////////////////////////////////////////////
void
@@ -302,10 +318,13 @@ Optimize(llvm::Module *module, int optLevel) {
// Early optimizations to try to reduce the total amount of code to
// work with if we can
optPM.add(CreateDetectGSBaseOffsetsPass());
optPM.add(llvm::createReassociatePass());
optPM.add(llvm::createConstantPropagationPass());
optPM.add(llvm::createConstantPropagationPass());
optPM.add(llvm::createDeadInstEliminationPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(CreateDetectGSBaseOffsetsPass());
if (!g->opt.disableMaskAllOnOptimizations) {
optPM.add(CreateIntrinsicsOptPass());
optPM.add(CreateVSelMovmskOptPass());
@@ -314,11 +333,7 @@ Optimize(llvm::Module *module, int optLevel) {
}
optPM.add(llvm::createDeadInstEliminationPass());
optPM.add(llvm::createConstantPropagationPass());
optPM.add(llvm::createDeadInstEliminationPass());
// On to more serious optimizations
optPM.add(llvm::createCFGSimplificationPass());
if (runSROA)
optPM.add(llvm::createScalarReplAggregatesPass());
optPM.add(llvm::createInstructionCombiningPass());
@@ -1173,6 +1188,166 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets,
}
static llvm::Value *
lGetZeroOffsetVector(llvm::Value *origVec) {
if (origVec->getType() == LLVMTypes::Int32VectorType)
return LLVMInt32Vector((int32_t)0);
else
return LLVMInt64Vector((int64_t)0);
}
#if 0
static void
lPrint(llvm::Value *v, int indent = 0) {
if (llvm::isa<llvm::PHINode>(v))
return;
fprintf(stderr, "%*c", indent, ' ');
v->dump();
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
if (inst != NULL) {
for (int i = 0; i < (int)inst->getNumOperands(); ++i) {
llvm::Value *op = inst->getOperand(i);
if (llvm::isa<llvm::Constant>(op) == false)
lPrint(op, indent+4);
}
}
}
#endif
/** Given a vector expression in vec, separate it into a compile-time
constant component and a variable component, returning the two parts in
*constOffset and *variableOffset. (It should be the case that the sum
of these two is exactly equal to the original vector.)
This routine only handles some (important) patterns; in some cases it
will fail and return components that are actually compile-time
constants in *variableOffset.
Finally, if there aren't any constant (or, respectively, variable)
components, the corresponding return value may be set to NULL.
*/
static void
lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
llvm::Value **variableOffset,
llvm::Instruction *insertBefore) {
if (llvm::isa<llvm::ConstantVector>(vec) ||
llvm::isa<llvm::ConstantAggregateZero>(vec)) {
*constOffset = vec;
*variableOffset = NULL;
return;
}
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(vec);
if (sext != NULL) {
// Check the sext target.
llvm::Value *co, *vo;
lExtractConstantOffset(sext->getOperand(0), &co, &vo, insertBefore);
// make new sext instructions for the two parts
if (co == NULL)
*constOffset = NULL;
else
*constOffset = new llvm::SExtInst(co, sext->getType(),
"const_offset_sext", insertBefore);
if (vo == NULL)
*variableOffset = NULL;
else
*variableOffset = new llvm::SExtInst(vo, sext->getType(),
"variable_offset_sext",
insertBefore);
return;
}
// FIXME? handle bitcasts / type casts here
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(vec);
if (bop != NULL) {
llvm::Value *op0 = bop->getOperand(0);
llvm::Value *op1 = bop->getOperand(1);
llvm::Value *c0, *v0, *c1, *v1;
if (bop->getOpcode() == llvm::Instruction::Add) {
lExtractConstantOffset(op0, &c0, &v0, insertBefore);
lExtractConstantOffset(op1, &c1, &v1, insertBefore);
if (c0 == NULL)
*constOffset = c1;
else if (c1 == NULL)
*constOffset = c0;
else
*constOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1,
"const_op", insertBefore);
if (v0 == NULL)
*variableOffset = v1;
else if (v1 == NULL)
*variableOffset = v0;
else
*variableOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1,
"variable_op", insertBefore);
return;
}
else if (bop->getOpcode() == llvm::Instruction::Mul) {
lExtractConstantOffset(op0, &c0, &v0, insertBefore);
lExtractConstantOffset(op1, &c1, &v1, insertBefore);
// Given the product of constant and variable terms, we have:
// (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1)
// Note that the first term is a constant and the last three are
// variable.
if (c0 != NULL && c1 != NULL)
*constOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1,
"const_mul", insertBefore);
else
*constOffset = NULL;
llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
if (v0 != NULL && c1 != NULL)
va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1,
"va_mul", insertBefore);
if (c0 != NULL && v1 != NULL)
vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1,
"vb_mul", insertBefore);
if (v0 != NULL && v1 != NULL)
vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1,
"vc_mul", insertBefore);
llvm::Value *vab = NULL;
if (va != NULL && vb != NULL)
vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb,
"vab_add", insertBefore);
else if (va != NULL)
vab = va;
else
vab = vb;
if (vab != NULL && vc != NULL)
*variableOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc,
"vabc_add", insertBefore);
else if (vab != NULL)
*variableOffset = vab;
else
*variableOffset = vc;
return;
}
}
// Nothing matched, just return what we have as a variable component
*constOffset = NULL;
*variableOffset = vec;
}
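
As a quick numeric illustration of the invariant promised in the comment above
(the constant and variable parts sum back to the original value), here is the
multiply decomposition checked per lane with made-up numbers:

#include <cassert>

// Illustrative check of the decomposition used for Mul above, per lane:
// (c0 + v0) * (c1 + v1) == c0*c1 + (v0*c1 + c0*v1 + v0*v1)
// where c0*c1 is the constant part and the rest is the variable part.
int main() {
    int c0 = 16, c1 = 4;      // compile-time constant components (made up)
    int v0 = 7,  v1 = 3;      // varying components (made up)
    int original  = (c0 + v0) * (c1 + v1);
    int constPart = c0 * c1;
    int varPart   = v0 * c1 + c0 * v1 + v0 * v1;
    assert(constPart + varPart == original);
    return 0;
}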
/* Returns true if the given value is a constant vector of integers with
the value 2, 4, 8 in all of the elements. (Returns the splatted value
in *splat, if so). */
@@ -1277,6 +1452,123 @@ lExtractOffsetVector248Scale(llvm::Value **vec) {
return LLVMInt32(1);
}
#if 0
static llvm::Value *
lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
fprintf(stderr, " lextract: ");
(*vec)->dump();
fprintf(stderr, "\n");
if (llvm::isa<llvm::ConstantVector>(*vec) ||
llvm::isa<llvm::ConstantAggregateZero>(*vec))
return NULL;
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
if (sext != NULL) {
llvm::Value *sextOp = sext->getOperand(0);
// Check the sext target.
llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore);
if (unif == NULL)
return NULL;
// make a new sext instruction so that we end up with the right
// type
*vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
return unif;
}
std::vector<llvm::PHINode *> phis;
if (LLVMVectorValuesAllEqual(*vec, g->target.vectorWidth, phis)) {
// FIXME: we may want to redo all of the expression here, in scalar
// form (if at all possible), for code quality...
llvm::Value *unif =
llvm::ExtractElementInst::Create(*vec, LLVMInt32(0),
"first_uniform", insertBefore);
*vec = NULL;
return unif;
}
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
if (bop == NULL)
return NULL;
llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
if (bop->getOpcode() == llvm::Instruction::Add) {
llvm::Value *s0 = lExtractUniforms(&op0, insertBefore);
llvm::Value *s1 = lExtractUniforms(&op1, insertBefore);
if (s0 == NULL && s1 == NULL)
return NULL;
if (op0 == NULL)
*vec = op1;
else if (op1 == NULL)
*vec = op0;
else
*vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
op0, op1, "new_add", insertBefore);
if (s0 == NULL)
return s1;
else if (s1 == NULL)
return s0;
else
return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1,
"add_unif", insertBefore);
}
#if 0
else if (bop->getOpcode() == llvm::Instruction::Mul) {
// Check each operand for being one of the scale factors we care about.
int splat;
if (lIs248Splat(op0, &splat)) {
*vec = op1;
return LLVMInt32(splat);
}
else if (lIs248Splat(op1, &splat)) {
*vec = op0;
return LLVMInt32(splat);
}
else
return LLVMInt32(1);
}
#endif
else
return NULL;
}
static void
lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector,
llvm::Value *offsetScale,
llvm::Instruction *insertBefore) {
#if 1
(*basePtr)->dump();
printf("\n");
(*offsetVector)->dump();
printf("\n");
offsetScale->dump();
printf("-----\n");
#endif
llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore);
if (uniformDelta == NULL)
return;
llvm::Value *index[1] = { uniformDelta };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
*basePtr = llvm::GetElementPtrInst::Create(*basePtr, arrayRef, "new_base",
insertBefore);
#else
*basePtr = llvm::GetElementPtrInst::Create(*basePtr, &index[0],
&index[1], "new_base",
insertBefore);
#endif
// this should only happen if we have only uniforms, but that in turn
// shouldn't be a gather/scatter!
Assert(*offsetVector != NULL);
}
#endif
struct GSInfo {
GSInfo(const char *pgFuncName, const char *pgboFuncName,
@@ -1367,7 +1659,24 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// to the next instruction...
continue;
llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
// Try to decompose the offset vector into a compile time constant
// component and a varying component. The constant component is
// passed as a separate parameter to the gather/scatter functions,
// which in turn allows their implementations to end up emitting
// x86 instructions with constant offsets encoded in them.
llvm::Value *constOffset, *variableOffset;
lExtractConstantOffset(offsetVector, &constOffset, &variableOffset,
callInst);
if (constOffset == NULL)
constOffset = lGetZeroOffsetVector(offsetVector);
if (variableOffset == NULL)
variableOffset = lGetZeroOffsetVector(offsetVector);
// See if the varying component is scaled by 2, 4, or 8. If so,
// extract that scale factor and rewrite variableOffset to remove
// it. (This also is pulled out so that we can match the scales by
// 2/4/8 offered by x86 addressing operators.)
llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset);
// Cast the base pointer to a void *, since that's what the
// __pseudo_*_base_offsets_* functions want.
@@ -1386,11 +1695,15 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// walk past the sext to get the i32 offset values and then
// call out to the corresponding 32-bit gather/scatter
// function.
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(offsetVector);
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(variableOffset);
if (sext != NULL &&
sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) {
offsetVector = sext->getOperand(0);
variableOffset = sext->getOperand(0);
gatherScatterFunc = info->baseOffsets32Func;
if (constOffset->getType() != LLVMTypes::Int32VectorType)
constOffset =
new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
"trunc_const_offset", callInst);
}
}
@@ -1403,8 +1716,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// the instruction isn't inserted into a basic block and that
// way we can then call ReplaceInstWithInst().
llvm::Instruction *newCall =
lCallInst(gatherScatterFunc, basePtr, offsetVector, offsetScale,
mask, "newgather", NULL);
lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
constOffset, mask, "newgather", NULL);
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
}
@@ -1416,8 +1729,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// base+offsets instruction. See above for why passing NULL
// for the Instruction * is intended.
llvm::Instruction *newCall =
lCallInst(gatherScatterFunc, basePtr, offsetVector, offsetScale,
storeValue, mask, "", NULL);
lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
constOffset, storeValue, mask, "", NULL);
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
}
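
Putting the pieces of this pass together on a made-up example: for a gather
whose byte offsets are computed as 8*index + 32, lExtractConstantOffset returns
constOffset = <32, ...> and variableOffset = 8*index, lExtractOffsetVector248Scale
then pulls the factor 8 out of the variable part, and the rewritten call passes
(base, index, 8, <32, ...>, mask). The target-side code sketched earlier then
computes base + 8*index + 32, which x86 can encode in a single addressing mode.
A scalar check of that algebra:

#include <cassert>
#include <cstdint>

// Made-up end-to-end example: original per-lane byte offset 8*index + 32
// decomposes into (variableOffset = index, offset_scale = 8, offset_delta = 32),
// and the address computed from the decomposed form matches the original.
int main() {
    const int64_t index = 5;                        // one lane's varying index (made up)
    const int64_t original_offset = 8 * index + 32; // offset as computed before this pass
    const int64_t offset_scale = 8;                 // factor from lExtractOffsetVector248Scale
    const int64_t offset_delta = 32;                // constant from lExtractConstantOffset
    assert(offset_scale * index + offset_delta == original_offset);
    return 0;
}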
@@ -2016,6 +2329,26 @@ struct GatherImpInfo {
};
static llvm::Value *
lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets,
llvm::Instruction *insertBefore) {
llvm::Value *firstOffset =
llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
insertBefore);
llvm::Value *offsetIndex[1] = { firstOffset };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&offsetIndex[0], &offsetIndex[1]);
return
llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", insertBefore);
#else
return
llvm::GetElementPtrInst::Create(base, &offsetIndex[0], &offsetIndex[1],
"ptr", insertBefore);
#endif
}
struct ScatterImpInfo {
ScatterImpInfo(const char *pName, const char *msName,
LLVM_TYPE_CONST llvm::Type *vpt, int a)
@@ -2109,45 +2442,42 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
Assert(ok);
llvm::Value *base = callInst->getArgOperand(0);
llvm::Value *offsets = callInst->getArgOperand(1);
llvm::Value *varyingOffsets = callInst->getArgOperand(1);
llvm::Value *offsetScale = callInst->getArgOperand(2);
llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(3) : NULL;
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
llvm::Value *constOffsets = callInst->getArgOperand(3);
llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(4) : NULL;
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5);
// Compute the full offset vector: offsetScale * varyingOffsets + constOffsets
llvm::ConstantInt *offsetScaleInt =
llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
Assert(offsetScaleInt != NULL);
uint64_t scaleValue = offsetScaleInt->getZExtValue();
if (offsets->getType() == LLVMTypes::Int64VectorType)
// offsetScale is an i32, so sext it so that if we use it in a
// multiply below, it has the same type as the i64 offset used
// as the other operand...
offsetScale = new llvm::SExtInst(offsetScale, LLVMTypes::Int64Type,
"offset_sext", callInst);
std::vector<llvm::Constant *> scales;
for (int i = 0; i < g->target.vectorWidth; ++i) {
if (varyingOffsets->getType() == LLVMTypes::Int64VectorType)
scales.push_back(LLVMInt64(scaleValue));
else
scales.push_back(LLVMInt32(scaleValue));
}
llvm::Constant *offsetScaleVec = llvm::ConstantVector::get(scales);
llvm::Value *scaledVarying =
llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
varyingOffsets, "scaled_varying", callInst);
llvm::Value *fullOffsets =
llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying,
constOffsets, "varying+const_offsets",
callInst);
{
std::vector<llvm::PHINode *> seenPhis;
if (LLVMVectorValuesAllEqual(offsets, g->target.vectorWidth, seenPhis)) {
if (LLVMVectorValuesAllEqual(fullOffsets, g->target.vectorWidth, seenPhis)) {
// If all the offsets are equal, then compute the single
// pointer they all represent based on the first one of them
// (arbitrarily).
// FIXME: the code from here to where ptr is computed is highly
// redundant with the case for a vector linear below.
llvm::Value *firstOffset =
llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
callInst);
llvm::Value *indices[1] = { firstOffset };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
llvm::Value *ptr =
llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", callInst);
#else
llvm::Value *ptr =
llvm::GetElementPtrInst::Create(base, &indices[0], &indices[1],
"ptr", callInst);
#endif
llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
lCopyMetadata(ptr, callInst);
if (gatherInfo != NULL) {
@@ -2175,9 +2505,11 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::ExtractElementInst::Create(storeValue, LLVMInt32(0), "rvalue_first",
callInst);
lCopyMetadata(first, callInst);
ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0),
"ptr2rvalue_type", callInst);
lCopyMetadata(ptr, callInst);
llvm::Instruction *sinst = new llvm::StoreInst(first, ptr, false,
scatterInfo->align);
lCopyMetadata(sinst, callInst);
@@ -2190,34 +2522,15 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
}
int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
step /= (int)offsetScaleInt->getZExtValue();
std::vector<llvm::PHINode *> seenPhis;
if (step > 0 && lVectorIsLinear(offsets, g->target.vectorWidth,
if (step > 0 && lVectorIsLinear(fullOffsets, g->target.vectorWidth,
step, seenPhis)) {
// We have a linear sequence of memory locations being accessed
// starting with the location given by the offset from
// offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
// and 64 bit gather/scatters, respectively.)
// Get the base pointer using the first guy's offset.
llvm::Value *firstOffset =
llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
callInst);
llvm::Value *scaledOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset,
offsetScale, "scaled_offset", callInst);
llvm::Value *indices[1] = { scaledOffset };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
llvm::Value *ptr =
llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", callInst);
#else
llvm::Value *ptr =
llvm::GetElementPtrInst::Create(base, &indices[0], &indices[1],
"ptr", callInst);
#endif
llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
lCopyMetadata(ptr, callInst);
if (gatherInfo != NULL) {