Extract constant offsets from gather/scatter base+offsets offset vectors.

When we're able to turn a general gather/scatter into the "base + offsets"
form, we now try to extract out any constant components of the offsets and
then pass them as a separate parameter to the gather/scatter function
implementation.

We then emit the code for the addressing calculation carefully, so that it
matches the patterns LLVM uses to detect this case; as a result, the constant
offsets are often encoded directly in the instruction's addressing calculation,
saving the arithmetic instructions that would otherwise compute them.
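
As a rough illustration of the addressing-mode benefit (a made-up scalar
example, not code from this commit): when the constant part of an offset is
known, the compiler can fold it into the displacement field of a single x86
"base + index*scale + disp" address rather than emitting separate adds for it.

#include <cstdint>

// Hypothetical scalar equivalent of one gather lane: the constant element
// offset (+3 elements, i.e. +12 bytes for int32) can become the displacement
// of the load's addressing mode, e.g. mov eax, [rdi + rsi*4 + 12].
static inline int32_t load_lane(const int32_t *base, int64_t index) {
    return base[index + 3];   // base + index*4 + 12: one address calculation, no extra add
}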

Improves the performance of the stencil workload by ~15%; other workloads are unchanged.
Matt Pharr
2012-01-24 14:41:15 -08:00
commit a5b7fca7e0 (parent 7be2c399b1)
5 changed files with 614 additions and 355 deletions


@@ -289,18 +289,18 @@ declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,


@@ -1565,17 +1565,15 @@ declare void @__pseudo_masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>,
; these represent gathers from a common base pointer with offsets. The
; offset_scale factor scales the offsets before they are added to the base
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling
; available in x86 addressing calculations...
; Then, the offset_delta value (guaranteed to be a compile-time constant)
; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns
; that use the free 2/4/8 scaling available in x86 addressing calculations, and
; offset_delta feeds into the free offset calculation.
;
; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int{8,16,32,64}
; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; Then, the GSImprovementsPass optimization finds these and either
; converts them to native gather functions or converts them to vector
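
A minimal scalar sketch of the per-lane semantics these declarations now imply
(illustrative function and parameter names, 32-bit offsets and values only; the
real masked vector implementations appear in the target files later in this diff):

#include <cstdint>

// Per active lane: address = base + offset_scale * offsets[i] + offset_delta[i],
// all in bytes; offset_delta carries the constant component extracted by the
// new optimization.
static void gather_base_offsets_ref(const uint8_t *base, const int32_t offsets[],
                                    int32_t offset_scale, const int32_t offset_delta[],
                                    const bool mask[], int32_t result[], int width) {
    for (int i = 0; i < width; ++i)
        if (mask[i])
            result[i] = *(const int32_t *)(base + offset_scale * offsets[i] +
                                           offset_delta[i]);
}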
@@ -1591,22 +1589,22 @@ declare <WIDTH x i16> @__pseudo_gather64_16(<WIDTH x i64>, <WIDTH x MASK>) nounw
declare <WIDTH x i32> @__pseudo_gather64_32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather64_64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
; Similarly to the pseudo-gathers defined above, we also declare undefined
@@ -1621,13 +1619,9 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i3
; transforms them to scatters like:
;
; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base,
; varying int32 offsets, int32 offset_scale, varying int8 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base,
; varying int32 offsets, int32 offset_scale, varying int16 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base,
; varying int32 offsets, int32 offset_scale, varying int32 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base,
; varying int32 offsets, int32 offset_scale, varying int64 values, mask)
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; And the GSImprovementsPass in turn converts these to actual native
; scatters or masked stores.
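
The store-side counterpart of the gather sketch above (again illustrative
only): each active lane writes values[i] to base + offset_scale * offsets[i] +
offset_delta[i].

#include <cstdint>

// Scalar sketch of the __pseudo_scatter_base_offsets semantics after this change.
static void scatter_base_offsets_ref(uint8_t *base, const int32_t offsets[],
                                     int32_t offset_scale, const int32_t offset_delta[],
                                     const int32_t values[], const bool mask[], int width) {
    for (int i = 0; i < width; ++i)
        if (mask[i])
            *(int32_t *)(base + offset_scale * offsets[i] + offset_delta[i]) = values[i];
}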
@@ -1642,22 +1636,22 @@ declare void @__pseudo_scatter64_16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>
declare void @__pseudo_scatter64_32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2701,7 +2695,8 @@ define(`gen_gather', `
;; Define the utility function to do the gather operation for a single element
;; of the type
define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
@@ -2711,15 +2706,20 @@ define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
}
define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
@@ -2728,8 +2728,11 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
%offset = mul i64 %offset64, %offset_scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
@@ -2737,6 +2740,7 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x i32> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2749,16 +2753,25 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i32> * %offsetsPtr
%deltaPtr = alloca <$1 x i32>
store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr
call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i32> * %deltaPtr
%ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i32> %newDelta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %newDelta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
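
The alloca/masked-store-blend sequence above zeroes the offsets and deltas of
inactive lanes so that every lane can be loaded unconditionally; the caveat in
the comment is that element 0 of the source array must always be safe to read.
A scalar sketch of that trick (illustrative only):

#include <cstdint>

// "Fast gather" trick: inactive lanes read from base + 0 rather than being
// skipped, so no per-lane branching is needed.  Assumes base[0] is readable.
static void gather_blend_ref(const uint8_t *base, const int32_t offsets[],
                             const int32_t delta[], const bool mask[],
                             int32_t offset_scale, int32_t result[], int width) {
    for (int i = 0; i < width; ++i) {
        int32_t off = mask[i] ? offsets[i] : 0;
        int32_t d   = mask[i] ? delta[i]   : 0;
        result[i] = *(const int32_t *)(base + offset_scale * off + d);
    }
}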
define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x i64> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2771,11 +2784,19 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i64> * %offsetsPtr
%deltaPtr = alloca <$1 x i64>
store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr
call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i64> * %deltaPtr
%ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
@@ -2826,7 +2847,8 @@ define(`gen_scatter', `
;; Define the function that describes the work to do to scatter a single
;; value
define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
@@ -2835,42 +2857,52 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}
define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
%scale64 = sext i32 %offset_scale to i64
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *
%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}
define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}
define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}


@@ -1060,13 +1060,15 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
__vec16_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1104,13 +1106,15 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
*ptr = val.v[i]; \
} \
}


@@ -51,8 +51,8 @@
#define FORCEINLINE __attribute__((always_inline)) inline
#endif
//CO#undef FORCEINLINE
//CO#define FORCEINLINE
#undef FORCEINLINE
#define FORCEINLINE
typedef float __vec1_f;
typedef double __vec1_d;
@@ -2612,52 +2612,54 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val,
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
__vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) {
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __smear_i32(0));
int offset = scale * _mm_extract_epi32(offsets.v, 0);
constOffset = __select(mask, constOffset, __smear_i32(0));
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 1);
offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 2);
offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 3);
offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1);
int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2);
int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3);
int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -2665,54 +2667,57 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
return RetVec(r[0], r[1], r[2], r[3]);
}
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __smear_i64(0));
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
constOffset = __select(mask, constOffset, __smear_i64(0));
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[0], 1);
offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 0);
offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 1);
offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -2723,80 +2728,89 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
static FORCEINLINE __vec4_i8
__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i8
__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i16
__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i16
__gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i32
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
__vec4_i32 constOffset, __vec4_i1 mask) {
__m128i r = _mm_set_epi32(0, 0, 0, 0);
#if 1
// "Fast gather"...
offsets = __select(mask, offsets, __smear_i32(0));
constOffset = __select(mask, constOffset, __smear_i32(0));
int offset = scale * _mm_extract_epi32(offsets.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 0);
offset = scale * _mm_extract_epi32(offsets.v, 1);
offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 1);
offset = scale * _mm_extract_epi32(offsets.v, 2);
offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 2);
offset = scale * _mm_extract_epi32(offsets.v, 3);
offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 3);
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1);
int offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2);
int offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3);
int offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 3);
}
@@ -2806,23 +2820,23 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
static FORCEINLINE __vec4_i32
__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
mask);
delta, mask);
}
static FORCEINLINE __vec4_i64
__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
mask);
delta, mask);
}
static FORCEINLINE __vec4_i64
__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
mask);
delta, mask);
}
template<typename RetVec, typename RetScalar>
@@ -2969,217 +2983,108 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) {
// scatter
static FORCEINLINE void
__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
*ptr = _mm_extract_epi8(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
*ptr = _mm_extract_epi8(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
*ptr = _mm_extract_epi8(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
*ptr = _mm_extract_epi8(val.v, 3);
}
#define SCATTER32_64(SUFFIX, TYPE, EXTRACT) \
static FORCEINLINE void \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
uint32_t scale, __vec4_i32 constOffset, \
__vec4_##SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \
_mm_extract_epi32(constOffset.v, 0)); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \
_mm_extract_epi32(constOffset.v, 1)); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \
_mm_extract_epi32(constOffset.v, 2)); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \
_mm_extract_epi32(constOffset.v, 3)); \
*ptr = EXTRACT(val.v, 3); \
} \
} \
static FORCEINLINE void \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
uint32_t scale, __vec4_i64 constOffset, \
__vec4_##SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \
_mm_extract_epi64(constOffset.v[0], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \
_mm_extract_epi64(constOffset.v[0], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \
_mm_extract_epi64(constOffset.v[1], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \
_mm_extract_epi64(constOffset.v[1], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 3); \
} \
}
static FORCEINLINE void
__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 1);
}
SCATTER32_64(i8, int8_t, _mm_extract_epi8)
SCATTER32_64(i16, int16_t, _mm_extract_epi16)
SCATTER32_64(i32, int32_t, _mm_extract_epi32)
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
*ptr = _mm_extract_epi16(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
*ptr = _mm_extract_epi16(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
*ptr = _mm_extract_epi16(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
*ptr = _mm_extract_epi16(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 0));
*ptr = _mm_extract_epi32(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 1));
*ptr = _mm_extract_epi32(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 2));
*ptr = _mm_extract_epi32(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 3));
*ptr = _mm_extract_epi32(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val,
__vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
@@ -3187,31 +3092,36 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
static FORCEINLINE void
__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset,
__vec4_i64 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
_mm_extract_epi64(constOffset.v[0], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +
_mm_extract_epi64(constOffset.v[0], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +
_mm_extract_epi64(constOffset.v[1], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +
_mm_extract_epi64(constOffset.v[1], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}

opt.cpp

@@ -205,6 +205,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
}
#if 0
static llvm::Instruction *
lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
llvm::Value *arg2, llvm::Value *arg3, const char *name,
@@ -218,7 +219,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
name, insertBefore);
#endif
}
#endif
static llvm::Instruction *
lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
@@ -234,6 +235,21 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
#endif
}
static llvm::Instruction *
lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
llvm::Value *arg5, const char *name,
llvm::Instruction *insertBefore = NULL) {
llvm::Value *args[6] = { arg0, arg1, arg2, arg3, arg4, arg5 };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
#else
return llvm::CallInst::Create(func, &args[0], &args[6],
name, insertBefore);
#endif
}
///////////////////////////////////////////////////////////////////////////
void
@@ -302,10 +318,13 @@ Optimize(llvm::Module *module, int optLevel) {
// Early optimizations to try to reduce the total amount of code to
// work with if we can
optPM.add(CreateDetectGSBaseOffsetsPass());
optPM.add(llvm::createReassociatePass());
optPM.add(llvm::createConstantPropagationPass());
optPM.add(llvm::createConstantPropagationPass());
optPM.add(llvm::createDeadInstEliminationPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(CreateDetectGSBaseOffsetsPass());
if (!g->opt.disableMaskAllOnOptimizations) {
optPM.add(CreateIntrinsicsOptPass());
optPM.add(CreateVSelMovmskOptPass());
@@ -314,11 +333,7 @@ Optimize(llvm::Module *module, int optLevel) {
}
optPM.add(llvm::createDeadInstEliminationPass());
optPM.add(llvm::createConstantPropagationPass());
optPM.add(llvm::createDeadInstEliminationPass());
// On to more serious optimizations
optPM.add(llvm::createCFGSimplificationPass());
if (runSROA)
optPM.add(llvm::createScalarReplAggregatesPass());
optPM.add(llvm::createInstructionCombiningPass());
@@ -1173,6 +1188,166 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets,
}
static llvm::Value *
lGetZeroOffsetVector(llvm::Value *origVec) {
if (origVec->getType() == LLVMTypes::Int32VectorType)
return LLVMInt32Vector((int32_t)0);
else
return LLVMInt64Vector((int64_t)0);
}
#if 0
static void
lPrint(llvm::Value *v, int indent = 0) {
if (llvm::isa<llvm::PHINode>(v))
return;
fprintf(stderr, "%*c", indent, ' ');
v->dump();
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
if (inst != NULL) {
for (int i = 0; i < (int)inst->getNumOperands(); ++i) {
llvm::Value *op = inst->getOperand(i);
if (llvm::isa<llvm::Constant>(op) == false)
lPrint(op, indent+4);
}
}
}
#endif
/** Given a vector expression in vec, separate it into a compile-time
constant component and a variable component, returning the two parts in
*constOffset and *variableOffset. (It should be the case that the sum
of these two is exactly equal to the original vector.)
This routine only handles some (important) patterns; in some cases it
will fail and return components that are actually compile-time
constants in *variableOffset.
Finally, if there aren't any constant (or, respectively, variable)
components, the corresponding return value may be set to NULL.
*/
static void
lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
llvm::Value **variableOffset,
llvm::Instruction *insertBefore) {
if (llvm::isa<llvm::ConstantVector>(vec) ||
llvm::isa<llvm::ConstantAggregateZero>(vec)) {
*constOffset = vec;
*variableOffset = NULL;
return;
}
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(vec);
if (sext != NULL) {
// Check the sext target.
llvm::Value *co, *vo;
lExtractConstantOffset(sext->getOperand(0), &co, &vo, insertBefore);
// make new sext instructions for the two parts
if (co == NULL)
*constOffset = NULL;
else
*constOffset = new llvm::SExtInst(co, sext->getType(),
"const_offset_sext", insertBefore);
if (vo == NULL)
*variableOffset = NULL;
else
*variableOffset = new llvm::SExtInst(vo, sext->getType(),
"variable_offset_sext",
insertBefore);
return;
}
// FIXME? handle bitcasts / type casts here
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(vec);
if (bop != NULL) {
llvm::Value *op0 = bop->getOperand(0);
llvm::Value *op1 = bop->getOperand(1);
llvm::Value *c0, *v0, *c1, *v1;
if (bop->getOpcode() == llvm::Instruction::Add) {
lExtractConstantOffset(op0, &c0, &v0, insertBefore);
lExtractConstantOffset(op1, &c1, &v1, insertBefore);
if (c0 == NULL)
*constOffset = c1;
else if (c1 == NULL)
*constOffset = c0;
else
*constOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1,
"const_op", insertBefore);
if (v0 == NULL)
*variableOffset = v1;
else if (v1 == NULL)
*variableOffset = v0;
else
*variableOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1,
"variable_op", insertBefore);
return;
}
else if (bop->getOpcode() == llvm::Instruction::Mul) {
lExtractConstantOffset(op0, &c0, &v0, insertBefore);
lExtractConstantOffset(op1, &c1, &v1, insertBefore);
// Given the product of constant and variable terms, we have:
// (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1)
// Note that the first term is a constant and the last three are
// variable.
if (c0 != NULL && c1 != NULL)
*constOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1,
"const_mul", insertBefore);
else
*constOffset = NULL;
llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
if (v0 != NULL && c1 != NULL)
va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1,
"va_mul", insertBefore);
if (c0 != NULL && v1 != NULL)
vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1,
"vb_mul", insertBefore);
if (v0 != NULL && v1 != NULL)
vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1,
"vc_mul", insertBefore);
llvm::Value *vab = NULL;
if (va != NULL && vb != NULL)
vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb,
"vab_add", insertBefore);
else if (va != NULL)
vab = va;
else
vab = vb;
if (vab != NULL && vc != NULL)
*variableOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc,
"vabc_add", insertBefore);
else if (vab != NULL)
*variableOffset = vab;
else
*variableOffset = vc;
return;
}
}
// Nothing matched, just return what we have as a variable component
*constOffset = NULL;
*variableOffset = vec;
}
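
As a quick numeric illustration of the invariant promised in the comment above
(the constant and variable parts sum back to the original value), here is the
multiply decomposition checked per lane with made-up numbers:

#include <cassert>

// Illustrative check of the decomposition used for Mul above, per lane:
// (c0 + v0) * (c1 + v1) == c0*c1 + (v0*c1 + c0*v1 + v0*v1)
// where c0*c1 is the constant part and the rest is the variable part.
int main() {
    int c0 = 16, c1 = 4;      // compile-time constant components (made up)
    int v0 = 7,  v1 = 3;      // varying components (made up)
    int original  = (c0 + v0) * (c1 + v1);
    int constPart = c0 * c1;
    int varPart   = v0 * c1 + c0 * v1 + v0 * v1;
    assert(constPart + varPart == original);
    return 0;
}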
/* Returns true if the given value is a constant vector of integers with
the value 2, 4, 8 in all of the elements. (Returns the splatted value
in *splat, if so). */
@@ -1277,6 +1452,123 @@ lExtractOffsetVector248Scale(llvm::Value **vec) {
return LLVMInt32(1);
}
#if 0
static llvm::Value *
lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
fprintf(stderr, " lextract: ");
(*vec)->dump();
fprintf(stderr, "\n");
if (llvm::isa<llvm::ConstantVector>(*vec) ||
llvm::isa<llvm::ConstantAggregateZero>(*vec))
return NULL;
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
if (sext != NULL) {
llvm::Value *sextOp = sext->getOperand(0);
// Check the sext target.
llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore);
if (unif == NULL)
return NULL;
// make a new sext instruction so that we end up with the right
// type
*vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
return unif;
}
std::vector<llvm::PHINode *> phis;
if (LLVMVectorValuesAllEqual(*vec, g->target.vectorWidth, phis)) {
// FIXME: we may want to redo all of the expression here, in scalar
// form (if at all possible), for code quality...
llvm::Value *unif =
llvm::ExtractElementInst::Create(*vec, LLVMInt32(0),
"first_uniform", insertBefore);
*vec = NULL;
return unif;
}
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
if (bop == NULL)
return NULL;
llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
if (bop->getOpcode() == llvm::Instruction::Add) {
llvm::Value *s0 = lExtractUniforms(&op0, insertBefore);
llvm::Value *s1 = lExtractUniforms(&op1, insertBefore);
if (s0 == NULL && s1 == NULL)
return NULL;
if (op0 == NULL)
*vec = op1;
else if (op1 == NULL)
*vec = op0;
else
*vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
op0, op1, "new_add", insertBefore);
if (s0 == NULL)
return s1;
else if (s1 == NULL)
return s0;
else
return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1,
"add_unif", insertBefore);
}
#if 0
else if (bop->getOpcode() == llvm::Instruction::Mul) {
// Check each operand for being one of the scale factors we care about.
int splat;
if (lIs248Splat(op0, &splat)) {
*vec = op1;
return LLVMInt32(splat);
}
else if (lIs248Splat(op1, &splat)) {
*vec = op0;
return LLVMInt32(splat);
}
else
return LLVMInt32(1);
}
#endif
else
return NULL;
}
static void
lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector,
llvm::Value *offsetScale,
llvm::Instruction *insertBefore) {
#if 1
(*basePtr)->dump();
printf("\n");
(*offsetVector)->dump();
printf("\n");
offsetScale->dump();
printf("-----\n");
#endif
llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore);
if (uniformDelta == NULL)
return;
llvm::Value *index[1] = { uniformDelta };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
*basePtr = llvm::GetElementPtrInst::Create(*basePtr, arrayRef, "new_base",
insertBefore);
#else
*basePtr = llvm::GetElementPtrInst::Create(*basePtr, &index[0],
&index[1], "new_base",
insertBefore);
#endif
// this should only happen if we have only uniforms, but that in turn
// shouldn't be a gather/scatter!
Assert(*offsetVector != NULL);
}
#endif
struct GSInfo {
GSInfo(const char *pgFuncName, const char *pgboFuncName,
@@ -1367,7 +1659,24 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// to the next instruction...
continue;
llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
// Try to decompose the offset vector into a compile time constant
// component and a varying component. The constant component is
// passed as a separate parameter to the gather/scatter functions,
// which in turn allows their implementations to end up emitting
// x86 instructions with constant offsets encoded in them.
llvm::Value *constOffset, *variableOffset;
lExtractConstantOffset(offsetVector, &constOffset, &variableOffset,
callInst);
if (constOffset == NULL)
constOffset = lGetZeroOffsetVector(offsetVector);
if (variableOffset == NULL)
variableOffset = lGetZeroOffsetVector(offsetVector);
// See if the varying component is scaled by 2, 4, or 8. If so,
// extract that scale factor and rewrite variableOffset to remove
// it. (This also is pulled out so that we can match the scales by
// 2/4/8 offered by x86 addressing operators.)
llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset);
// Cast the base pointer to a void *, since that's what the
// __pseudo_*_base_offsets_* functions want.
@@ -1386,11 +1695,15 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// walk past the sext to get the i32 offset values and then
// call out to the corresponding 32-bit gather/scatter
// function.
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(offsetVector);
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(variableOffset);
if (sext != NULL &&
sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) {
offsetVector = sext->getOperand(0);
variableOffset = sext->getOperand(0);
gatherScatterFunc = info->baseOffsets32Func;
if (constOffset->getType() != LLVMTypes::Int32VectorType)
constOffset =
new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
"trunc_const_offset", callInst);
}
}
@@ -1403,8 +1716,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// the instruction isn't inserted into a basic block and that
// way we can then call ReplaceInstWithInst().
llvm::Instruction *newCall =
lCallInst(gatherScatterFunc, basePtr, offsetVector, offsetScale,
mask, "newgather", NULL);
lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
constOffset, mask, "newgather", NULL);
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
}
@@ -1416,8 +1729,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// base+offsets instruction. See above for why passing NULL
// for the Instruction * is intended.
llvm::Instruction *newCall =
lCallInst(gatherScatterFunc, basePtr, offsetVector, offsetScale,
storeValue, mask, "", NULL);
lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
constOffset, storeValue, mask, "", NULL);
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
}
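
Putting the pieces of this pass together on a made-up example: for a gather
whose byte offsets are computed as 8*index + 32, lExtractConstantOffset returns
constOffset = <32, ...> and variableOffset = 8*index, lExtractOffsetVector248Scale
then pulls the factor 8 out of the variable part, and the rewritten call passes
(base, index, 8, <32, ...>, mask). The target-side code sketched earlier then
computes base + 8*index + 32, which x86 can encode in a single addressing mode.
A scalar check of that algebra:

#include <cassert>
#include <cstdint>

// Made-up end-to-end example: original per-lane byte offset 8*index + 32
// decomposes into (variableOffset = index, offset_scale = 8, offset_delta = 32),
// and the address computed from the decomposed form matches the original.
int main() {
    const int64_t index = 5;                        // one lane's varying index (made up)
    const int64_t original_offset = 8 * index + 32; // offset as computed before this pass
    const int64_t offset_scale = 8;                 // factor from lExtractOffsetVector248Scale
    const int64_t offset_delta = 32;                // constant from lExtractConstantOffset
    assert(offset_scale * index + offset_delta == original_offset);
    return 0;
}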
@@ -2016,6 +2329,26 @@ struct GatherImpInfo {
};
static llvm::Value *
lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets,
llvm::Instruction *insertBefore) {
llvm::Value *firstOffset =
llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
insertBefore);
llvm::Value *offsetIndex[1] = { firstOffset };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&offsetIndex[0], &offsetIndex[1]);
return
llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", insertBefore);
#else
return
llvm::GetElementPtrInst::Create(base, &offsetIndex[0], &offsetIndex[1],
"ptr", insertBefore);
#endif
}
struct ScatterImpInfo {
ScatterImpInfo(const char *pName, const char *msName,
LLVM_TYPE_CONST llvm::Type *vpt, int a)
@@ -2109,45 +2442,42 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
Assert(ok);
llvm::Value *base = callInst->getArgOperand(0);
llvm::Value *offsets = callInst->getArgOperand(1);
llvm::Value *varyingOffsets = callInst->getArgOperand(1);
llvm::Value *offsetScale = callInst->getArgOperand(2);
llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(3) : NULL;
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
llvm::Value *constOffsets = callInst->getArgOperand(3);
llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(4) : NULL;
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5);
// Compute the full offset vector: offsetScale * varyingOffsets + constOffsets
llvm::ConstantInt *offsetScaleInt =
llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
Assert(offsetScaleInt != NULL);
uint64_t scaleValue = offsetScaleInt->getZExtValue();
if (offsets->getType() == LLVMTypes::Int64VectorType)
// offsetScale is an i32, so sext it so that if we use it in a
// multiply below, it has the same type as the i64 offset used
// as the other operand...
offsetScale = new llvm::SExtInst(offsetScale, LLVMTypes::Int64Type,
"offset_sext", callInst);
std::vector<llvm::Constant *> scales;
for (int i = 0; i < g->target.vectorWidth; ++i) {
if (varyingOffsets->getType() == LLVMTypes::Int64VectorType)
scales.push_back(LLVMInt64(scaleValue));
else
scales.push_back(LLVMInt32(scaleValue));
}
llvm::Constant *offsetScaleVec = llvm::ConstantVector::get(scales);
llvm::Value *scaledVarying =
llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
varyingOffsets, "scaled_varying", callInst);
llvm::Value *fullOffsets =
llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying,
constOffsets, "varying+const_offsets",
callInst);
{
std::vector<llvm::PHINode *> seenPhis;
if (LLVMVectorValuesAllEqual(offsets, g->target.vectorWidth, seenPhis)) {
if (LLVMVectorValuesAllEqual(fullOffsets, g->target.vectorWidth, seenPhis)) {
// If all the offsets are equal, then compute the single
// pointer they all represent based on the first one of them
// (arbitrarily).
// FIXME: the code from here to where ptr is computed is highly
// redundant with the case for a vector linear below.
llvm::Value *firstOffset =
llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
callInst);
llvm::Value *indices[1] = { firstOffset };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
llvm::Value *ptr =
llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", callInst);
#else
llvm::Value *ptr =
llvm::GetElementPtrInst::Create(base, &indices[0], &indices[1],
"ptr", callInst);
#endif
llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
lCopyMetadata(ptr, callInst);
if (gatherInfo != NULL) {
@@ -2175,9 +2505,11 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::ExtractElementInst::Create(storeValue, LLVMInt32(0), "rvalue_first",
callInst);
lCopyMetadata(first, callInst);
ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0),
"ptr2rvalue_type", callInst);
lCopyMetadata(ptr, callInst);
llvm::Instruction *sinst = new llvm::StoreInst(first, ptr, false,
scatterInfo->align);
lCopyMetadata(sinst, callInst);
@@ -2190,34 +2522,15 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
}
int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
step /= (int)offsetScaleInt->getZExtValue();
std::vector<llvm::PHINode *> seenPhis;
if (step > 0 && lVectorIsLinear(offsets, g->target.vectorWidth,
if (step > 0 && lVectorIsLinear(fullOffsets, g->target.vectorWidth,
step, seenPhis)) {
// We have a linear sequence of memory locations being accessed
// starting with the location given by the offset from
// offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
// and 64 bit gather/scatters, respectively.)
// Get the base pointer using the first guy's offset.
llvm::Value *firstOffset =
llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
callInst);
llvm::Value *scaledOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset,
offsetScale, "scaled_offset", callInst);
llvm::Value *indices[1] = { scaledOffset };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
llvm::Value *ptr =
llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", callInst);
#else
llvm::Value *ptr =
llvm::GetElementPtrInst::Create(base, &indices[0], &indices[1],
"ptr", callInst);
#endif
llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
lCopyMetadata(ptr, callInst);
if (gatherInfo != NULL) {