diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 533def68..6e280ba6 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -289,18 +289,18 @@ declare void @__masked_store_blend_64(* nocapture, , define(`gather_scatter', ` declare @__gather_base_offsets32_$1(i8 * nocapture, , - i32, ) nounwind readonly + i32, , ) nounwind readonly declare @__gather_base_offsets64_$1(i8 * nocapture, , - i32, ) nounwind readonly + i32, , ) nounwind readonly declare @__gather32_$1(, ) nounwind readonly declare @__gather64_$1(, ) nounwind readonly declare void @__scatter_base_offsets32_$1(i8* nocapture, , - i32, , ) nounwind + i32, , , ) nounwind declare void @__scatter_base_offsets64_$1(i8* nocapture, , - i32, , ) nounwind + i32, , , ) nounwind declare void @__scatter32_$1(, , ) nounwind declare void @__scatter64_$1(, , diff --git a/builtins/util.m4 b/builtins/util.m4 index 64e3a130..36882491 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1565,17 +1565,15 @@ declare void @__pseudo_masked_store_64( * nocapture, , ; these represent gathers from a common base pointer with offsets. The ; offset_scale factor scales the offsets before they are added to the base ; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.) -; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling -; available in x86 addressing calculations... +; Then, the offset delta_value (guaranteed to be a compile-time constant value), +; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns +; that use the free 2/4/8 scaling available in x86 addressing calculations, and +; offset_delta feeds into the free offset calculation. ; -; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base, -; int{32,64} offsets, int32 offset_scale, mask) +; varying int{8,16,32,64} +; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base, +; int{32,64} offsets, uniform int32 offset_scale, +; int{32,64} offset_delta, mask) ; ; Then, the GSImprovementsPass optimizations finds these and either ; converts them to native gather functions or converts them to vector @@ -1591,22 +1589,22 @@ declare @__pseudo_gather64_16(, ) nounw declare @__pseudo_gather64_32(, ) nounwind readonly declare @__pseudo_gather64_64(, ) nounwind readonly -declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, +declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, , + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, +declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, +declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, +declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_16(i8 *, , i32, +declare @__pseudo_gather_base_offsets64_16(i8 *, 
, i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, +declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, +declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, , ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined @@ -1621,13 +1619,9 @@ declare @__pseudo_gather_base_offsets64_64(i8 *, , i3 ; transforms them to scatters like: ; ; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base, -; varying int32 offsets, int32 offset_scale, varying int8 values, mask) -; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base, -; varying int32 offsets, int32 offset_scale, varying int16 values, mask) -; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base, -; varying int32 offsets, int32 offset_scale, varying int32 values, mask) -; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base, -; varying int32 offsets, int32 offset_scale, varying int64 values, mask) +; varying int32 offsets, uniform int32 offset_scale, +; varying int{32,64} offset_delta, varying int8 values, mask) +; (and similarly for 16/32/64 bit values) ; ; And the GSImprovementsPass in turn converts these to actual native ; scatters or masked stores. @@ -1642,22 +1636,22 @@ declare void @__pseudo_scatter64_16(, , declare void @__pseudo_scatter64_32(, , ) nounwind declare void @__pseudo_scatter64_64(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, +declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, , , ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2701,7 +2695,8 @@ define(`gen_gather', ` ;; Define the utility function to do the gather operation for a single element ;; of the type define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline { + <$1 x i32> %offset_delta, <$1 x $2> %ret, + i32 %lane) nounwind readonly alwaysinline { ; compute address for this one from the base %offset32 = extractelement <$1 x i32> %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs @@ -2711,15 +2706,20 @@ define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_ %offset = mul i64 %offset64, %scale64 
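  ; The effective address for this lane is
  ;   %ptr + %offset_scale * %offsets[%lane] + %offset_delta[%lane].
  ; The scaled term is applied by the getelementptr just below and the
  ; per-lane delta by a second getelementptr after it; the sext/mul/gep
  ; sequence is kept in exactly this shape so that LLVM's pattern matching
  ; can fold the scale and a constant delta into a single x86 address,
  ; e.g. base + 4*index + 16 (the 16 is just an illustrative constant).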
%ptroffset = getelementptr i8 * %ptr, i64 %offset + %delta = extractelement <$1 x i32> %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + ; load value and insert into returned value - %ptrcast = bitcast i8 * %ptroffset to $2 * + %ptrcast = bitcast i8 * %finalptr to $2 * %val = load $2 *%ptrcast %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane ret <$1 x $2> %updatedret } define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline { + <$1 x i64> %offset_delta, <$1 x $2> %ret, + i32 %lane) nounwind readonly alwaysinline { ; compute address for this one from the base %offset64 = extractelement <$1 x i64> %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs @@ -2728,8 +2728,11 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_ %offset = mul i64 %offset64, %offset_scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset + %delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + ; load value and insert into returned value - %ptrcast = bitcast i8 * %ptroffset to $2 * + %ptrcast = bitcast i8 * %finalptr to $2 * %val = load $2 *%ptrcast %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane ret <$1 x $2> %updatedret @@ -2737,6 +2740,7 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale, + <$1 x i32> %offset_delta, <$1 x i32> %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always @@ -2749,16 +2753,25 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 <$1 x i32> %vecmask) %newOffsets = load <$1 x i32> * %offsetsPtr + %deltaPtr = alloca <$1 x i32> + store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr + call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta, + <$1 x i32> %vecmask) + %newDelta = load <$1 x i32> * %deltaPtr + %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets, - i32 %offset_scale, <$1 x $2> undef, i32 0) + i32 %offset_scale, <$1 x i32> %offset_delta, + <$1 x $2> undef, i32 0) forloop(lane, 1, eval($1-1), `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, - <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE) + <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %offset_delta, + <$1 x $2> %retPREV, i32 LANE) ', `LANE', lane), `PREV', eval(lane-1))') ret <$1 x $2> %ret`'eval($1-1) } define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale, + <$1 x i64> %offset_delta, <$1 x i32> %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always @@ -2771,11 +2784,19 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 <$1 x i32> %vecmask) %newOffsets = load <$1 x i64> * %offsetsPtr + %deltaPtr = alloca <$1 x i64> + store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr + call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta, + <$1 x i32> %vecmask) + 
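  ; As with %offsets above, this alloca/store/blend/load sequence makes a
  ; copy of %offset_delta in which the lanes that are off in %vecmask are
  ; forced to zero, so inactive lanes end up addressing the start of the
  ; array (which the comment above requires to be safe to read) rather
  ; than arbitrary memory.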
%newDelta = load <$1 x i64> * %deltaPtr + %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets, - i32 %offset_scale, <$1 x $2> undef, i32 0) + i32 %offset_scale, <$1 x i64> %newDelta, + <$1 x $2> undef, i32 0) forloop(lane, 1, eval($1-1), `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, - <$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE) + <$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta, + <$1 x $2> %retPREV, i32 LANE) ', `LANE', lane), `PREV', eval(lane-1))') ret <$1 x $2> %ret`'eval($1-1) } @@ -2826,7 +2847,8 @@ define(`gen_scatter', ` ;; Define the function that descripes the work to do to scatter a single ;; value define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x $2> %values, i32 %lane) nounwind alwaysinline { + <$1 x i32> %offset_delta, <$1 x $2> %values, + i32 %lane) nounwind alwaysinline { %offset32 = extractelement <$1 x i32> %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations @@ -2835,42 +2857,52 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal %offset = mul i64 %offset64, %scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset - %ptrcast = bitcast i8 * %ptroffset to $2 * + %delta = extractelement <$1 x i32> %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $2 * %storeval = extractelement <$1 x $2> %values, i32 %lane store $2 %storeval, $2 * %ptrcast ret void } define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x $2> %values, i32 %lane) nounwind alwaysinline { + <$1 x i64> %offset_delta, <$1 x $2> %values, + i32 %lane) nounwind alwaysinline { %offset64 = extractelement <$1 x i64> %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset - %ptrcast = bitcast i8 * %ptroffset to $2 * + %delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $2 * %storeval = extractelement <$1 x $2> %values, i32 %lane store $2 %storeval, $2 * %ptrcast ret void } define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline { + <$1 x i32> %offset_delta, <$1 x $2> %values, + <$1 x i32> %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... per_lane($1, <$1 x i32> %mask, ` call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x $2> %values, i32 LANE)') + <$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)') ret void } define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline { + <$1 x i64> %offset_delta, <$1 x $2> %values, + <$1 x i32> %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
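;; (per_lane emits a check of each lane of %mask and, for the active lanes,
;; the call below with LANE substituted; __scatter_elt64_$2 then stores
;; %values[LANE] to %base + %offset_scale * %offsets[LANE] + %offset_delta[LANE].)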
per_lane($1, <$1 x i32> %mask, ` call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x $2> %values, i32 LANE)') + <$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)') ret void } diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index ffeb4680..7418f5d6 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1060,13 +1060,15 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val, // offsets * offsetScale is in bytes (for all of these) #define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\ - __vec16_i1 mask) { \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ + uint32_t scale, OTYPE constOffset, \ + __vec16_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ + constOffset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ @@ -1104,13 +1106,15 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64) // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\ +#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ + uint32_t scale, OTYPE constOffset, \ VTYPE val, __vec16_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ + constOffset.v[i]); \ *ptr = val.v[i]; \ } \ } diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 2dc48b06..7a3af6ad 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -51,8 +51,8 @@ #define FORCEINLINE __attribute__((always_inline)) inline #endif -//CO#undef FORCEINLINE -//CO#define FORCEINLINE +#undef FORCEINLINE +#define FORCEINLINE typedef float __vec1_f; typedef double __vec1_d; @@ -2612,52 +2612,54 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val, template static FORCEINLINE RetVec -lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, - __vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) { +lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets, + uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... 
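    // The two __select calls below force the offsets and constOffset of
    // inactive lanes to zero so that all four lanes can be loaded
    // unconditionally; an inactive lane then just reads from the base
    // pointer itself, which this trick assumes is safe.  Illustrative
    // scalar equivalent for lane i:
    //   off  = mask[i] ? offsets[i]     : 0;
    //   coff = mask[i] ? constOffset[i] : 0;
    //   r[i] = *(RetScalar *)(p + scale * off + coff);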
offsets = __select(mask, offsets, __smear_i32(0)); - int offset = scale * _mm_extract_epi32(offsets.v, 0); + constOffset = __select(mask, constOffset, __smear_i32(0)); + + int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 1); + offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 2); + offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 3); + offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 1); + int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 2); + int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 3); + int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -2665,54 +2667,57 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, return RetVec(r[0], r[1], r[2], r[3]); } + template static FORCEINLINE RetVec lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... 
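    // Same zero-the-inactive-lanes trick as the 32-bit version above, but
    // here the four 64-bit offsets (and constOffset values) are stored in
    // two __m128i halves (v[0] and v[1]), so lanes are extracted with
    // _mm_extract_epi64 on each half.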
offsets = __select(mask, offsets, __smear_i64(0)); - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + constOffset = __select(mask, constOffset, __smear_i64(0)); + + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[0], 1); + offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 0); + offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 1); + offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -2723,80 +2728,89 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, static FORCEINLINE __vec4_i8 __gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, - mask); + constOffset, mask); } static FORCEINLINE __vec4_i8 __gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, - mask); + constOffset, mask); } static FORCEINLINE __vec4_i16 __gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, - mask); + constOffset, mask); } static FORCEINLINE __vec4_i16 __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, - mask); + constOffset, mask); } static FORCEINLINE __vec4_i32 -__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i1 mask) { +__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, + __vec4_i32 constOffset, __vec4_i1 
mask) { __m128i r = _mm_set_epi32(0, 0, 0, 0); #if 1 // "Fast gather"... offsets = __select(mask, offsets, __smear_i32(0)); + constOffset = __select(mask, constOffset, __smear_i32(0)); - int offset = scale * _mm_extract_epi32(offsets.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0) + + _mm_extract_epi32(constOffset.v, 0); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 0); - offset = scale * _mm_extract_epi32(offsets.v, 1); + offset = scale * _mm_extract_epi32(offsets.v, 1) + + _mm_extract_epi32(constOffset.v, 1); ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 1); - offset = scale * _mm_extract_epi32(offsets.v, 2); + offset = scale * _mm_extract_epi32(offsets.v, 2) + + _mm_extract_epi32(constOffset.v, 2); ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 2); - offset = scale * _mm_extract_epi32(offsets.v, 3); + offset = scale * _mm_extract_epi32(offsets.v, 3) + + _mm_extract_epi32(constOffset.v, 3); ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 3); #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0) + + _mm_extract_epi32(constOffset.v, 0); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 1); + int offset = scale * _mm_extract_epi32(offsets.v, 1) + + _mm_extract_epi32(constOffset.v, 1); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 2); + int offset = scale * _mm_extract_epi32(offsets.v, 2) + + _mm_extract_epi32(constOffset.v, 2); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 2); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 3); + int offset = scale * _mm_extract_epi32(offsets.v, 3) + + _mm_extract_epi32(constOffset.v, 3); uint32_t *ptr = (uint32_t *)(p + offset); r = _mm_insert_epi32(r, *ptr, 3); } @@ -2806,23 +2820,23 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, static FORCEINLINE __vec4_i32 __gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, - mask); + delta, mask); } static FORCEINLINE __vec4_i64 __gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, - mask); + delta, mask); } static FORCEINLINE __vec4_i64 __gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, - mask); + delta, mask); } template @@ -2969,217 +2983,108 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) { // scatter -static FORCEINLINE void -__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i8 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0)); - *ptr = _mm_extract_epi8(val.v, 0); - } - - m = 
_mm_extract_ps(mask.v, 1); - if (m != 0) { - int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1)); - *ptr = _mm_extract_epi8(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2)); - *ptr = _mm_extract_epi8(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3)); - *ptr = _mm_extract_epi8(val.v, 3); - } +#define SCATTER32_64(SUFFIX, TYPE, EXTRACT) \ +static FORCEINLINE void \ +__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ + uint32_t scale, __vec4_i32 constOffset, \ + __vec4_##SUFFIX val, __vec4_i1 mask) { \ + uint32_t m = _mm_extract_ps(mask.v, 0); \ + if (m != 0) { \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \ + _mm_extract_epi32(constOffset.v, 0)); \ + *ptr = EXTRACT(val.v, 0); \ + } \ + m = _mm_extract_ps(mask.v, 1); \ + if (m != 0) { \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \ + _mm_extract_epi32(constOffset.v, 1)); \ + *ptr = EXTRACT(val.v, 1); \ + } \ + m = _mm_extract_ps(mask.v, 2); \ + if (m != 0) { \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \ + _mm_extract_epi32(constOffset.v, 2)); \ + *ptr = EXTRACT(val.v, 2); \ + } \ + m = _mm_extract_ps(mask.v, 3); \ + if (m != 0) { \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \ + _mm_extract_epi32(constOffset.v, 3)); \ + *ptr = EXTRACT(val.v, 3); \ + } \ +} \ +static FORCEINLINE void \ +__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ + uint32_t scale, __vec4_i64 constOffset, \ + __vec4_##SUFFIX val, __vec4_i1 mask) { \ + uint32_t m = _mm_extract_ps(mask.v, 0); \ + if (m != 0) { \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \ + _mm_extract_epi64(constOffset.v[0], 0); \ + TYPE *ptr = (TYPE *)(p + offset); \ + *ptr = EXTRACT(val.v, 0); \ + } \ + m = _mm_extract_ps(mask.v, 1); \ + if (m != 0) { \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \ + _mm_extract_epi64(constOffset.v[0], 1); \ + TYPE *ptr = (TYPE *)(p + offset); \ + *ptr = EXTRACT(val.v, 1); \ + } \ + m = _mm_extract_ps(mask.v, 2); \ + if (m != 0) { \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \ + _mm_extract_epi64(constOffset.v[1], 0); \ + TYPE *ptr = (TYPE *)(p + offset); \ + *ptr = EXTRACT(val.v, 2); \ + } \ + m = _mm_extract_ps(mask.v, 3); \ + if (m != 0) { \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \ + _mm_extract_epi64(constOffset.v[1], 1); \ + TYPE *ptr = (TYPE *)(p + offset); \ + *ptr = EXTRACT(val.v, 3); \ + } \ } -static FORCEINLINE void -__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i8 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); - uint8_t *ptr = (uint8_t *)(p + offset); - *ptr = _mm_extract_epi8(val.v, 0); - } - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); - uint8_t *ptr = (uint8_t *)(p + offset); - *ptr = _mm_extract_epi8(val.v, 1); - } +SCATTER32_64(i8, int8_t, _mm_extract_epi8) +SCATTER32_64(i16, int16_t, _mm_extract_epi16) +SCATTER32_64(i32, int32_t, _mm_extract_epi32) - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); - uint8_t *ptr = (uint8_t *)(p + offset); - *ptr = 
_mm_extract_epi8(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); - uint8_t *ptr = (uint8_t *)(p + offset); - *ptr = _mm_extract_epi8(val.v, 3); - } -} - -static FORCEINLINE void -__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i16 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0)); - *ptr = _mm_extract_epi16(val.v, 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1)); - *ptr = _mm_extract_epi16(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2)); - *ptr = _mm_extract_epi16(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3)); - *ptr = _mm_extract_epi16(val.v, 3); - } -} - -static FORCEINLINE void -__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i16 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); - uint16_t *ptr = (uint16_t *)(p + offset); - *ptr = _mm_extract_epi16(val.v, 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); - uint16_t *ptr = (uint16_t *)(p + offset); - *ptr = _mm_extract_epi16(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); - uint16_t *ptr = (uint16_t *)(p + offset); - *ptr = _mm_extract_epi16(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); - uint16_t *ptr = (uint16_t *)(p + offset); - *ptr = _mm_extract_epi16(val.v, 3); - } -} - -static FORCEINLINE void -__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int32_t *ptr = (int32_t *)(b + scale * - _mm_extract_epi32(offsets.v, 0)); - *ptr = _mm_extract_epi32(val.v, 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int32_t *ptr = (int32_t *)(b + scale * - _mm_extract_epi32(offsets.v, 1)); - *ptr = _mm_extract_epi32(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int32_t *ptr = (int32_t *)(b + scale * - _mm_extract_epi32(offsets.v, 2)); - *ptr = _mm_extract_epi32(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int32_t *ptr = (int32_t *)(b + scale * - _mm_extract_epi32(offsets.v, 3)); - *ptr = _mm_extract_epi32(val.v, 3); - } -} - -static FORCEINLINE void -__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i32 val, __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); - uint32_t *ptr = (uint32_t *)(p + offset); - *ptr = _mm_extract_epi32(val.v, 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); - uint32_t *ptr = (uint32_t *)(p + offset); - *ptr = _mm_extract_epi32(val.v, 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); - uint32_t *ptr = (uint32_t 
*)(p + offset); - *ptr = _mm_extract_epi32(val.v, 2); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); - uint32_t *ptr = (uint32_t *)(p + offset); - *ptr = _mm_extract_epi32(val.v, 3); - } -} static FORCEINLINE void __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i64 val, __vec4_i1 mask) { + uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val, + __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) + + _mm_extract_epi32(constOffset.v, 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) + + _mm_extract_epi32(constOffset.v, 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 2); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) + + _mm_extract_epi32(constOffset.v, 2); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 0); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 3); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) + + _mm_extract_epi32(constOffset.v, 3); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 1); } @@ -3187,31 +3092,36 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, static FORCEINLINE void __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 val, __vec4_i1 mask) { + uint32_t scale, __vec4_i64 constOffset, + __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + + _mm_extract_epi64(constOffset.v[0], 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + + _mm_extract_epi64(constOffset.v[0], 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + + _mm_extract_epi64(constOffset.v[1], 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 0); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + + _mm_extract_epi64(constOffset.v[1], 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 1); } diff --git a/opt.cpp b/opt.cpp index c105947b..f6eab8c6 100644 --- a/opt.cpp +++ b/opt.cpp @@ -205,6 +205,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, } +#if 0 static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2, llvm::Value *arg3, const char *name, @@ -218,7 +219,7 @@ lCallInst(llvm::Function *func, 
llvm::Value *arg0, llvm::Value *arg1, name, insertBefore); #endif } - +#endif static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, @@ -234,6 +235,21 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, #endif } +static llvm::Instruction * +lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, + llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4, + llvm::Value *arg5, const char *name, + llvm::Instruction *insertBefore = NULL) { + llvm::Value *args[6] = { arg0, arg1, arg2, arg3, arg4, arg5 }; +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) + llvm::ArrayRef newArgArray(&args[0], &args[6]); + return llvm::CallInst::Create(func, newArgArray, name, insertBefore); +#else + return llvm::CallInst::Create(func, &newArgs[0], &newArgs[6], + name, insertBefore); +#endif +} + /////////////////////////////////////////////////////////////////////////// void @@ -302,10 +318,13 @@ Optimize(llvm::Module *module, int optLevel) { // Early optimizations to try to reduce the total amount of code to // work with if we can - optPM.add(CreateDetectGSBaseOffsetsPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(CreateDetectGSBaseOffsetsPass()); if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); optPM.add(CreateVSelMovmskOptPass()); @@ -314,11 +333,7 @@ Optimize(llvm::Module *module, int optLevel) { } optPM.add(llvm::createDeadInstEliminationPass()); - optPM.add(llvm::createConstantPropagationPass()); - optPM.add(llvm::createDeadInstEliminationPass()); - // On to more serious optimizations - optPM.add(llvm::createCFGSimplificationPass()); if (runSROA) optPM.add(llvm::createScalarReplAggregatesPass()); optPM.add(llvm::createInstructionCombiningPass()); @@ -1173,6 +1188,166 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets, } +static llvm::Value * +lGetZeroOffsetVector(llvm::Value *origVec) { + if (origVec->getType() == LLVMTypes::Int32VectorType) + return LLVMInt32Vector((int32_t)0); + else + return LLVMInt64Vector((int64_t)0); +} + + +#if 0 +static void +lPrint(llvm::Value *v, int indent = 0) { + if (llvm::isa(v)) + return; + + fprintf(stderr, "%*c", indent, ' '); + v->dump(); + + llvm::Instruction *inst = llvm::dyn_cast(v); + if (inst != NULL) { + for (int i = 0; i < (int)inst->getNumOperands(); ++i) { + llvm::Value *op = inst->getOperand(i); + if (llvm::isa(op) == false) + lPrint(op, indent+4); + } + } +} +#endif + + +/** Given a vector expression in vec, separate it into a compile-time + constant component and a variable component, returning the two parts in + *constOffset and *variableOffset. (It should be the case that the sum + of these two is exactly equal to the original vector.) + + This routine only handles some (important) patterns; in some cases it + will fail and return components that are actually compile-time + constants in *variableOffset. + + Finally, if there aren't any constant (or, respectivaly, variable) + components, the corresponding return value may be set to NULL. 
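    As an illustrative example, if vec is computed as
    sext(varying_index) * splat(8) + splat(16) (varying_index being some
    non-constant vector), the routine returns the sext-and-multiply part in
    *variableOffset and splat(16) in *constOffset.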
+ */ +static void +lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, + llvm::Value **variableOffset, + llvm::Instruction *insertBefore) { + if (llvm::isa(vec) || + llvm::isa(vec)) { + *constOffset = vec; + *variableOffset = NULL; + return; + } + + llvm::SExtInst *sext = llvm::dyn_cast(vec); + if (sext != NULL) { + // Check the sext target. + llvm::Value *co, *vo; + lExtractConstantOffset(sext->getOperand(0), &co, &vo, insertBefore); + + // make new sext instructions for the two parts + if (co == NULL) + *constOffset = NULL; + else + *constOffset = new llvm::SExtInst(co, sext->getType(), + "const_offset_sext", insertBefore); + if (vo == NULL) + *variableOffset = NULL; + else + *variableOffset = new llvm::SExtInst(vo, sext->getType(), + "variable_offset_sext", + insertBefore); + return; + } + + // FIXME? handle bitcasts / type casts here + + llvm::BinaryOperator *bop = llvm::dyn_cast(vec); + if (bop != NULL) { + llvm::Value *op0 = bop->getOperand(0); + llvm::Value *op1 = bop->getOperand(1); + llvm::Value *c0, *v0, *c1, *v1; + + if (bop->getOpcode() == llvm::Instruction::Add) { + lExtractConstantOffset(op0, &c0, &v0, insertBefore); + lExtractConstantOffset(op1, &c1, &v1, insertBefore); + + if (c0 == NULL) + *constOffset = c1; + else if (c1 == NULL) + *constOffset = c0; + else + *constOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1, + "const_op", insertBefore); + + if (v0 == NULL) + *variableOffset = v1; + else if (v1 == NULL) + *variableOffset = v0; + else + *variableOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1, + "variable_op", insertBefore); + return; + } + else if (bop->getOpcode() == llvm::Instruction::Mul) { + lExtractConstantOffset(op0, &c0, &v0, insertBefore); + lExtractConstantOffset(op1, &c1, &v1, insertBefore); + + // Given the product of constant and variable terms, we have: + // (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1) + // Note that the first term is a constant and the last three are + // variable. + if (c0 != NULL && c1 != NULL) + *constOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1, + "const_mul", insertBefore); + else + *constOffset = NULL; + + llvm::Value *va = NULL, *vb = NULL, *vc = NULL; + if (v0 != NULL && c1 != NULL) + va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1, + "va_mul", insertBefore); + if (c0 != NULL && v1 != NULL) + vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1, + "vb_mul", insertBefore); + if (v0 != NULL && v1 != NULL) + vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1, + "vc_mul", insertBefore); + + + llvm::Value *vab = NULL; + if (va != NULL && vb != NULL) + vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb, + "vab_add", insertBefore); + else if (va != NULL) + vab = va; + else + vab = vb; + + if (vab != NULL && vc != NULL) + *variableOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc, + "vabc_add", insertBefore); + else if (vab != NULL) + *variableOffset = vab; + else + *variableOffset = vc; + + return; + } + } + + // Nothing matched, just return what we have as a variable component + *constOffset = NULL; + *variableOffset = vec; +} + + /* Returns true if the given value is a constant vector of integers with the value 2, 4, 8 in all of the elements. (Returns the splatted value in *splat, if so). 
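   (Illustratively, a <4 x i32> constant of <8, 8, 8, 8> qualifies and sets
   *splat to 8, while <3, 3, 3, 3> or <2, 4, 2, 4> does not.)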
*/ @@ -1277,6 +1452,123 @@ lExtractOffsetVector248Scale(llvm::Value **vec) { return LLVMInt32(1); } +#if 0 +static llvm::Value * +lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) { + fprintf(stderr, " lextract: "); + (*vec)->dump(); + fprintf(stderr, "\n"); + + if (llvm::isa(*vec) || + llvm::isa(*vec)) + return NULL; + + llvm::SExtInst *sext = llvm::dyn_cast(*vec); + if (sext != NULL) { + llvm::Value *sextOp = sext->getOperand(0); + // Check the sext target. + llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore); + if (unif == NULL) + return NULL; + + // make a new sext instruction so that we end up with the right + // type + *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext); + return unif; + } + + std::vector phis; + if (LLVMVectorValuesAllEqual(*vec, g->target.vectorWidth, phis)) { + // FIXME: we may want to redo all of the expression here, in scalar + // form (if at all possible), for code quality... + llvm::Value *unif = + llvm::ExtractElementInst::Create(*vec, LLVMInt32(0), + "first_uniform", insertBefore); + *vec = NULL; + return unif; + } + + llvm::BinaryOperator *bop = llvm::dyn_cast(*vec); + if (bop == NULL) + return NULL; + + llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1); + if (bop->getOpcode() == llvm::Instruction::Add) { + llvm::Value *s0 = lExtractUniforms(&op0, insertBefore); + llvm::Value *s1 = lExtractUniforms(&op1, insertBefore); + if (s0 == NULL && s1 == NULL) + return NULL; + + if (op0 == NULL) + *vec = op1; + else if (op1 == NULL) + *vec = op0; + else + *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add, + op0, op1, "new_add", insertBefore); + + if (s0 == NULL) + return s1; + else if (s1 == NULL) + return s0; + else + return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1, + "add_unif", insertBefore); + } +#if 0 + else if (bop->getOpcode() == llvm::Instruction::Mul) { + // Check each operand for being one of the scale factors we care about. + int splat; + if (lIs248Splat(op0, &splat)) { + *vec = op1; + return LLVMInt32(splat); + } + else if (lIs248Splat(op1, &splat)) { + *vec = op0; + return LLVMInt32(splat); + } + else + return LLVMInt32(1); + } +#endif + else + return NULL; +} + + +static void +lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector, + llvm::Value *offsetScale, + llvm::Instruction *insertBefore) { +#if 1 + (*basePtr)->dump(); + printf("\n"); + (*offsetVector)->dump(); + printf("\n"); + offsetScale->dump(); + printf("-----\n"); +#endif + + llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore); + if (uniformDelta == NULL) + return; + + llvm::Value *index[1] = { uniformDelta }; +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) + llvm::ArrayRef arrayRef(&index[0], &index[1]); + *basePtr = llvm::GetElementPtrInst::Create(*basePtr, arrayRef, "new_base", + insertBefore); +#else + *basePtr = llvm::GetElementPtrInst::Create(*basePtr, &index[0], + &index[1], "new_base", + insertBefore); +#endif + + // this should only happen if we have only uniforms, but that in turn + // shouldn't be a gather/scatter! + Assert(*offsetVector != NULL); +} +#endif struct GSInfo { GSInfo(const char *pgFuncName, const char *pgboFuncName, @@ -1367,7 +1659,24 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) { // to the next instruction... 
continue; - llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector); + // Try to decompose the offset vector into a compile time constant + // component and a varying component. The constant component is + // passed as a separate parameter to the gather/scatter functions, + // which in turn allows their implementations to end up emitting + // x86 instructions with constant offsets encoded in them. + llvm::Value *constOffset, *variableOffset; + lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, + callInst); + if (constOffset == NULL) + constOffset = lGetZeroOffsetVector(offsetVector); + if (variableOffset == NULL) + variableOffset = lGetZeroOffsetVector(offsetVector); + + // See if the varying component is scaled by 2, 4, or 8. If so, + // extract that scale factor and rewrite variableOffset to remove + // it. (This also is pulled out so that we can match the scales by + // 2/4/8 offered by x86 addressing operators.) + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); // Cast the base pointer to a void *, since that's what the // __pseudo_*_base_offsets_* functions want. @@ -1386,11 +1695,15 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) { // walk past the sext to get the i32 offset values and then // call out to the corresponding 32-bit gather/scatter // function. - llvm::SExtInst *sext = llvm::dyn_cast(offsetVector); + llvm::SExtInst *sext = llvm::dyn_cast(variableOffset); if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) { - offsetVector = sext->getOperand(0); + variableOffset = sext->getOperand(0); gatherScatterFunc = info->baseOffsets32Func; + if (constOffset->getType() != LLVMTypes::Int32VectorType) + constOffset = + new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType, + "trunc_const_offset", callInst); } } @@ -1403,8 +1716,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) { // the instruction isn't inserted into a basic block and that // way we can then call ReplaceInstWithInst(). llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, offsetVector, offsetScale, - mask, "newgather", NULL); + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, mask, "newgather", NULL); lCopyMetadata(newCall, callInst); llvm::ReplaceInstWithInst(callInst, newCall); } @@ -1416,8 +1729,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) { // base+offsets instruction. See above for why passing NULL // for the Instruction * is intended. 
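            // The argument order here matches the updated
            // __pseudo_scatter_base_offsets* declarations: base pointer,
            // varying offsets, offset_scale, constant offset vector, the
            // values to store, and finally the mask.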
llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, offsetVector, offsetScale, - storeValue, mask, "", NULL); + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, storeValue, mask, "", NULL); lCopyMetadata(newCall, callInst); llvm::ReplaceInstWithInst(callInst, newCall); } @@ -2016,6 +2329,26 @@ struct GatherImpInfo { }; +static llvm::Value * +lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, + llvm::Instruction *insertBefore) { + llvm::Value *firstOffset = + llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset", + insertBefore); + + llvm::Value *offsetIndex[1] = { firstOffset }; +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) + llvm::ArrayRef arrayRef(&offsetIndex[0], &offsetIndex[1]); + return + llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", insertBefore); +#else + return + llvm::GetElementPtrInst::Create(base, &offsetIndex[0], &offsetIndex[1], + "ptr", insertBefore); +#endif +} + + struct ScatterImpInfo { ScatterImpInfo(const char *pName, const char *msName, LLVM_TYPE_CONST llvm::Type *vpt, int a) @@ -2109,45 +2442,42 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { Assert(ok); llvm::Value *base = callInst->getArgOperand(0); - llvm::Value *offsets = callInst->getArgOperand(1); + llvm::Value *varyingOffsets = callInst->getArgOperand(1); llvm::Value *offsetScale = callInst->getArgOperand(2); - llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(3) : NULL; - llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4); + llvm::Value *constOffsets = callInst->getArgOperand(3); + llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(4) : NULL; + llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5); + // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets llvm::ConstantInt *offsetScaleInt = llvm::dyn_cast(offsetScale); Assert(offsetScaleInt != NULL); + uint64_t scaleValue = offsetScaleInt->getZExtValue(); - if (offsets->getType() == LLVMTypes::Int64VectorType) - // offsetScale is an i32, so sext it so that if we use it in a - // multiply below, it has the same type as the i64 offset used - // as the other operand... - offsetScale = new llvm::SExtInst(offsetScale, LLVMTypes::Int64Type, - "offset_sext", callInst); + std::vector scales; + for (int i = 0; i < g->target.vectorWidth; ++i) { + if (varyingOffsets->getType() == LLVMTypes::Int64VectorType) + scales.push_back(LLVMInt64(scaleValue)); + else + scales.push_back(LLVMInt32(scaleValue)); + } + llvm::Constant *offsetScaleVec = llvm::ConstantVector::get(scales); + + llvm::Value *scaledVarying = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + varyingOffsets, "scaled_varying", callInst); + llvm::Value *fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, + constOffsets, "varying+const_offsets", + callInst); { std::vector seenPhis; - if (LLVMVectorValuesAllEqual(offsets, g->target.vectorWidth, seenPhis)) { + if (LLVMVectorValuesAllEqual(fullOffsets, g->target.vectorWidth, seenPhis)) { // If all the offsets are equal, then compute the single // pointer they all represent based on the first one of them // (arbitrarily). - - // FIXME: the code from here to where ptr is computed is highly - // redundant with the case for a vector linear below. 
- - llvm::Value *firstOffset = - llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset", - callInst); - llvm::Value *indices[1] = { firstOffset }; -#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) - llvm::ArrayRef arrayRef(&indices[0], &indices[1]); - llvm::Value *ptr = - llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", callInst); -#else - llvm::Value *ptr = - llvm::GetElementPtrInst::Create(base, &indices[0], &indices[1], - "ptr", callInst); -#endif + llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst); lCopyMetadata(ptr, callInst); if (gatherInfo != NULL) { @@ -2175,9 +2505,11 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::ExtractElementInst::Create(storeValue, LLVMInt32(0), "rvalue_first", callInst); lCopyMetadata(first, callInst); + ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0), "ptr2rvalue_type", callInst); lCopyMetadata(ptr, callInst); + llvm::Instruction *sinst = new llvm::StoreInst(first, ptr, false, scatterInfo->align); lCopyMetadata(sinst, callInst); @@ -2190,34 +2522,15 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { } int step = gatherInfo ? gatherInfo->align : scatterInfo->align; - step /= (int)offsetScaleInt->getZExtValue(); std::vector seenPhis; - if (step > 0 && lVectorIsLinear(offsets, g->target.vectorWidth, + if (step > 0 && lVectorIsLinear(fullOffsets, g->target.vectorWidth, step, seenPhis)) { // We have a linear sequence of memory locations being accessed // starting with the location given by the offset from // offsetElements[0], with stride of 4 or 8 bytes (for 32 bit // and 64 bit gather/scatters, respectively.) - - // Get the base pointer using the first guy's offset. - llvm::Value *firstOffset = - llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset", - callInst); - llvm::Value *scaledOffset = - llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset, - offsetScale, "scaled_offset", callInst); - - llvm::Value *indices[1] = { scaledOffset }; -#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) - llvm::ArrayRef arrayRef(&indices[0], &indices[1]); - llvm::Value *ptr = - llvm::GetElementPtrInst::Create(base, arrayRef, "ptr", callInst); -#else - llvm::Value *ptr = - llvm::GetElementPtrInst::Create(base, &indices[0], &indices[1], - "ptr", callInst); -#endif + llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst); lCopyMetadata(ptr, callInst); if (gatherInfo != NULL) {