Take advantage of x86's free "scale by 2, 4, or 8" in addressing calculations

When loading from an address computed by adding two registers
together, x86 can scale one of them by 2, 4, or 8 for free as part
of the addressing calculation.  This change makes the code generated
for gathers and scatters take advantage of this.
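
As a rough sketch (hypothetical names, assuming a 4-byte element type
such as float), the scalar IR shape that LLVM's x86 backend folds into
a single scaled address looks like:

  %offset64 = sext i32 %offset32 to i64            ; sign-extend the index
  %scaled = mul i64 %offset64, 4                   ; scale must be 2, 4, or 8
  %eltptr = getelementptr i8 * %base, i64 %scaled  ; folds into the address
  %eltcast = bitcast i8 * %eltptr to float *
  %val = load float * %eltcast                     ; e.g. movss (%rdi,%rsi,4), %xmm0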

For the cases where a gather or scatter is based on a base pointer and
an integer offset vector, the GatherScatterFlattenOpt pass checks
whether the integer offsets are being computed as 2/4/8 times some
other value.  If so, it extracts the 2x/4x/8x factor and leaves the
rest as the offsets.  The {gather,scatter}_base_offsets_* functions now
take an i32 scale factor and carefully generate IR so that it hits
LLVM's pattern matching for these scales.
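
For instance (a sketch with hypothetical value names, assuming a 4-wide
target), offsets whose root operation is a multiply by a splat of 4 are
split so that the 4 travels as the separate offset_scale argument:

  ; before: the offset vector is 4 times an index vector
  %offsets = mul <4 x i32> %index, <i32 4, i32 4, i32 4, i32 4>
  ; after extraction: offset_scale is 4 and the index vector is passed directly
  call <4 x i32> @__pseudo_gather_base_offsets32_32(i8 * %base, <4 x i32> %index,
                                                    i32 4, <4 x i32> %mask)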

This is a particular win on AVX, since it saves two 4-wide integer
multiplies.
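
Roughly: for an 8-wide gather, turning indices into byte offsets takes
an <8 x i32> multiply, and since AVX (before AVX2) has no 256-bit
integer multiply, that operation lowers to two 4-wide pmulld
instructions.  A hedged sketch of the multiply that is eliminated:

  ; hypothetical: the multiply removed by extracting offset_scale = 4
  %offsets = mul <8 x i32> %index, <i32 4, i32 4, i32 4, i32 4,
                                    i32 4, i32 4, i32 4, i32 4>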

Noise runs 14% faster with this.
Issue #132.
Matt Pharr
2011-12-16 15:49:34 -08:00
parent f23d030e43
commit 6dbb15027a
2 changed files with 219 additions and 87 deletions


@@ -883,21 +883,22 @@ declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x
; varying int32 __pseudo_gather(varying int32 *, mask)
; varying int64 __pseudo_gather(varying int64 *, mask)
;
; These functions are never actually implemented; the
; GatherScatterFlattenOpt optimization pass finds them and then converts
; them to make calls to the following functions, which represent gathers
; from a common base pointer with offsets. This approach allows the
; front-end to be relatively simple in how it emits address calculation
; for gathers.
; The GatherScatterFlattenOpt optimization pass finds these calls and then
; converts them to make calls to the following functions (when appropriate);
; these represent gathers from a common base pointer with offsets. The
; offset_scale factor scales the offsets before they are added to the base
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
; The 2, 4, and 8 cases are used to match LLVM patterns that take advantage
; of the free 2/4/8 scaling available in x86 addressing calculations...
;
; varying int8 __pseudo_gather_base_offsets_8(uniform int8 *base,
; int32 offsets, mask)
; varying int16 __pseudo_gather_base_offsets_16(uniform int16 *base,
; int32 offsets, mask)
; varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base,
; int32 offsets, mask)
; varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base,
; int64 offsets, mask)
; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base,
; int{32,64} offsets, int32 offset_scale, mask)
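;
; For example (hypothetical values): int32 offsets that were computed as
; 4 times an index vector would be emitted as
;     __pseudo_gather_base_offsets32_32(base, index, 4, mask)
; with the 4x byte scaling carried by offset_scale rather than baked into
; the offsets themselves.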
;
; Then, the GSImprovementsPass optimization finds these and either
; converts them to native gather functions or converts them to vector
@@ -913,15 +914,23 @@ declare <$1 x i16> @__pseudo_gather64_16(<$1 x i64>, <$1 x i32>) nounwind readon
declare <$1 x i32> @__pseudo_gather64_32(<$1 x i64>, <$1 x i32>) nounwind readonly
declare <$1 x i64> @__pseudo_gather64_64(<$1 x i64>, <$1 x i32>) nounwind readonly
declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, i32,
<$1 x i32>) nounwind readonly
declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, i32,
<$1 x i32>) nounwind readonly
declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, i32,
<$1 x i32>) nounwind readonly
declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, i32,
<$1 x i32>) nounwind readonly
declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, i32,
<$1 x i32>) nounwind readonly
declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, i32,
<$1 x i32>) nounwind readonly
declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, i32,
<$1 x i32>) nounwind readonly
declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32,
<$1 x i32>) nounwind readonly
; Similarly to the pseudo-gathers defined above, we also declare undefined
; pseudo-scatter instructions with signatures:
@@ -934,14 +943,14 @@ declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, <$1 x i3
; The GatherScatterFlattenOpt optimization pass also finds these and
; transforms them to scatters like:
;
; void __pseudo_scatter_base_offsets_8(uniform int8 *base,
; varying int32 offsets, varying int8 values, mask)
; void __pseudo_scatter_base_offsets_16(uniform int16 *base,
; varying int32 offsets, varying int16 values, mask)
; void __pseudo_scatter_base_offsets_32(uniform int32 *base,
; varying int32 offsets, varying int32 values, mask)
; void __pseudo_scatter_base_offsets_64(uniform int64 *base,
; varying int32 offsets, varying int64 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base,
; varying int32 offsets, int32 offset_scale, varying int8 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base,
; varying int32 offsets, int32 offset_scale, varying int16 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base,
; varying int32 offsets, int32 offset_scale, varying int32 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base,
; varying int32 offsets, int32 offset_scale, varying int64 values, mask)
;
; And the GSImprovementsPass in turn converts these to actual native
; scatters or masked stores.
@@ -956,22 +965,22 @@ declare void @__pseudo_scatter64_16(<$1 x i64>, <$1 x i16>, <$1 x i32>) nounwind
declare void @__pseudo_scatter64_32(<$1 x i64>, <$1 x i32>, <$1 x i32>) nounwind
declare void @__pseudo_scatter64_64(<$1 x i64>, <$1 x i64>, <$1 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>,
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>, i32,
<$1 x i8>, <$1 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>,
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>, i32,
<$1 x i16>, <$1 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>,
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>, i32,
<$1 x i32>, <$1 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>,
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>, i32,
<$1 x i64>, <$1 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>,
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>, i32,
<$1 x i8>, <$1 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>,
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>, i32,
<$1 x i16>, <$1 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>,
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>, i32,
<$1 x i32>, <$1 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>,
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>, i32,
<$1 x i64>, <$1 x i32>) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2732,34 +2741,43 @@ pl_done:
define(`gen_gather', `
;; Define the utility function to do the gather operation for a single element
;; of the type
define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
%ptroffset = getelementptr i8 * %ptr, i32 %offset32
%ptrcast = bitcast i8 * %ptroffset to $2 *
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
%offset64 = sext i32 %offset32 to i64
%scale64 = sext i32 %offset_scale to i64
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
}
define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset32 = extractelement <$1 x i64> %offsets, i32 %lane
%ptroffset = getelementptr i8 * %ptr, i64 %offset32
%ptrcast = bitcast i8 * %ptroffset to $2 *
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
%offset_scale64 = sext i32 %offset_scale to i64
%offset = mul i64 %offset64, %offset_scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
}
define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets,
define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2773,15 +2791,15 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets,
%newOffsets = load <$1 x i32> * %offsetsPtr
%ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
<$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
<$1 x i32> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets,
define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2795,10 +2813,10 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets,
%newOffsets = load <$1 x i64> * %offsetsPtr
%ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
<$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
<$1 x i64> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
@@ -2848,42 +2866,52 @@ define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs,
define(`gen_scatter', `
;; Define the function that describes the work to do to scatter a single
;; value
define void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
%offset64 = zext i32 %offset32 to i64
%ptrdelta = add i64 %ptr64, %offset64
%ptr = inttoptr i64 %ptrdelta to $2 *
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
%offset64 = sext i32 %offset32 to i64
%scale64 = sext i32 %offset_scale to i64
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptr
store $2 %storeval, $2 * %ptrcast
ret void
}
define void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
%ptrdelta = add i64 %ptr64, %offset64
%ptr = inttoptr i64 %ptrdelta to $2 *
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
%scale64 = sext i32 %offset_scale to i64
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptr
store $2 %storeval, $2 * %ptrcast
ret void
}
define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
%ptr64 = ptrtoint i8 * %base to i64
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values, i32 LANE)')
call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
ret void
}
define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
%ptr64 = ptrtoint i8 * %base to i64
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values, i32 LANE)')
call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
ret void
}

opt.cpp

@@ -979,6 +979,81 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets) {
}
/* Returns true if the given value is a constant vector of integers with
   the value 2, 4, or 8 in all of the elements.  (If so, returns the
   splatted value in *splat.) */
static bool
lIs248Splat(llvm::Value *v, int *splat) {
llvm::ConstantVector *cvec = llvm::dyn_cast<llvm::ConstantVector>(v);
if (cvec == NULL)
return false;
llvm::ConstantInt *ci =
llvm::dyn_cast<llvm::ConstantInt>(cvec->getSplatValue());
if (ci == NULL)
return false;
int64_t splatVal = ci->getSExtValue();
if (splatVal != 2 && splatVal != 4 && splatVal != 8)
return false;
*splat = (int)splatVal;
return true;
}
/** Given a vector of integer offsets to a base pointer being used for a
    gather or a scatter, see if its root operation is a multiply of some
    value by a vector of all 2s, 4s, or 8s.  If so, return an i32 with the
    value 2, 4, or 8 and modify *vec so that it points to the operand that
    is being multiplied by 2/4/8; otherwise, return an i32 with the value 1.
    We go through all this trouble so that we can pass the i32 scale factor
    to the {gather,scatter}_base_offsets functions separately from the
    offsets.  This in turn lets the LLVM x86 code generator match it and
    apply x86's free scale by 2x, 4x, or 8x to one of the two registers
    being added together in an addressing calculation.
 */
static llvm::Value *
lExtractOffsetVector248Scale(llvm::Value **vec) {
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
if (sext != NULL) {
llvm::Value *sextOp = sext->getOperand(0);
// Check the operand that is being sign extended.
llvm::Value *scale = lExtractOffsetVector248Scale(&sextOp);
if (scale == NULL)
return NULL;
// make a new sext instruction so that we end up with the right
// type
*vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
return scale;
}
// If the root operation isn't a multiply, then the scale is just 1
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
if (bop == NULL || bop->getOpcode() != llvm::Instruction::Mul)
return LLVMInt32(1);
// Check each operand for being one of the scale factors we care about.
llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
int splat;
if (lIs248Splat(op0, &splat)) {
*vec = op1;
return LLVMInt32(splat);
}
else if (lIs248Splat(op1, &splat)) {
*vec = op0;
return LLVMInt32(splat);
}
else
return LLVMInt32(1);
}
struct GSInfo {
GSInfo(const char *pgFuncName, const char *pgboFuncName,
const char *pgbo32FuncName, bool ig)
@@ -1067,6 +1142,8 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
// to the next instruction...
continue;
llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
// Cast the base pointer to a void *, since that's what the
// __pseudo_*_base_offsets_* functions want.
basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType,
@@ -1100,37 +1177,38 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
// llvm::Instruction to llvm::CallInst::Create; this means that
// the instruction isn't inserted into a basic block and that
// way we can then call ReplaceInstWithInst().
llvm::Value *newArgs[3] = { basePtr, offsetVector, mask };
llvm::Value *newArgs[4] = { basePtr, offsetVector, offsetScale, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[3]);
llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[4]);
llvm::Instruction *newCall =
llvm::CallInst::Create(gatherScatterFunc, newArgArray, "newgather",
(llvm::Instruction *)NULL);
#else
llvm::Instruction *newCall =
llvm::CallInst::Create(gatherScatterFunc, &newArgs[0], &newArgs[3],
llvm::CallInst::Create(gatherScatterFunc, &newArgs[0], &newArgs[4],
"newgather");
#endif
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
}
else {
llvm::Value *storeValue = callInst->getArgOperand(1);
llvm::Value *mask = callInst->getArgOperand(2);
llvm::Value *rvalue = callInst->getArgOperand(1);
// Generate a new function call to the next pseudo scatter
// base+offsets instruction. See above for why passing NULL
// for the Instruction * is intended.
llvm::Value *newArgs[4] = { basePtr, offsetVector, rvalue, mask };
llvm::Value *newArgs[5] = { basePtr, offsetVector, offsetScale,
storeValue, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[4]);
llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[5]);
llvm::Instruction *newCall =
llvm::CallInst::Create(gatherScatterFunc, newArgArray, "",
(llvm::Instruction *)NULL);
#else
llvm::Instruction *newCall =
llvm::CallInst::Create(gatherScatterFunc, &newArgs[0],
&newArgs[4]);
&newArgs[5]);
#endif
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
@@ -1893,7 +1971,20 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Value *base = callInst->getArgOperand(0);
llvm::Value *offsets = callInst->getArgOperand(1);
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
llvm::Value *offsetScale = callInst->getArgOperand(2);
llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(3) : NULL;
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
llvm::ConstantInt *offsetScaleInt =
llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
assert(offsetScaleInt != NULL);
if (offsets->getType() == LLVMTypes::Int64VectorType)
// offsetScale is an i32, so sext it so that if we use it in a
// multiply below, it has the same type as the i64 offset used
// as the other operand...
offsetScale = new llvm::SExtInst(offsetScale, LLVMTypes::Int64Type,
"offset_sext", callInst);
{
std::vector<llvm::PHINode *> seenPhis;
@@ -1901,10 +1992,18 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// If all the offsets are equal, then compute the single
// pointer they all represent based on the first one of them
// (arbitrarily).
// FIXME: the code from here to where ptr is computed is highly
// redundant with the case for a vector linear below.
llvm::Value *firstOffset =
llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
callInst);
llvm::Value *indices[1] = { firstOffset };
llvm::Value *scaledOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset,
offsetScale, "scaled_offset", callInst);
llvm::Value *indices[1] = { scaledOffset };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
llvm::Value *ptr =
@@ -1945,9 +2044,8 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
Warning(pos, "Undefined behavior: all program instances are "
"writing to the same location!");
llvm::Value *rvalue = callInst->getArgOperand(2);
llvm::Value *first =
llvm::ExtractElementInst::Create(rvalue, LLVMInt32(0), "rvalue_first",
llvm::ExtractElementInst::Create(storeValue, LLVMInt32(0), "rvalue_first",
callInst);
lCopyMetadata(first, callInst);
ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0),
@@ -1965,8 +2063,11 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
}
int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
step /= (int)offsetScaleInt->getZExtValue();
std::vector<llvm::PHINode *> seenPhis;
if (lVectorIsLinear(offsets, g->target.vectorWidth, step, seenPhis)) {
if (step > 0 && lVectorIsLinear(offsets, g->target.vectorWidth,
step, seenPhis)) {
// We have a linear sequence of memory locations being accessed
// starting with the location given by the offset from
// offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
@@ -1976,7 +2077,11 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Value *firstOffset =
llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
callInst);
llvm::Value *indices[1] = { firstOffset };
llvm::Value *scaledOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset,
offsetScale, "scaled_offset", callInst);
llvm::Value *indices[1] = { scaledOffset };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
llvm::Value *ptr =
@@ -2006,11 +2111,10 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
}
else {
Debug(pos, "Transformed scatter to unaligned vector store!");
llvm::Value *rvalue = callInst->getArgOperand(2);
ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast",
callInst);
llvm::Value *args[3] = { ptr, rvalue, mask };
llvm::Value *args[3] = { ptr, storeValue, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[3]);
llvm::Instruction *newCall =