diff --git a/builtins.m4 b/builtins.m4
index 268825e1..f83bdbff 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -883,21 +883,22 @@ declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x
 ; varying int32 __pseudo_gather(varying int32 *, mask)
 ; varying int64 __pseudo_gather(varying int64 *, mask)
 ;
-; These functions are never actually implemented; the
-; GatherScatterFlattenOpt optimization pass finds them and then converts
-; them to make calls to the following functions, which represent gathers
-; from a common base pointer with offsets.  This approach allows the
-; front-end to be relatively simple in how it emits address calculation
-; for gathers.
+; The GatherScatterFlattenOpt optimization pass finds these calls and then
+; converts them to make calls to the following functions (when appropriate);
+; these represent gathers from a common base pointer with offsets.  The
+; offset_scale factor scales the offsets before they are added to the base
+; pointer--it should have the value 1, 2, 4, or 8.  (It can always just be 1.)
+; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8
+; scaling available in x86 addressing calculations.
 ;
-; varying int8  __pseudo_gather_base_offsets_8(uniform int8 *base,
-;                      int32 offsets, mask)
-; varying int16 __pseudo_gather_base_offsets_16(uniform int16 *base,
-;                      int32 offsets, mask)
-; varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base,
-;                      int32 offsets, mask)
-; varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base,
-;                      int64 offsets, mask)
+; varying int8  __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base,
+;                      int{32,64} offsets, int32 offset_scale, mask)
+; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base,
+;                      int{32,64} offsets, int32 offset_scale, mask)
+; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base,
+;                      int{32,64} offsets, int32 offset_scale, mask)
+; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base,
+;                      int{32,64} offsets, int32 offset_scale, mask)
 ;
 ; Then, the GSImprovementsPass optimization finds these and either
 ; converts them to native gather functions or converts them to vector
@@ -913,15 +914,23 @@ declare <$1 x i16> @__pseudo_gather64_16(<$1 x i64>, <$1 x i32>) nounwind readon
 declare <$1 x i32> @__pseudo_gather64_32(<$1 x i64>, <$1 x i32>) nounwind readonly
 declare <$1 x i64> @__pseudo_gather64_64(<$1 x i64>, <$1 x i32>) nounwind readonly

-declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, i32,
+                                                    <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, i32,
+                                                      <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, i32,
+                                                      <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, i32,
+                                                      <$1 x i32>) nounwind readonly

-declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
-declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
-declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
-declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
+declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, i32,
+                                                    <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, i32,
+                                                      <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, i32,
+                                                      <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32,
+                                                      <$1 x i32>) nounwind readonly

 ; Similarly to the pseudo-gathers defined above, we also declare undefined
 ; pseudo-scatter instructions with signatures:
@@ -934,14 +943,14 @@ declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, <$1 x i3
 ; The GatherScatterFlattenOpt optimization pass also finds these and
 ; transforms them to scatters like:
 ;
-; void __pseudo_scatter_base_offsets_8(uniform int8 *base,
-;             varying int32 offsets, varying int8 values, mask)
-; void __pseudo_scatter_base_offsets_16(uniform int16 *base,
-;             varying int32 offsets, varying int16 values, mask)
-; void __pseudo_scatter_base_offsets_32(uniform int32 *base,
-;             varying int32 offsets, varying int32 values, mask)
-; void __pseudo_scatter_base_offsets_64(uniform int64 *base,
-;             varying int32 offsets, varying int64 values, mask)
+; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base,
+;             varying int{32,64} offsets, int32 offset_scale, varying int8 values, mask)
+; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base,
+;             varying int{32,64} offsets, int32 offset_scale, varying int16 values, mask)
+; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base,
+;             varying int{32,64} offsets, int32 offset_scale, varying int32 values, mask)
+; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base,
+;             varying int{32,64} offsets, int32 offset_scale, varying int64 values, mask)
 ;
 ; And the GSImprovementsPass in turn converts these to actual native
 ; scatters or masked stores.
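As a rough sketch of how the new signatures are meant to be used (illustrative only, not part of the patch; it assumes a 4-wide target, and %base, %index, %mask, and %v are made-up value names), a gather of int32 values from base + 4*index would be expected to end up as a call along these lines once GatherScatterFlattenOpt has peeled the 4x element scaling out of the offsets:

    %v = call <4 x i32> @__pseudo_gather_base_offsets32_32(i8 * %base, <4 x i32> %index,
                                                           i32 4, <4 x i32> %mask)

GSImprovementsPass then consumes calls of this form, using the plain i32 offset_scale operand when it rebuilds the addresses (see the opt.cpp changes below).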
@@ -956,22 +965,22 @@ declare void @__pseudo_scatter64_16(<$1 x i64>, <$1 x i16>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter64_32(<$1 x i64>, <$1 x i32>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter64_64(<$1 x i64>, <$1 x i64>, <$1 x i32>) nounwind

-declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>, i32,
                                                 <$1 x i8>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>, i32,
                                                  <$1 x i16>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>, i32,
                                                  <$1 x i32>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>, i32,
                                                  <$1 x i64>, <$1 x i32>) nounwind

-declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>,
+declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>, i32,
                                                 <$1 x i8>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>,
+declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>, i32,
                                                  <$1 x i16>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>,
+declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>, i32,
                                                  <$1 x i32>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>,
+declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>, i32,
                                                  <$1 x i64>, <$1 x i32>) nounwind

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2732,34 +2741,43 @@ pl_done:
 define(`gen_gather', `
 ;; Define the utility function to do the gather operation for a single element
 ;; of the type
-define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x $2> %ret,
-                                    i32 %lane) nounwind readonly alwaysinline {
+define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
+                                    <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
   ; compute address for this one from the base
   %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
-  %ptroffset = getelementptr i8 * %ptr, i32 %offset32
-  %ptrcast = bitcast i8 * %ptroffset to $2 *
+  ; the order and details of the next 4 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %offset64 = sext i32 %offset32 to i64
+  %scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %scale64
+  %ptroffset = getelementptr i8 * %ptr, i64 %offset

   ; load value and insert into returned value
+  %ptrcast = bitcast i8 * %ptroffset to $2 *
   %val = load $2 *%ptrcast
   %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
   ret <$1 x $2> %updatedret
 }

-define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, <$1 x $2> %ret,
-                                    i32 %lane) nounwind readonly alwaysinline {
+define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
+                                    <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
   ; compute address for this one from the base
-  %offset32 = extractelement <$1 x i64> %offsets, i32 %lane
-  %ptroffset = getelementptr i8 * %ptr, i64 %offset32
-  %ptrcast = bitcast i8 * %ptroffset to $2 *
+  %offset64 = extractelement <$1 x i64> %offsets, i32 %lane
+  ; the order and details of the next 4 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %offset_scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %offset_scale64
+  %ptroffset = getelementptr i8 * %ptr, i64 %offset

   ; load value and insert into returned value
+  %ptrcast = bitcast i8 * %ptroffset to $2 *
   %val = load $2 *%ptrcast
   %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
   ret <$1 x $2> %updatedret
 }

-define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets,
+define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
                                              <$1 x i32> %vecmask) nounwind readonly alwaysinline {
   ; We can be clever and avoid the per-lane stuff for gathers if we are willing
   ; to require that the 0th element of the array being gathered from is always
@@ -2773,15 +2791,15 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets,
   %newOffsets = load <$1 x i32> * %offsetsPtr

   %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
-                                            <$1 x $2> undef, i32 0)
+                                            i32 %offset_scale, <$1 x $2> undef, i32 0)
  forloop(lane, 1, eval($1-1),
          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
-                                <$1 x i32> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
+                                <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
                    ', `LANE', lane), `PREV', eval(lane-1))')
   ret <$1 x $2> %ret`'eval($1-1)
 }

-define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets,
+define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
                                              <$1 x i32> %vecmask) nounwind readonly alwaysinline {
   ; We can be clever and avoid the per-lane stuff for gathers if we are willing
   ; to require that the 0th element of the array being gathered from is always
@@ -2795,10 +2813,10 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets,
   %newOffsets = load <$1 x i64> * %offsetsPtr

   %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
-                                            <$1 x $2> undef, i32 0)
+                                            i32 %offset_scale, <$1 x $2> undef, i32 0)
  forloop(lane, 1, eval($1-1),
          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
-                                <$1 x i64> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
+                                <$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
                    ', `LANE', lane), `PREV', eval(lane-1))')
   ret <$1 x $2> %ret`'eval($1-1)
 }
@@ -2848,42 +2866,52 @@ define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs,
 define(`gen_scatter', `
 ;; Define the function that describes the work to do to scatter a single
 ;; value
-define void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values,
-                                i32 %lane) nounwind alwaysinline {
+define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
+                                <$1 x $2> %values, i32 %lane) nounwind alwaysinline {
   %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
-  %offset64 = zext i32 %offset32 to i64
-  %ptrdelta = add i64 %ptr64, %offset64
-  %ptr = inttoptr i64 %ptrdelta to $2 *
+  ; the order and details of the next 4 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %offset64 = sext i32 %offset32 to i64
+  %scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %scale64
+  %ptroffset = getelementptr i8 * %ptr, i64 %offset
+
+  %ptrcast = bitcast i8 * %ptroffset to $2 *
   %storeval = extractelement <$1 x $2> %values, i32 %lane
-  store $2 %storeval, $2 * %ptr
+  store $2 %storeval, $2 * %ptrcast
   ret void
 }

-define void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values,
-                                i32 %lane) nounwind alwaysinline {
+define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
+                                <$1 x $2> %values, i32 %lane) nounwind alwaysinline {
   %offset64 = extractelement <$1 x i64> %offsets, i32 %lane
-  %ptrdelta = add i64 %ptr64, %offset64
-  %ptr = inttoptr i64 %ptrdelta to $2 *
+  ; the order and details of the next 4 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %scale64
+  %ptroffset = getelementptr i8 * %ptr, i64 %offset
+  %ptrcast = bitcast i8 * %ptroffset to $2 *
+
   %storeval = extractelement <$1 x $2> %values, i32 %lane
-  store $2 %storeval, $2 * %ptr
+  store $2 %storeval, $2 * %ptrcast
   ret void
 }

-define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, <$1 x $2> %values,
-                                         <$1 x i32> %mask) nounwind alwaysinline {
+define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
+                                         <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
   ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  %ptr64 = ptrtoint i8 * %base to i64
   per_lane($1, <$1 x i32> %mask, `
-      call void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values, i32 LANE)')
+      call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
+                                    <$1 x $2> %values, i32 LANE)')
   ret void
 }

-define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, <$1 x $2> %values,
-                                         <$1 x i32> %mask) nounwind alwaysinline {
+define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
+                                         <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
   ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  %ptr64 = ptrtoint i8 * %base to i64
   per_lane($1, <$1 x i32> %mask, `
-      call void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values, i32 LANE)')
+      call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
+                                    <$1 x $2> %values, i32 LANE)')
   ret void
 }

diff --git a/opt.cpp b/opt.cpp
index 25c950cb..c77a76f7 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -979,6 +979,81 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets) {
 }

+/* Returns true if the given value is a constant vector of integers with
+   the value 2, 4, or 8 in all of the elements.  (Returns the splatted value
+   in *splat, if so). */
+static bool
+lIs248Splat(llvm::Value *v, int *splat) {
+    llvm::ConstantVector *cvec = llvm::dyn_cast<llvm::ConstantVector>(v);
+    if (cvec == NULL)
+        return false;
+
+    llvm::ConstantInt *ci =
+        llvm::dyn_cast<llvm::ConstantInt>(cvec->getSplatValue());
+    if (ci == NULL)
+        return false;
+
+    int64_t splatVal = ci->getSExtValue();
+    if (splatVal != 2 && splatVal != 4 && splatVal != 8)
+        return false;
+
+    *splat = (int)splatVal;
+    return true;
+}
+
+
+/** Given a vector of integer offsets to a base pointer being used for a
+    gather or a scatter, see if its root operation is a multiply of some
+    value by a vector of all 2s/4s/8s.  If not, return NULL.
+
+    If it is, return an i32 value of 2, 4, or 8 from the function and modify
+    *vec so that it points to the operand that is being multiplied by
+    2/4/8.
+
+    We go through all this trouble so that we can pass the i32 scale factor
+    to the {gather,scatter}_base_offsets function as a separate scale
+    factor for the offsets.  This in turn is used so that the LLVM x86 code
+    generator matches it to apply x86's free scale by 2x, 4x, or 8x to one
+    of two registers being added together for an addressing calculation.
+ */
+static llvm::Value *
+lExtractOffsetVector248Scale(llvm::Value **vec) {
+    llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
+    if (sext != NULL) {
+        llvm::Value *sextOp = sext->getOperand(0);
+        // Check the sext target.
+        llvm::Value *scale = lExtractOffsetVector248Scale(&sextOp);
+        if (scale == NULL)
+            return NULL;
+
+        // make a new sext instruction so that we end up with the right
+        // type
+        *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
+        return scale;
+    }
+
+    // If we don't have a multiply, then just return a scale of 1
+    llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
+    if (bop == NULL || bop->getOpcode() != llvm::Instruction::Mul)
+        return LLVMInt32(1);
+
+    // Check each operand for being one of the scale factors we care about.
+    llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
+    int splat;
+    if (lIs248Splat(op0, &splat)) {
+        *vec = op1;
+        return LLVMInt32(splat);
+    }
+    else if (lIs248Splat(op1, &splat)) {
+        *vec = op0;
+        return LLVMInt32(splat);
+    }
+    else
+        return LLVMInt32(1);
+}
+
+
 struct GSInfo {
     GSInfo(const char *pgFuncName, const char *pgboFuncName,
            const char *pgbo32FuncName, bool ig)
@@ -1067,6 +1142,8 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
             // to the next instruction...
             continue;

+        llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
+
         // Cast the base pointer to a void *, since that's what the
         // __pseudo_*_base_offsets_* functions want.
         basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType,
@@ -1100,37 +1177,38 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
             // llvm::Instruction to llvm::CallInst::Create; this means that
             // the instruction isn't inserted into a basic block and that
             // way we can then call ReplaceInstWithInst().
-            llvm::Value *newArgs[3] = { basePtr, offsetVector, mask };
+            llvm::Value *newArgs[4] = { basePtr, offsetVector, offsetScale, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[3]);
+            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[4]);
             llvm::Instruction *newCall =
                 llvm::CallInst::Create(gatherScatterFunc, newArgArray, "newgather",
                                        (llvm::Instruction *)NULL);
 #else
             llvm::Instruction *newCall =
-                llvm::CallInst::Create(gatherScatterFunc, &newArgs[0], &newArgs[3],
+                llvm::CallInst::Create(gatherScatterFunc, &newArgs[0], &newArgs[4],
                                        "newgather");
 #endif
             lCopyMetadata(newCall, callInst);
             llvm::ReplaceInstWithInst(callInst, newCall);
         }
         else {
+            llvm::Value *storeValue = callInst->getArgOperand(1);
             llvm::Value *mask = callInst->getArgOperand(2);
-            llvm::Value *rvalue = callInst->getArgOperand(1);

             // Generate a new function call to the next pseudo scatter
             // base+offsets instruction.  See above for why passing NULL
             // for the Instruction * is intended.
-            llvm::Value *newArgs[4] = { basePtr, offsetVector, rvalue, mask };
+            llvm::Value *newArgs[5] = { basePtr, offsetVector, offsetScale,
+                                        storeValue, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[4]);
+            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[5]);
             llvm::Instruction *newCall =
                 llvm::CallInst::Create(gatherScatterFunc, newArgArray, "",
                                        (llvm::Instruction *)NULL);
 #else
             llvm::Instruction *newCall =
                 llvm::CallInst::Create(gatherScatterFunc, &newArgs[0],
-                                       &newArgs[4]);
+                                       &newArgs[5]);
 #endif
             lCopyMetadata(newCall, callInst);
             llvm::ReplaceInstWithInst(callInst, newCall);
@@ -1893,7 +1971,20 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         llvm::Value *base = callInst->getArgOperand(0);
         llvm::Value *offsets = callInst->getArgOperand(1);
-        llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
+        llvm::Value *offsetScale = callInst->getArgOperand(2);
+        llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(3) : NULL;
+        llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
+
+        llvm::ConstantInt *offsetScaleInt =
+            llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
+        assert(offsetScaleInt != NULL);
+
+        if (offsets->getType() == LLVMTypes::Int64VectorType)
+            // offsetScale is an i32, so sext it so that if we use it in a
+            // multiply below, it has the same type as the i64 offset used
+            // as the other operand...
+            offsetScale = new llvm::SExtInst(offsetScale, LLVMTypes::Int64Type,
+                                             "offset_sext", callInst);

         {
         std::vector<llvm::PHINode *> seenPhis;
@@ -1901,10 +1992,18 @@
             // If all the offsets are equal, then compute the single
             // pointer they all represent based on the first one of them
             // (arbitrarily).
+
+            // FIXME: the code from here to where ptr is computed is highly
+            // redundant with the case for a vector linear below.
+
             llvm::Value *firstOffset =
                 llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
                                                  callInst);
-            llvm::Value *indices[1] = { firstOffset };
+            llvm::Value *scaledOffset =
+                llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset,
+                                             offsetScale, "scaled_offset", callInst);
+
+            llvm::Value *indices[1] = { scaledOffset };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
             llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
             llvm::Value *ptr =
@@ -1945,9 +2044,8 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb)
                 Warning(pos, "Undefined behavior: all program instances are "
                         "writing to the same location!");

-                llvm::Value *rvalue = callInst->getArgOperand(2);
                 llvm::Value *first =
                     llvm::ExtractElementInst::Create(storeValue, LLVMInt32(0), "rvalue_first",
                                                      callInst);
                 lCopyMetadata(first, callInst);
                 ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0),
                                             "ptr2rvalue_type", callInst);
@@ -1965,8 +2063,11 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb)
         }

         int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
+        step /= (int)offsetScaleInt->getZExtValue();
+
         std::vector<llvm::PHINode *> seenPhis;
-        if (lVectorIsLinear(offsets, g->target.vectorWidth, step, seenPhis)) {
+        if (step > 0 && lVectorIsLinear(offsets, g->target.vectorWidth,
+                                        step, seenPhis)) {
             // We have a linear sequence of memory locations being accessed
             // starting with the location given by the offset from
             // offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
@@ -1976,7 +2077,11 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb)
             llvm::Value *firstOffset =
                 llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
                                                  callInst);
-            llvm::Value *indices[1] = { firstOffset };
+            llvm::Value *scaledOffset =
+                llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset,
+                                             offsetScale, "scaled_offset", callInst);
+
+            llvm::Value *indices[1] = { scaledOffset };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
             llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
             llvm::Value *ptr =
@@ -2006,11 +2111,10 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb)
         }
         else {
             Debug(pos, "Transformed scatter to unaligned vector store!");
-            llvm::Value *rvalue = callInst->getArgOperand(2);

             ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast",
                                         callInst);
-            llvm::Value *args[3] = { ptr, rvalue, mask };
+            llvm::Value *args[3] = { ptr, storeValue, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
             llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[3]);
             llvm::Instruction *newCall =
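To make the opt.cpp side concrete, here is the kind of IR shape lExtractOffsetVector248Scale() is aimed at (illustrative only; the 4-wide vectors and the %index, %offsets, %base, %mask, and %v names are assumptions, not taken from the patch). When GatherScatterFlattenOpt has computed an offset vector whose root operation is a multiply by a splat of 4, for example

    %offsets = mul <4 x i32> %index, <i32 4, i32 4, i32 4, i32 4>

the helper hands back an i32 constant 4 and repoints the offset operand at %index, so the flattened call that gets built looks like

    %v = call <4 x i32> @__pseudo_gather_base_offsets32_32(i8 * %base, <4 x i32> %index,
                                                           i32 4, <4 x i32> %mask)

rather than a call that passes the multiplied offsets with an offset_scale of 1. (A sext wrapped around such a multiply is looked through, and a new sext of the peeled operand is created in its place.) The sext/mul/getelementptr sequence emitted in the __gather_elt32_* and __scatter_elt32_* helpers then recomputes base + 4*index in the form that, per the comments above, the x86 code generator can fold into its free 2x/4x/8x scaled addressing.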