From ec0280be113b860f9cb6fec50d77a59e35bf25d2 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 11:06:30 -0700 Subject: [PATCH 01/15] Rename gather/scatter_base_offsets functions to *factored_base_offsets*. No functional change; just preparation for having a path that doesn't factor the offsets into constant and varying parts, which will be better for AVX2 and KNC. --- builtins/target-generic-common.ll | 8 +- builtins/util.m4 | 156 +++++++-------- examples/intrinsics/generic-16.h | 52 ++--- examples/intrinsics/generic-32.h | 52 ++--- examples/intrinsics/generic-64.h | 52 ++--- examples/intrinsics/knc.h | 40 ++-- examples/intrinsics/sse4.h | 40 ++-- opt.cpp | 322 +++++++++++++++--------------- 8 files changed, 361 insertions(+), 361 deletions(-) diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 48e7b836..c54dd948 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -334,18 +334,18 @@ define void @__masked_store_blend_double(* nocapture, ;; gather/scatter define(`gather_scatter', ` -declare @__gather_base_offsets32_$1(i8 * nocapture, , +declare @__gather_factored_base_offsets32_$1(i8 * nocapture, , i32, , ) nounwind readonly -declare @__gather_base_offsets64_$1(i8 * nocapture, , +declare @__gather_factored_base_offsets64_$1(i8 * nocapture, , i32, , ) nounwind readonly declare @__gather32_$1(, ) nounwind readonly declare @__gather64_$1(, ) nounwind readonly -declare void @__scatter_base_offsets32_$1(i8* nocapture, , +declare void @__scatter_factored_base_offsets32_$1(i8* nocapture, , i32, , , ) nounwind -declare void @__scatter_base_offsets64_$1(i8* nocapture, , +declare void @__scatter_factored_base_offsets64_$1(i8* nocapture, , i32, , , ) nounwind declare void @__scatter32_$1(, , ) nounwind diff --git a/builtins/util.m4 b/builtins/util.m4 index ce25a761..4a8822bb 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1599,7 +1599,7 @@ declare void 
@__pseudo_masked_store_double( * nocapture, @__pseudo_gather64_float(, ) declare @__pseudo_gather64_i64(, ) nounwind readonly declare @__pseudo_gather64_double(, ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i8(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i16(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i32(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_float(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i64(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_double(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i8(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i16(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i32(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_float(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i64(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_double(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , ) nounwind readonly ; Similarly 
to the pseudo-gathers defined above, we also declare undefined @@ -1660,7 +1660,7 @@ declare @__pseudo_gather_base_offsets64_double(i8 *, , , , , ) nounwind declare void @__pseudo_scatter64_double(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , , ) nounwind -declare void 
@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , , ) nounwind declare float @__log_uniform_float(float) nounwind readnone @@ -1872,103 +1872,103 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__usedouble( %g64_d) %pgbo32_8 = call - @__pseudo_gather_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use8( %pgbo32_8) %pgbo32_16 = call - @__pseudo_gather_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use16( %pgbo32_16) %pgbo32_32 = call - @__pseudo_gather_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use32( %pgbo32_32) %pgbo32_f = call - @__pseudo_gather_base_offsets32_float(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usefloat( %pgbo32_f) %pgbo32_64 = call - @__pseudo_gather_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use64( %pgbo32_64) %pgbo32_d = call - @__pseudo_gather_base_offsets32_double(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usedouble( %pgbo32_d) %gbo32_8 = call - @__gather_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use8( %gbo32_8) %gbo32_16 = call - @__gather_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, 
i32 0, %v32, %mask) call void @__use16( %gbo32_16) %gbo32_32 = call - @__gather_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use32( %gbo32_32) %gbo32_f = call - @__gather_base_offsets32_float(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usefloat( %gbo32_f) %gbo32_64 = call - @__gather_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use64( %gbo32_64) %gbo32_d = call - @__gather_base_offsets32_double(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usedouble( %gbo32_d) %pgbo64_8 = call - @__pseudo_gather_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use8( %pgbo64_8) %pgbo64_16 = call - @__pseudo_gather_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use16( %pgbo64_16) %pgbo64_32 = call - @__pseudo_gather_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use32( %pgbo64_32) %pgbo64_f = call - @__pseudo_gather_base_offsets64_float(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usefloat( %pgbo64_f) %pgbo64_64 = call - @__pseudo_gather_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use64( %pgbo64_64) %pgbo64_d = call - @__pseudo_gather_base_offsets64_double(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usedouble( %pgbo64_d) %gbo64_8 = call - 
@__gather_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use8( %gbo64_8) %gbo64_16 = call - @__gather_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use16( %gbo64_16) %gbo64_32 = call - @__gather_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use32( %gbo64_32) %gbo64_f = call - @__gather_base_offsets64_float(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usefloat( %gbo64_f) %gbo64_64 = call - @__gather_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use64( %gbo64_64) %gbo64_d = call - @__gather_base_offsets64_double(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usedouble( %gbo64_d) @@ -2003,56 +2003,56 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__scatter64_i64( %v64, %v64, %mask) call void @__scatter64_double( %v64, %vd, %mask) - call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) - call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %v16, %mask) - call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %v32, %mask) - call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %vf, %mask) - call void 
@__pseudo_scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %v64, %mask) - call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %vd, %mask) - call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %v8, %mask) - call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %v16, %mask) - call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %v32, %mask) - call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %vf, %mask) - call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %v64, %mask) - call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %vd, %mask) - call void @__scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) - call void @__scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %v16, %mask) - call void @__scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %v32, %mask) - call void @__scatter_base_offsets32_float(i8 * %ptr, %v32, i32 
0, %v32, + call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %vf, %mask) - call void @__scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %v64, %mask) - call void @__scatter_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %vd, %mask) - call void @__scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %v8, %mask) - call void @__scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %v16, %mask) - call void @__scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %v32, %mask) - call void @__scatter_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %vf, %mask) - call void @__scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %v64, %mask) - call void @__scatter_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %vd, %mask) ret void @@ -3245,7 +3245,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o } -define @__gather_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, +define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing @@ -3276,7 +3276,7 @@ define @__gather_base_offsets32_$1(i8 * %ptr, %offset ret %ret`'eval(WIDTH-1) } -define @__gather_base_offsets64_$1(i8 * %ptr, %offsets, i32 
%offset_scale, +define @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing @@ -3391,7 +3391,7 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s ret void } -define void @__scatter_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, +define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... @@ -3401,7 +3401,7 @@ define void @__scatter_base_offsets32_$1(i8* %base, %offsets, i32 ret void } -define void @__scatter_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, +define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 1851ff7e..c18e9fbe 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1306,7 +1306,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ __vec16_i1 mask) { \ @@ -1322,18 +1322,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ } -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) -GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) -GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) -GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) -GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, 
__gather_factored_base_offsets32_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_factored_base_offsets32_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_factored_base_offsets32_float) +GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_factored_base_offsets64_float) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_factored_base_offsets32_double) +GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_factored_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ @@ -1361,7 +1361,7 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double) // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ VTYPE val, __vec16_i1 mask) { \ @@ -1375,18 +1375,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ } -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) 
-SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) -SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) -SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_factored_base_offsets32_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_factored_base_offsets32_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_factored_base_offsets64_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_factored_base_offsets32_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_factored_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ diff --git 
a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 628aab84..c1f89cd8 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1374,7 +1374,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ __vec32_i1 mask) { \ @@ -1390,18 +1390,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ } -GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8) -GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16) -GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16) -GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32) -GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32) -GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_base_offsets32_float) -GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_base_offsets64_float) -GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64) -GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64) -GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_base_offsets32_double) -GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_base_offsets64_double) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_factored_base_offsets32_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_factored_base_offsets64_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_factored_base_offsets32_i16) 
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_factored_base_offsets64_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_factored_base_offsets32_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_factored_base_offsets64_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_factored_base_offsets32_float) +GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_factored_base_offsets64_float) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_factored_base_offsets32_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_factored_base_offsets64_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_factored_base_offsets32_double) +GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_factored_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \ @@ -1429,7 +1429,7 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double) // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ VTYPE val, __vec32_i1 mask) { \ @@ -1443,18 +1443,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ } -SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8) -SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16) -SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16) -SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32) -SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, 
__scatter_base_offsets64_i32) -SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_base_offsets32_float) -SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_base_offsets64_float) -SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64) -SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64) -SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_base_offsets32_double) -SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_base_offsets64_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_factored_base_offsets32_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_factored_base_offsets64_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_factored_base_offsets32_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_factored_base_offsets64_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_factored_base_offsets32_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_factored_base_offsets64_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_factored_base_offsets32_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_factored_base_offsets64_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_factored_base_offsets32_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_factored_base_offsets64_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_factored_base_offsets32_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_factored_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \ diff --git a/examples/intrinsics/generic-64.h 
b/examples/intrinsics/generic-64.h index 2630e306..2a54446e 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1507,7 +1507,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ __vec64_i1 mask) { \ @@ -1523,18 +1523,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ } -GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_base_offsets64_i8) -GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i16) -GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_base_offsets64_i16) -GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32) -GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32) -GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float) -GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float) -GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64) -GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64) -GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double) -GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_factored_base_offsets32_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_factored_base_offsets64_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_factored_base_offsets32_i16) 
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_factored_base_offsets64_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_factored_base_offsets32_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_factored_base_offsets64_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_factored_base_offsets32_float) +GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_factored_base_offsets64_float) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_factored_base_offsets32_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_factored_base_offsets64_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_factored_base_offsets32_double) +GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_factored_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \ @@ -1562,7 +1562,7 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double) // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ VTYPE val, __vec64_i1 mask) { \ @@ -1576,18 +1576,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ } -SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_base_offsets64_i8) -SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32_i16) -SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16) -SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32) -SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, 
__scatter_base_offsets64_i32) -SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float) -SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float) -SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64) -SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64) -SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_base_offsets32_double) -SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_factored_base_offsets32_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_factored_base_offsets64_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_factored_base_offsets32_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_factored_base_offsets64_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_factored_base_offsets32_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_factored_base_offsets64_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_factored_base_offsets32_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_factored_base_offsets64_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_factored_base_offsets32_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_factored_base_offsets64_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_factored_base_offsets32_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_factored_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) { \ diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h 
index eceeb885..fb11db11 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1940,7 +1940,7 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) +#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) /* static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ @@ -1958,7 +1958,7 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ */ static FORCEINLINE __vec16_i32 -__gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, +__gather_factored_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, uint32_t scale, __vec16_i32 constOffset, __vec16_i1 mask) { __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); @@ -1973,7 +1973,7 @@ __gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, } static FORCEINLINE __vec16_f -__gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, +__gather_factored_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, uint32_t scale, __vec16_i32 constOffset, __vec16_i1 mask) { __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); @@ -1987,13 +1987,13 @@ __gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, return ret; } -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) -GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) 
-GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /* @@ -2039,7 +2039,7 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) */ // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) +#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) /* static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ @@ -2054,16 +2054,16 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ } */ -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, 
__scatter_factored_base_offsets32_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) static FORCEINLINE void -__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, +__scatter_factored_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, uint32_t scale, __vec16_i32 constOffset, __vec16_i32 val, __vec16_i1 mask) { @@ -2072,7 +2072,7 @@ __scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, } static FORCEINLINE void -__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset, +__scatter_factored_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset, uint32_t scale, const __vec16_i32 &constOffset, const __vec16_f &val, const __vec16_i1 mask) { diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index fcc14618..088b694d 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -3007,84 +3007,84 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, } static FORCEINLINE __vec4_i8 -__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, +__gather_factored_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i8 -__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, 
+__gather_factored_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i16 -__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, +__gather_factored_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i16 - __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, + __gather_factored_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i32 -__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, +__gather_factored_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i32 -__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, +__gather_factored_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_f -__gather_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale, +__gather_factored_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_f -__gather_base_offsets64_float(unsigned char *p, __vec4_i64 offsets, 
+__gather_factored_base_offsets64_float(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i64 -__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, +__gather_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i64 -__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, +__gather_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_d -__gather_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, +__gather_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_d -__gather_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, +__gather_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale, constOffset, mask); @@ -3252,7 +3252,7 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) { #define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \ static FORCEINLINE void \ -__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ +__scatter_factored_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ uint32_t scale, __vec4_i32 constOffset, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 
0); \ @@ -3281,7 +3281,7 @@ __scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ } \ } \ static FORCEINLINE void \ -__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ +__scatter_factored_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ uint32_t scale, __vec4_i64 constOffset, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ @@ -3322,7 +3322,7 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float) static FORCEINLINE void -__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, +__scatter_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); @@ -3359,7 +3359,7 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, } static FORCEINLINE void -__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, +__scatter_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); @@ -3396,17 +3396,17 @@ __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, } static FORCEINLINE void -__scatter_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, +__scatter_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_d val, __vec4_i1 mask) { - __scatter_base_offsets32_i64(p, offsets, scale, constOffset, val, mask); + __scatter_factored_base_offsets32_i64(p, offsets, scale, constOffset, val, mask); } static FORCEINLINE void -__scatter_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, +__scatter_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_d val, __vec4_i1 mask) { - __scatter_base_offsets64_i64(p, offsets, scale, constOffset, val, mask); + 
__scatter_factored_base_offsets64_i64(p, offsets, scale, constOffset, val, mask); } diff --git a/opt.cpp b/opt.cpp index 1456dfd7..1140c9ce 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1689,57 +1689,57 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { }; GSInfo gsFuncs[] = { - GSInfo("__pseudo_gather32_i8", "__pseudo_gather_base_offsets32_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSInfo("__pseudo_gather32_i16", "__pseudo_gather_base_offsets32_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSInfo("__pseudo_gather32_i32", "__pseudo_gather_base_offsets32_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSInfo("__pseudo_gather32_float", "__pseudo_gather_base_offsets32_float", - "__pseudo_gather_base_offsets32_float", true), - GSInfo("__pseudo_gather32_i64", "__pseudo_gather_base_offsets32_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSInfo("__pseudo_gather32_double", "__pseudo_gather_base_offsets32_double", - "__pseudo_gather_base_offsets32_double", true), + GSInfo("__pseudo_gather32_i8", "__pseudo_gather_factored_base_offsets32_i8", + "__pseudo_gather_factored_base_offsets32_i8", true), + GSInfo("__pseudo_gather32_i16", "__pseudo_gather_factored_base_offsets32_i16", + "__pseudo_gather_factored_base_offsets32_i16", true), + GSInfo("__pseudo_gather32_i32", "__pseudo_gather_factored_base_offsets32_i32", + "__pseudo_gather_factored_base_offsets32_i32", true), + GSInfo("__pseudo_gather32_float", "__pseudo_gather_factored_base_offsets32_float", + "__pseudo_gather_factored_base_offsets32_float", true), + GSInfo("__pseudo_gather32_i64", "__pseudo_gather_factored_base_offsets32_i64", + "__pseudo_gather_factored_base_offsets32_i64", true), + GSInfo("__pseudo_gather32_double", "__pseudo_gather_factored_base_offsets32_double", + "__pseudo_gather_factored_base_offsets32_double", true), - GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_base_offsets32_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSInfo("__pseudo_scatter32_i16", 
"__pseudo_scatter_base_offsets32_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_base_offsets32_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_base_offsets32_float", - "__pseudo_scatter_base_offsets32_float", false), - GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_base_offsets32_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_base_offsets32_double", - "__pseudo_scatter_base_offsets32_double", false), + GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_factored_base_offsets32_i8", + "__pseudo_scatter_factored_base_offsets32_i8", false), + GSInfo("__pseudo_scatter32_i16", "__pseudo_scatter_factored_base_offsets32_i16", + "__pseudo_scatter_factored_base_offsets32_i16", false), + GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_factored_base_offsets32_i32", + "__pseudo_scatter_factored_base_offsets32_i32", false), + GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_factored_base_offsets32_float", + "__pseudo_scatter_factored_base_offsets32_float", false), + GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_factored_base_offsets32_i64", + "__pseudo_scatter_factored_base_offsets32_i64", false), + GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_factored_base_offsets32_double", + "__pseudo_scatter_factored_base_offsets32_double", false), - GSInfo("__pseudo_gather64_i8", "__pseudo_gather_base_offsets64_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSInfo("__pseudo_gather64_i16", "__pseudo_gather_base_offsets64_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSInfo("__pseudo_gather64_i32", "__pseudo_gather_base_offsets64_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSInfo("__pseudo_gather64_float", "__pseudo_gather_base_offsets64_float", - "__pseudo_gather_base_offsets32_float", true), - GSInfo("__pseudo_gather64_i64", 
"__pseudo_gather_base_offsets64_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSInfo("__pseudo_gather64_double", "__pseudo_gather_base_offsets64_double", - "__pseudo_gather_base_offsets32_double", true), + GSInfo("__pseudo_gather64_i8", "__pseudo_gather_factored_base_offsets64_i8", + "__pseudo_gather_factored_base_offsets32_i8", true), + GSInfo("__pseudo_gather64_i16", "__pseudo_gather_factored_base_offsets64_i16", + "__pseudo_gather_factored_base_offsets32_i16", true), + GSInfo("__pseudo_gather64_i32", "__pseudo_gather_factored_base_offsets64_i32", + "__pseudo_gather_factored_base_offsets32_i32", true), + GSInfo("__pseudo_gather64_float", "__pseudo_gather_factored_base_offsets64_float", + "__pseudo_gather_factored_base_offsets32_float", true), + GSInfo("__pseudo_gather64_i64", "__pseudo_gather_factored_base_offsets64_i64", + "__pseudo_gather_factored_base_offsets32_i64", true), + GSInfo("__pseudo_gather64_double", "__pseudo_gather_factored_base_offsets64_double", + "__pseudo_gather_factored_base_offsets32_double", true), - GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_base_offsets64_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSInfo("__pseudo_scatter64_i16", "__pseudo_scatter_base_offsets64_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_base_offsets64_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_base_offsets64_float", - "__pseudo_scatter_base_offsets32_float", false), - GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_base_offsets64_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_base_offsets64_double", - "__pseudo_scatter_base_offsets32_double", false), + GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_factored_base_offsets64_i8", + "__pseudo_scatter_factored_base_offsets32_i8", false), + GSInfo("__pseudo_scatter64_i16", 
"__pseudo_scatter_factored_base_offsets64_i16", + "__pseudo_scatter_factored_base_offsets32_i16", false), + GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_factored_base_offsets64_i32", + "__pseudo_scatter_factored_base_offsets32_i32", false), + GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_factored_base_offsets64_float", + "__pseudo_scatter_factored_base_offsets32_float", false), + GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_factored_base_offsets64_i64", + "__pseudo_scatter_factored_base_offsets32_i64", false), + GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_factored_base_offsets64_double", + "__pseudo_scatter_factored_base_offsets32_double", false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1858,57 +1858,57 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { }; GSBOInfo gsFuncs[] = { - GSBOInfo("__pseudo_gather_base_offsets32_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_base_offsets32_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_base_offsets32_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_base_offsets32_float", - "__pseudo_gather_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_base_offsets32_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_base_offsets32_double", - "__pseudo_gather_base_offsets32_double", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_i8", + "__pseudo_gather_factored_base_offsets32_i8", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_i16", + "__pseudo_gather_factored_base_offsets32_i16", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_i32", + "__pseudo_gather_factored_base_offsets32_i32", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_float", + "__pseudo_gather_factored_base_offsets32_float", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_i64", + "__pseudo_gather_factored_base_offsets32_i64", 
true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_double", + "__pseudo_gather_factored_base_offsets32_double", true), - GSBOInfo( "__pseudo_scatter_base_offsets32_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_base_offsets32_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_base_offsets32_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_base_offsets32_float", - "__pseudo_scatter_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_base_offsets32_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_base_offsets32_double", - "__pseudo_scatter_base_offsets32_double", false), + GSBOInfo( "__pseudo_scatter_factored_base_offsets32_i8", + "__pseudo_scatter_factored_base_offsets32_i8", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_i16", + "__pseudo_scatter_factored_base_offsets32_i16", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_i32", + "__pseudo_scatter_factored_base_offsets32_i32", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_float", + "__pseudo_scatter_factored_base_offsets32_float", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_i64", + "__pseudo_scatter_factored_base_offsets32_i64", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_double", + "__pseudo_scatter_factored_base_offsets32_double", false), - GSBOInfo( "__pseudo_gather_base_offsets64_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_base_offsets64_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_base_offsets64_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_base_offsets64_float", - "__pseudo_gather_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_base_offsets64_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_base_offsets64_double", - 
"__pseudo_gather_base_offsets32_double", true), + GSBOInfo( "__pseudo_gather_factored_base_offsets64_i8", + "__pseudo_gather_factored_base_offsets32_i8", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_i16", + "__pseudo_gather_factored_base_offsets32_i16", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_i32", + "__pseudo_gather_factored_base_offsets32_i32", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_float", + "__pseudo_gather_factored_base_offsets32_float", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_i64", + "__pseudo_gather_factored_base_offsets32_i64", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_double", + "__pseudo_gather_factored_base_offsets32_double", true), - GSBOInfo( "__pseudo_scatter_base_offsets64_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_base_offsets64_float", - "__pseudo_scatter_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_base_offsets64_double", - "__pseudo_scatter_base_offsets32_double", false), + GSBOInfo( "__pseudo_scatter_factored_base_offsets64_i8", + "__pseudo_scatter_factored_base_offsets32_i8", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_i16", + "__pseudo_scatter_factored_base_offsets32_i16", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_i32", + "__pseudo_scatter_factored_base_offsets32_i32", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_float", + "__pseudo_scatter_factored_base_offsets32_float", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_i64", + "__pseudo_scatter_factored_base_offsets32_i64", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_double", + 
"__pseudo_scatter_factored_base_offsets32_double", false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -2025,29 +2025,29 @@ lGSToLoadStore(llvm::CallInst *callInst) { }; GatherImpInfo gInfo[] = { - GatherImpInfo("__pseudo_gather_base_offsets32_i8", "__masked_load_i8", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i8", "__masked_load_i8", LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_base_offsets32_i16", "__masked_load_i16", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i16", "__masked_load_i16", LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_base_offsets32_i32", "__masked_load_i32", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i32", "__masked_load_i32", LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_base_offsets32_float", "__masked_load_float", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_float", "__masked_load_float", LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_base_offsets32_i64", "__masked_load_i64", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i64", "__masked_load_i64", LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_base_offsets32_double", "__masked_load_double", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_double", "__masked_load_double", LLVMTypes::DoubleType, 8), - GatherImpInfo("__pseudo_gather_base_offsets64_i8", "__masked_load_i8", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i8", "__masked_load_i8", LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_base_offsets64_i16", "__masked_load_i16", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i16", "__masked_load_i16", LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_base_offsets64_i32", "__masked_load_i32", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i32", "__masked_load_i32", LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_base_offsets64_float", "__masked_load_float", + 
GatherImpInfo("__pseudo_gather_factored_base_offsets64_float", "__masked_load_float", LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_base_offsets64_i64", "__masked_load_i64", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i64", "__masked_load_i64", LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_base_offsets64_double", "__masked_load_double", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_double", "__masked_load_double", LLVMTypes::DoubleType, 8) }; @@ -2067,29 +2067,29 @@ lGSToLoadStore(llvm::CallInst *callInst) { }; ScatterImpInfo sInfo[] = { - ScatterImpInfo("__pseudo_scatter_base_offsets32_i8", "__pseudo_masked_store_i8", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i16", "__pseudo_masked_store_i16", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i32", "__pseudo_masked_store_i32", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i32", "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_float", "__pseudo_masked_store_float", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_float", "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i64", "__pseudo_masked_store_i64", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i64", "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets32_double", "__pseudo_masked_store_double", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_double", "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i8", 
"__pseudo_masked_store_i8", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i16", "__pseudo_masked_store_i16", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i32", "__pseudo_masked_store_i32", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i32", "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets64_float", "__pseudo_masked_store_float", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_float", "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i64", "__pseudo_masked_store_i64", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i64", "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets64_double", "__pseudo_masked_store_double", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_double", "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8) }; @@ -3354,10 +3354,10 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("GatherCoalescePass"); llvm::Function *gatherFuncs[] = { - m->module->getFunction("__pseudo_gather_base_offsets32_i32"), - m->module->getFunction("__pseudo_gather_base_offsets32_float"), - m->module->getFunction("__pseudo_gather_base_offsets64_i32"), - m->module->getFunction("__pseudo_gather_base_offsets64_float"), + m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"), + m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"), + m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"), + m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"), }; int 
nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]); @@ -3367,7 +3367,7 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) { for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { // Iterate over all of the instructions and look for calls to - // __pseudo_gather_base_offsets{32,64}_{i32,float} calls. + // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} calls. llvm::CallInst *callInst = llvm::dyn_cast(&*iter); if (callInst == NULL) continue; @@ -3639,19 +3639,19 @@ lReplacePseudoGS(llvm::CallInst *callInst) { }; LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true), - LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true), - LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true), - LowerGSInfo("__pseudo_gather_base_offsets32_float", "__gather_base_offsets32_float", true), - LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true), - LowerGSInfo("__pseudo_gather_base_offsets32_double", "__gather_base_offsets32_double", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true), - LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true), - LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true), - 
LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true), - LowerGSInfo("__pseudo_gather_base_offsets64_float", "__gather_base_offsets64_float", true), - LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true), - LowerGSInfo("__pseudo_gather_base_offsets64_double", "__gather_base_offsets64_double", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true), LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), @@ -3667,19 +3667,19 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true), - LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_float", "__scatter_base_offsets32_float", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_double", "__scatter_base_offsets32_double", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", 
"__scatter_factored_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double", false), LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), @@ -3899,12 
+3899,12 @@ bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { "__fast_masked_vload", - "__gather_base_offsets32_i8", "__gather_base_offsets32_i16", - "__gather_base_offsets32_i32", "__gather_base_offsets32_i64", - "__gather_base_offsets32_float", "__gather_base_offsets32_double", - "__gather_base_offsets64_i8", "__gather_base_offsets64_i16", - "__gather_base_offsets64_i32", "__gather_base_offsets64_i64", - "__gather_base_offsets64_float", "__gather_base_offsets64_double", + "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", + "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64", + "__gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_double", + "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16", + "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64", + "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double", "__gather32_i8", "__gather32_i16", "__gather32_i32", "__gather32_i64", "__gather32_float", "__gather32_double", @@ -3926,12 +3926,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__masked_store_blend_i8", "__masked_store_blend_i16", "__masked_store_blend_i32", "__masked_store_blend_i64", "__masked_store_blend_float", "__masked_store_blend_double", - "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16", - "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64", - "__scatter_base_offsets32_float", "__scatter_base_offsets32_double", - "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16", - "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64", - "__scatter_base_offsets64_float", "__scatter_base_offsets64_double", + "__scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i16", + "__scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i64", + 
"__scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_double", + "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16", + "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64", + "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double", "__scatter_elt32_i8", "__scatter_elt32_i16", "__scatter_elt32_i32", "__scatter_elt32_i64", "__scatter_elt32_float", "__scatter_elt32_double", From 10b79fb41b71caeb94b40f4bff0f6efb7fef0c4a Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 14:09:06 -0700 Subject: [PATCH 02/15] Add support for non-factored variants of gather/scatter functions. We now have two ways of approaching gather/scatters with a common base pointer and with offset vectors. For targets with native gather/scatter, we just turn those into base + {1/2/4/8}*offsets. For targets without, we turn those into base + {1/2/4/8}*varying_offsets + const_offsets, where const_offsets is a compile-time constant. Infrastructure for issue #325. 
--- builtins/util.m4 | 482 +++++++++++++++++++++------ ispc.cpp | 1 + ispc.h | 8 +- opt.cpp | 830 +++++++++++++++++++++++++++++++++-------------- 4 files changed, 965 insertions(+), 356 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 4a8822bb..26466b1f 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1579,7 +1579,7 @@ declare void @__pseudo_masked_store_double( * nocapture, * nocapture, @__pseudo_gather32_i8(, ) nounwind readonly declare @__pseudo_gather32_i16(, ) nounwind readonly @@ -1621,31 +1606,106 @@ declare @__pseudo_gather64_float(, ) declare @__pseudo_gather64_i64(, ) nounwind readonly declare @__pseudo_gather64_double(, ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , - ) nounwind readonly +; The ImproveMemoryOps optimization pass finds these calls and then +; tries to convert them to be calls to gather functions that take a uniform +; base pointer and then a varying integer offset, when possible. +; +; For targets without a native gather instruction, it is best to factor the +; integer offsets like "{1/2/4/8} * varying_offset + constant_offset", +; where varying_offset includes non-compile time constant values, and +; constant_offset includes compile-time constant values. (The scalar loads +; generated in turn can then take advantage of the free offsetting and scale by +; 1/2/4/8 that is offered by the x86 addresisng modes.) 
+; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; int{32,64} offsets, uniform int32 offset_scale, +; int{32,64} offset_delta, mask) +; +; For targets with a gather instruction, it is better to just factor them into +; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the +; offsets are int32/64 vectors. +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; uniform int32 offset_scale, int{32,64} offsets, mask) -declare @__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , - ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , + ) nounwind readonly 
+declare +@__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets32_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_double(i8 *, i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets64_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_double(i8 *, i32, , + ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: @@ -1657,16 +1717,6 @@ declare @__pseudo_gather_factored_base_offsets64_double(i8 *, < ; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask) ; void __pseudo_scatter_double(varying double *, varying double values, mask) ; -; The GatherScatterFlattenOpt optimization pass also finds these and -; transforms them to scatters like: -; -; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, -; varying int32 offsets, uniform int32 offset_scale, -; varying int{32,64} offset_delta, varying int8 values, mask) -; (and similarly for 16/32/64 bit values) -; -; And the 
GSImprovementsPass in turn converts these to actual native -; scatters or masked stores. declare void @__pseudo_scatter32_i8(, , ) nounwind declare void @__pseudo_scatter32_i16(, , ) nounwind @@ -1682,31 +1732,96 @@ declare void @__pseudo_scatter64_float(, , , , ) nounwind declare void @__pseudo_scatter64_double(, , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , - , ) nounwind +; And the ImproveMemoryOps optimization pass also finds these and +; either transforms them to scatters like: +; +; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, +; varying int32 offsets, uniform int32 offset_scale, +; varying int{32,64} offset_delta, varying int8 values, mask) +; (and similarly for 16/32/64 bit values) +; +; Or, if the target has a native scatter instruction: +; +; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base, +; uniform int32 offset_scale, varying int{32,64} offsets, +; varying int8 values, mask) +; (and similarly for 16/32/64 bit values) -declare void @__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , - , ) nounwind -declare void 
@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , - , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, , + , ) nounwind + +declare void 
+@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , + , ) nounwind declare float @__log_uniform_float(float) nounwind readnone declare @__log_varying_float() nounwind readnone @@ -1871,6 +1986,109 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %mask) call void @__usedouble( %g64_d) +ifelse(HAVE_GATHER, `1', +` + %nfpgbo32_8 = call + @__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfpgbo32_8) + %nfpgbo32_16 = call + @__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfpgbo32_16) + %nfpgbo32_32 = call + @__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfpgbo32_32) + %nfpgbo32_f = call + @__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfpgbo32_f) + %nfpgbo32_64 = call + @__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfpgbo32_64) + %nfpgbo32_d = call + @__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfpgbo32_d) + + %nfpgbo64_8 = call + @__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfpgbo64_8) + %nfpgbo64_16 = call + @__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfpgbo64_16) + %nfpgbo64_32 = call + @__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfpgbo64_32) + %nfpgbo64_f = call + 
@__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfpgbo64_f) + %nfpgbo64_64 = call + @__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfpgbo64_64) + %nfpgbo64_d = call + @__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfpgbo64_d) + + %nfgbo32_8 = call + @__gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfgbo32_8) + %nfgbo32_16 = call + @__gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfgbo32_16) + %nfgbo32_32 = call + @__gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfgbo32_32) + %nfgbo32_f = call + @__gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfgbo32_f) + %nfgbo32_64 = call + @__gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfgbo32_64) + %nfgbo32_d = call + @__gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfgbo32_d) + + %nfgbo64_8 = call + @__gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfgbo64_8) + %nfgbo64_16 = call + @__gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfgbo64_16) + %nfgbo64_32 = call + @__gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfgbo64_32) + %nfgbo64_f = call + @__gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfgbo64_f) + %nfgbo64_64 = call + @__gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfgbo64_64) + %nfgbo64_d = call + @__gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfgbo64_d) +', +` %pgbo32_8 = call @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) @@ -1896,32 +2114,6 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, 
%v16, %v32, %mask) call void @__usedouble( %pgbo32_d) - %gbo32_8 = call - @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use8( %gbo32_8) - %gbo32_16 = call - @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use16( %gbo32_16) - %gbo32_32 = call - @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use32( %gbo32_32) - %gbo32_f = call - @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__usefloat( %gbo32_f) - %gbo32_64 = call - @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use64( %gbo32_64) - %gbo32_d = call - @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__usedouble( %gbo32_d) - - %pgbo64_8 = call @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) @@ -1947,6 +2139,31 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %v64, %mask) call void @__usedouble( %pgbo64_d) + %gbo32_8 = call + @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %gbo32_8) + %gbo32_16 = call + @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %gbo32_16) + %gbo32_32 = call + @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %gbo32_32) + %gbo32_f = call + @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usefloat( %gbo32_f) + %gbo32_64 = call + @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %gbo32_64) + %gbo32_d = call + @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %gbo32_d) + %gbo64_8 = call @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) @@ -1970,7 +2187,8 @@ define void 
@__keep_funcs_live(i8 * %ptr, %v8, %v16, %gbo64_d = call @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) - call void @__usedouble( %gbo64_d) + call void @__usedouble( %pgbo64_d) +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; scatters @@ -2003,6 +2221,61 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__scatter64_i64( %v64, %v64, %mask) call void @__scatter64_double( %v64, %vd, %mask) +ifelse(HAVE_SCATTER, `1', +` + call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) + + call void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void 
@__scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) +', +` call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, @@ -2054,6 +2327,7 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %v64, %mask) call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %vd, %mask) +') ret void } diff --git a/ispc.cpp b/ispc.cpp index 2b98de86..0980c3d2 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -212,6 +212,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, // This is the case for most of them t->hasHalf = t->hasRand = t->hasTranscendentals = false; + t->hasGather = t->hasScatter = false; if (!strcasecmp(isa, "sse2")) { t->isa = Target::SSE2; diff --git a/ispc.h b/ispc.h index 9632f514..66191844 100644 --- a/ispc.h +++ b/ispc.h @@ -252,9 +252,15 @@ struct Target { conversions. */ bool hasHalf; - /** Indicates whether there is an ISA random number instruciton. */ + /** Indicates whether there is an ISA random number instruction. */ bool hasRand; + /** Indicates whether the target has a native gather instruction */ + bool hasGather; + + /** Indicates whether the target has a native scatter instruction */ + bool hasScatter; + /** Indicates whether the target has support for transcendentals (beyond sqrt, which we assume that all of them handle). 
*/ bool hasTranscendentals; diff --git a/opt.cpp b/opt.cpp index 1140c9ce..824c5bc7 100644 --- a/opt.cpp +++ b/opt.cpp @@ -225,7 +225,6 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, } -#if 0 static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2, llvm::Value *arg3, const char *name, @@ -234,7 +233,6 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::ArrayRef newArgArray(&args[0], &args[4]); return llvm::CallInst::Create(func, newArgArray, name, insertBefore); } -#endif static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, @@ -1673,6 +1671,39 @@ lOffsets32BitSafe(llvm::Value **variableOffsetPtr, } +/** Check to see if the single offset vector can safely be represented with + 32-bit values. If so, return true and update the pointed-to + llvm::Value * to be the 32-bit equivalent. */ +static bool +lOffsets32BitSafe(llvm::Value **offsetPtr, + llvm::Instruction *insertBefore) { + llvm::Value *offset = *offsetPtr; + + if (offset->getType() == LLVMTypes::Int32VectorType) + return true; + + llvm::SExtInst *sext = llvm::dyn_cast(offset); + if (sext != NULL && + sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) { + // sext of a 32-bit vector -> the 32-bit vector is good + *offsetPtr = sext->getOperand(0); + return true; + } + else if (lVectorIs32BitInts(offset)) { + // The only constant vector we should have here is a vector of + // all zeros (i.e. a ConstantAggregateZero, but just in case, + // do the more general check with lVectorIs32BitInts(). 
+ *offsetPtr = + new llvm::TruncInst(offset, LLVMTypes::Int32VectorType, + LLVMGetName(offset, "_trunc"), + insertBefore); + return true; + } + else + return false; +} + + static bool lGSToGSBaseOffsets(llvm::CallInst *callInst) { struct GSInfo { @@ -1689,57 +1720,153 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { }; GSInfo gsFuncs[] = { - GSInfo("__pseudo_gather32_i8", "__pseudo_gather_factored_base_offsets32_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSInfo("__pseudo_gather32_i16", "__pseudo_gather_factored_base_offsets32_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSInfo("__pseudo_gather32_i32", "__pseudo_gather_factored_base_offsets32_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSInfo("__pseudo_gather32_float", "__pseudo_gather_factored_base_offsets32_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSInfo("__pseudo_gather32_i64", "__pseudo_gather_factored_base_offsets32_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSInfo("__pseudo_gather32_double", "__pseudo_gather_factored_base_offsets32_double", - "__pseudo_gather_factored_base_offsets32_double", true), + GSInfo("__pseudo_gather32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSInfo("__pseudo_gather32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSInfo("__pseudo_gather32_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + g->target.hasGather ? 
"__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSInfo("__pseudo_gather32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSInfo("__pseudo_gather32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSInfo("__pseudo_gather32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_factored_base_offsets32_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSInfo("__pseudo_scatter32_i16", "__pseudo_scatter_factored_base_offsets32_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_factored_base_offsets32_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_factored_base_offsets32_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_factored_base_offsets32_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_factored_base_offsets32_double", - "__pseudo_scatter_factored_base_offsets32_double", false), + GSInfo("__pseudo_scatter32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSInfo("__pseudo_scatter32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSInfo("__pseudo_scatter32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSInfo("__pseudo_scatter32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSInfo("__pseudo_scatter32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSInfo("__pseudo_scatter32_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), - GSInfo("__pseudo_gather64_i8", "__pseudo_gather_factored_base_offsets64_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSInfo("__pseudo_gather64_i16", "__pseudo_gather_factored_base_offsets64_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSInfo("__pseudo_gather64_i32", "__pseudo_gather_factored_base_offsets64_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSInfo("__pseudo_gather64_float", "__pseudo_gather_factored_base_offsets64_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSInfo("__pseudo_gather64_i64", "__pseudo_gather_factored_base_offsets64_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSInfo("__pseudo_gather64_double", "__pseudo_gather_factored_base_offsets64_double", - "__pseudo_gather_factored_base_offsets32_double", true), + GSInfo("__pseudo_gather64_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i8" : + "__pseudo_gather_factored_base_offsets64_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSInfo("__pseudo_gather64_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i16" : + "__pseudo_gather_factored_base_offsets64_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSInfo("__pseudo_gather64_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i32" : + "__pseudo_gather_factored_base_offsets64_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSInfo("__pseudo_gather64_float", + g->target.hasGather ? "__pseudo_gather_base_offsets64_float" : + "__pseudo_gather_factored_base_offsets64_float", + g->target.hasGather ? 
"__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSInfo("__pseudo_gather64_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i64" : + "__pseudo_gather_factored_base_offsets64_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSInfo("__pseudo_gather64_double", + g->target.hasGather ? "__pseudo_gather_base_offsets64_double" : + "__pseudo_gather_factored_base_offsets64_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_factored_base_offsets64_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSInfo("__pseudo_scatter64_i16", "__pseudo_scatter_factored_base_offsets64_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_factored_base_offsets64_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_factored_base_offsets64_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_factored_base_offsets64_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_factored_base_offsets64_double", - "__pseudo_scatter_factored_base_offsets32_double", false), + GSInfo("__pseudo_scatter64_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i8" : + "__pseudo_scatter_factored_base_offsets64_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSInfo("__pseudo_scatter64_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i16" : + "__pseudo_scatter_factored_base_offsets64_i16", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSInfo("__pseudo_scatter64_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i32" : + "__pseudo_scatter_factored_base_offsets64_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSInfo("__pseudo_scatter64_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_float" : + "__pseudo_scatter_factored_base_offsets64_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSInfo("__pseudo_scatter64_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i64" : + "__pseudo_scatter_factored_base_offsets64_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSInfo("__pseudo_scatter64_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_double" : + "__pseudo_scatter_factored_base_offsets64_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1771,25 +1898,6 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { // to the next instruction... return false; - // Try to decompose the offset vector into a compile time constant - // component and a varying component. The constant component is - // passed as a separate parameter to the gather/scatter functions, - // which in turn allows their implementations to end up emitting - // x86 instructions with constant offsets encoded in them. 
- llvm::Value *constOffset, *variableOffset; - lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, - callInst); - if (constOffset == NULL) - constOffset = LLVMIntAsType(0, offsetVector->getType()); - if (variableOffset == NULL) - variableOffset = LLVMIntAsType(0, offsetVector->getType()); - - // See if the varying component is scaled by 2, 4, or 8. If so, - // extract that scale factor and rewrite variableOffset to remove - // it. (This also is pulled out so that we can match the scales by - // 2/4/8 offered by x86 addressing operators.) - llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); - // Cast the base pointer to a void *, since that's what the // __pseudo_*_base_offsets_* functions want. basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType, @@ -1798,43 +1906,107 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Function *gatherScatterFunc = info->baseOffsetsFunc; - // If we're doing 32-bit addressing on a 64-bit target, here we - // will see if we can call one of the 32-bit variants of the pseudo - // gather/scatter functions. - if (g->opt.force32BitAddressing && - lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) { - gatherScatterFunc = info->baseOffsets32Func; - } + if ((info->isGather == true && g->target.hasGather) || + (info->isGather == false && g->target.hasScatter)) { + // See if the offsets are scaled by 2, 4, or 8. If so, + // extract that scale factor and rewrite the offsets to remove + // it. + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector); - if (info->isGather) { - llvm::Value *mask = callInst->getArgOperand(1); + // If we're doing 32-bit addressing on a 64-bit target, here we + // will see if we can call one of the 32-bit variants of the pseudo + // gather/scatter functions. 
+ if (g->opt.force32BitAddressing && + lOffsets32BitSafe(&offsetVector, callInst)) { + gatherScatterFunc = info->baseOffsets32Func; + } - // Generate a new function call to the next pseudo gather - // base+offsets instruction. Note that we're passing a NULL - // llvm::Instruction to llvm::CallInst::Create; this means that - // the instruction isn't inserted into a basic block and that - // way we can then call ReplaceInstWithInst(). - llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, - constOffset, mask, callInst->getName().str().c_str(), - NULL); - lCopyMetadata(newCall, callInst); - llvm::ReplaceInstWithInst(callInst, newCall); + if (info->isGather) { + llvm::Value *mask = callInst->getArgOperand(1); + + // Generate a new function call to the next pseudo gather + // base+offsets instruction. Note that we're passing a NULL + // llvm::Instruction to llvm::CallInst::Create; this means that + // the instruction isn't inserted into a basic block and that + // way we can then call ReplaceInstWithInst(). + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, + mask, callInst->getName().str().c_str(), + NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } + else { + llvm::Value *storeValue = callInst->getArgOperand(1); + llvm::Value *mask = callInst->getArgOperand(2); + + // Generate a new function call to the next pseudo scatter + // base+offsets instruction. See above for why passing NULL + // for the Instruction * is intended. 
+ llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, offsetScale, + offsetVector, storeValue, mask, "", NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } } else { - llvm::Value *storeValue = callInst->getArgOperand(1); - llvm::Value *mask = callInst->getArgOperand(2); + // Try to decompose the offset vector into a compile time constant + // component and a varying component. The constant component is + // passed as a separate parameter to the gather/scatter functions, + // which in turn allows their implementations to end up emitting + // x86 instructions with constant offsets encoded in them. + llvm::Value *constOffset, *variableOffset; + lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, + callInst); + if (constOffset == NULL) + constOffset = LLVMIntAsType(0, offsetVector->getType()); + if (variableOffset == NULL) + variableOffset = LLVMIntAsType(0, offsetVector->getType()); - // Generate a new function call to the next pseudo scatter - // base+offsets instruction. See above for why passing NULL - // for the Instruction * is intended. - llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, - constOffset, storeValue, mask, "", NULL); - lCopyMetadata(newCall, callInst); - llvm::ReplaceInstWithInst(callInst, newCall); + // See if the varying component is scaled by 2, 4, or 8. If so, + // extract that scale factor and rewrite variableOffset to remove + // it. (This also is pulled out so that we can match the scales by + // 2/4/8 offered by x86 addressing operators.) + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); + + // If we're doing 32-bit addressing on a 64-bit target, here we + // will see if we can call one of the 32-bit variants of the pseudo + // gather/scatter functions. 
+ if (g->opt.force32BitAddressing && + lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) { + gatherScatterFunc = info->baseOffsets32Func; + } + + if (info->isGather) { + llvm::Value *mask = callInst->getArgOperand(1); + + // Generate a new function call to the next pseudo gather + // base+offsets instruction. Note that we're passing a NULL + // llvm::Instruction to llvm::CallInst::Create; this means that + // the instruction isn't inserted into a basic block and that + // way we can then call ReplaceInstWithInst(). + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, mask, callInst->getName().str().c_str(), + NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } + else { + llvm::Value *storeValue = callInst->getArgOperand(1); + llvm::Value *mask = callInst->getArgOperand(2); + + // Generate a new function call to the next pseudo scatter + // base+offsets instruction. See above for why passing NULL + // for the Instruction * is intended. 
+ llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, storeValue, mask, "", NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } } - return true; } @@ -1858,57 +2030,67 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { }; GSBOInfo gsFuncs[] = { - GSBOInfo("__pseudo_gather_factored_base_offsets32_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_double", - "__pseudo_gather_factored_base_offsets32_double", true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + g->target.hasGather ? 
"__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSBOInfo( "__pseudo_scatter_factored_base_offsets32_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_double", - "__pseudo_scatter_factored_base_offsets32_double", false), - - GSBOInfo( "__pseudo_gather_factored_base_offsets64_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_double", - "__pseudo_gather_factored_base_offsets32_double", 
true), - - GSBOInfo( "__pseudo_scatter_factored_base_offsets64_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_double", - "__pseudo_scatter_factored_base_offsets32_double", false), + GSBOInfo( g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1991,6 +2173,26 @@ lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, } +static llvm::Constant * +lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) { + llvm::ConstantInt *offsetScaleInt = + llvm::dyn_cast(offsetScale); + Assert(offsetScaleInt != NULL); + uint64_t scaleValue = offsetScaleInt->getZExtValue(); + + std::vector scales; + for (int i = 0; i < g->target.vectorWidth; ++i) { + if (vecType == LLVMTypes::Int64VectorType) + scales.push_back(LLVMInt64(scaleValue)); + else { + Assert(vecType == LLVMTypes::Int32VectorType); + scales.push_back(LLVMInt32((int32_t)scaleValue)); + } + } + return llvm::ConstantVector::get(scales); +} + + /** After earlier optimization passes have run, we are sometimes able to determine that gathers/scatters are actually accessing memory in a more regular fashion and then change the operation to something simpler and @@ -2011,7 +2213,7 @@ lGSToLoadStore(llvm::CallInst *callInst) { struct GatherImpInfo { GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st, int a) - : align(a) { + : align(a), isFactored(!g->target.hasGather) { pseudoFunc = m->module->getFunction(pName); loadMaskedFunc = m->module->getFunction(lmName); Assert(pseudoFunc != NULL && loadMaskedFunc != NULL); @@ -2022,39 +2224,52 @@ lGSToLoadStore(llvm::CallInst *callInst) { llvm::Function *loadMaskedFunc; llvm::Type *scalarType; const int align; + const bool isFactored; }; GatherImpInfo gInfo[] = { - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i8", "__masked_load_i8", - LLVMTypes::Int8Type, 1), - 
GatherImpInfo("__pseudo_gather_factored_base_offsets32_i16", "__masked_load_i16", - LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i32", "__masked_load_i32", - LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_float", "__masked_load_float", - LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i64", "__masked_load_i64", - LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_double", "__masked_load_double", - LLVMTypes::DoubleType, 8), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i8", "__masked_load_i8", - LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i16", "__masked_load_i16", - LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i32", "__masked_load_i32", - LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_float", "__masked_load_float", - LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i64", "__masked_load_i64", - LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_double", "__masked_load_double", - LLVMTypes::DoubleType, 8) + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + "__masked_load_i8", LLVMTypes::Int8Type, 1), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + "__masked_load_i16", LLVMTypes::Int16Type, 2), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + "__masked_load_i32", LLVMTypes::Int32Type, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + "__masked_load_float", LLVMTypes::FloatType, 4), + GatherImpInfo(g->target.hasGather ? 
"__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + "__masked_load_i64", LLVMTypes::Int64Type, 8), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + "__masked_load_double", LLVMTypes::DoubleType, 8), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i8" : + "__pseudo_gather_factored_base_offsets64_i8", + "__masked_load_i8", LLVMTypes::Int8Type, 1), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i16" : + "__pseudo_gather_factored_base_offsets64_i16", + "__masked_load_i16", LLVMTypes::Int16Type, 2), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i32" : + "__pseudo_gather_factored_base_offsets64_i32", + "__masked_load_i32", LLVMTypes::Int32Type, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_float" : + "__pseudo_gather_factored_base_offsets64_float", + "__masked_load_float", LLVMTypes::FloatType, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i64" : + "__pseudo_gather_factored_base_offsets64_i64", + "__masked_load_i64", LLVMTypes::Int64Type, 8), + GatherImpInfo(g->target.hasGather ? 
"__pseudo_gather_base_offsets64_double" : + "__pseudo_gather_factored_base_offsets64_double", + "__masked_load_double", LLVMTypes::DoubleType, 8), }; struct ScatterImpInfo { ScatterImpInfo(const char *pName, const char *msName, llvm::Type *vpt, int a) - : align(a) { + : align(a), isFactored(!g->target.hasScatter) { pseudoFunc = m->module->getFunction(pName); maskedStoreFunc = m->module->getFunction(msName); vecPtrType = vpt; @@ -2064,33 +2279,46 @@ lGSToLoadStore(llvm::CallInst *callInst) { llvm::Function *maskedStoreFunc; llvm::Type *vecPtrType; const int align; + const bool isFactored; }; ScatterImpInfo sInfo[] = { - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i8", "__pseudo_masked_store_i8", - LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i16", "__pseudo_masked_store_i16", - LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i32", "__pseudo_masked_store_i32", - LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_float", "__pseudo_masked_store_float", - LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i64", "__pseudo_masked_store_i64", - LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_double", "__pseudo_masked_store_double", - LLVMTypes::DoubleVectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i8", "__pseudo_masked_store_i8", - LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i16", "__pseudo_masked_store_i16", - LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i32", "__pseudo_masked_store_i32", - LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_float", "__pseudo_masked_store_float", - LLVMTypes::FloatVectorPointerType, 4), - 
ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i64", "__pseudo_masked_store_i64", - LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_double", "__pseudo_masked_store_double", - LLVMTypes::DoubleVectorPointerType, 8) + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i8" : + "__pseudo_scatter_factored_base_offsets64_i8", + "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i16" : + "__pseudo_scatter_factored_base_offsets64_i16", + "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), + ScatterImpInfo(g->target.hasScatter ? 
"__pseudo_scatter_base_offsets64_i32" : + "__pseudo_scatter_factored_base_offsets64_i32", + "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_float" : + "__pseudo_scatter_factored_base_offsets64_float", + "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i64" : + "__pseudo_scatter_factored_base_offsets64_i64", + "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_double" : + "__pseudo_scatter_factored_base_offsets64_double", + "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -2118,34 +2346,45 @@ lGSToLoadStore(llvm::CallInst *callInst) { lGetSourcePosFromMetadata(callInst, &pos); llvm::Value *base = callInst->getArgOperand(0); - llvm::Value *varyingOffsets = callInst->getArgOperand(1); - llvm::Value *offsetScale = callInst->getArgOperand(2); - llvm::Value *constOffsets = callInst->getArgOperand(3); - llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(4) : NULL; - llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 
4 : 5); + llvm::Value *fullOffsets = NULL; + llvm::Value *storeValue = NULL; + llvm::Value *mask = NULL; - // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets - llvm::ConstantInt *offsetScaleInt = - llvm::dyn_cast(offsetScale); - Assert(offsetScaleInt != NULL); - uint64_t scaleValue = offsetScaleInt->getZExtValue(); + if ((gatherInfo != NULL && gatherInfo->isFactored) || + (scatterInfo != NULL && scatterInfo->isFactored)) { + llvm::Value *varyingOffsets = callInst->getArgOperand(1); + llvm::Value *offsetScale = callInst->getArgOperand(2); + llvm::Value *constOffsets = callInst->getArgOperand(3); + if (scatterInfo) + storeValue = callInst->getArgOperand(4); + mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5); - std::vector scales; - for (int i = 0; i < g->target.vectorWidth; ++i) { - if (varyingOffsets->getType() == LLVMTypes::Int64VectorType) - scales.push_back(LLVMInt64(scaleValue)); - else - scales.push_back(LLVMInt32((int32_t)scaleValue)); + // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets + llvm::Constant *offsetScaleVec = + lGetOffsetScaleVec(offsetScale, varyingOffsets->getType()); + + llvm::Value *scaledVarying = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + varyingOffsets, "scaled_varying", callInst); + fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, + constOffsets, "varying+const_offsets", + callInst); } - llvm::Constant *offsetScaleVec = llvm::ConstantVector::get(scales); + else { + if (scatterInfo) + storeValue = callInst->getArgOperand(3); + mask = callInst->getArgOperand((gatherInfo != NULL) ? 
3 : 4); - llvm::Value *scaledVarying = - llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, - varyingOffsets, "scaled_varying", callInst); - llvm::Value *fullOffsets = - llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, - constOffsets, "varying+const_offsets", - callInst); + llvm::Value *offsetScale = callInst->getArgOperand(1); + llvm::Value *offsets = callInst->getArgOperand(2); + llvm::Value *offsetScaleVec = + lGetOffsetScaleVec(offsetScale, offsets->getType()); + + fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + offsets, "scaled_offsets", callInst); + } Debug(SourcePos(), "GSToLoadStore: %s.", fullOffsets->getName().str().c_str()); @@ -3631,7 +3870,6 @@ lReplacePseudoGS(llvm::CallInst *callInst) { : isGather(ig) { pseudoFunc = m->module->getFunction(pName); actualFunc = m->module->getFunction(aName); - Assert(pseudoFunc != NULL && actualFunc != NULL); } llvm::Function *pseudoFunc; llvm::Function *actualFunc; @@ -3639,20 +3877,6 @@ lReplacePseudoGS(llvm::CallInst *callInst) { }; LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true), - - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true), - 
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true), - LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true), @@ -3667,19 +3891,57 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i8", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double", false), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", + "__gather_factored_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", + "__gather_factored_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", + "__gather_factored_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", + "__gather_factored_base_offsets32_float", true), + 
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", + "__gather_factored_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", + "__gather_factored_base_offsets32_double", true), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double", false), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", + "__gather_factored_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", + "__gather_factored_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", + "__gather_factored_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", + "__gather_factored_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", + "__gather_factored_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", + "__gather_factored_base_offsets64_double", true), + + LowerGSInfo("__pseudo_gather_base_offsets32_i8", + "__gather_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i16", + "__gather_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i32", + "__gather_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets32_float", + "__gather_base_offsets32_float", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i64", + 
"__gather_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets32_double", + "__gather_base_offsets32_double", true), + + LowerGSInfo("__pseudo_gather_base_offsets64_i8", + "__gather_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i16", + "__gather_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i32", + "__gather_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets64_float", + "__gather_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i64", + "__gather_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets64_double", + "__gather_base_offsets64_double", true), LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), @@ -3694,6 +3956,59 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false), LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false), LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false), + + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", + "__scatter_factored_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", + "__scatter_factored_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", + "__scatter_factored_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", + "__scatter_factored_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", + "__scatter_factored_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", + "__scatter_factored_base_offsets32_double", false), + + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", + "__scatter_factored_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", + 
"__scatter_factored_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", + "__scatter_factored_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", + "__scatter_factored_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", + "__scatter_factored_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", + "__scatter_factored_base_offsets64_double", false), + + + LowerGSInfo("__pseudo_scatter_base_offsets32_i8", + "__scatter_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i16", + "__scatter_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i32", + "__scatter_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_float", + "__scatter_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i64", + "__scatter_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_double", + "__scatter_base_offsets32_double", false), + + LowerGSInfo("__pseudo_scatter_base_offsets64_i8", + "__scatter_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i16", + "__scatter_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i32", + "__scatter_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_float", + "__scatter_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i64", + "__scatter_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_double", + "__scatter_base_offsets64_double", false), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -3709,6 +4024,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) { if (info == NULL) return false; + Assert(info->actualFunc != NULL); // Get the source position from the metadata attached to the call // instruction so that we can issue PerformanceWarning()s 
below. @@ -3905,6 +4221,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double", + "__gather_base_offsets32_i8", "__gather_base_offsets32_i16", + "__gather_base_offsets32_i32", "__gather_base_offsets32_i64", + "__gather_base_offsets32_float", "__gather_base_offsets32_double", + "__gather_base_offsets64_i8", "__gather_base_offsets64_i16", + "__gather_base_offsets64_i32", "__gather_base_offsets64_i64", + "__gather_base_offsets64_float", "__gather_base_offsets64_double", "__gather32_i8", "__gather32_i16", "__gather32_i32", "__gather32_i64", "__gather32_float", "__gather32_double", @@ -3932,6 +4254,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double", + "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16", + "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64", + "__scatter_base_offsets32_float", "__scatter_base_offsets32_double", + "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16", + "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64", + "__scatter_base_offsets64_float", "__scatter_base_offsets64_double", "__scatter_elt32_i8", "__scatter_elt32_i16", "__scatter_elt32_i32", "__scatter_elt32_i64", "__scatter_elt32_float", "__scatter_elt32_double", From c09c87873eca8d2d8dffc96486af41dccaf2f50c Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 14:09:17 -0700 Subject: [PATCH 03/15] Whitespace / indentation fixes. 
--- builtins/util.m4 | 61 ++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 26466b1f..974c799c 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1949,41 +1949,41 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__usedouble( %pg64_d) %g32_8 = call @__gather32_i8( %v32, - %mask) + %mask) call void @__use8( %g32_8) %g32_16 = call @__gather32_i16( %v32, - %mask) + %mask) call void @__use16( %g32_16) %g32_32 = call @__gather32_i32( %v32, - %mask) + %mask) call void @__use32( %g32_32) %g32_f = call @__gather32_float( %v32, - %mask) + %mask) call void @__usefloat( %g32_f) %g32_64 = call @__gather32_i64( %v32, - %mask) + %mask) call void @__use64( %g32_64) %g32_d = call @__gather32_double( %v32, - %mask) + %mask) call void @__usedouble( %g32_d) %g64_8 = call @__gather64_i8( %v64, - %mask) + %mask) call void @__use8( %g64_8) %g64_16 = call @__gather64_i16( %v64, - %mask) + %mask) call void @__use16( %g64_16) %g64_32 = call @__gather64_i32( %v64, - %mask) + %mask) call void @__use32( %g64_32) %g64_f = call @__gather64_float( %v64, - %mask) + %mask) call void @__usefloat( %g64_f) %g64_64 = call @__gather64_i64( %v64, - %mask) + %mask) call void @__use64( %g64_64) %g64_d = call @__gather64_double( %v64, - %mask) + %mask) call void @__usedouble( %g64_d) ifelse(HAVE_GATHER, `1', @@ -2166,27 +2166,27 @@ ifelse(HAVE_GATHER, `1', %gbo64_8 = call @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__use8( %gbo64_8) %gbo64_16 = call @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__use16( %gbo64_16) %gbo64_32 = call @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__use32( %gbo64_32) %gbo64_f = call @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void 
@__usefloat( %gbo64_f) %gbo64_64 = call @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__use64( %gbo64_64) %gbo64_d = call @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__usedouble( %pgbo64_d) ') @@ -2303,35 +2303,36 @@ ifelse(HAVE_SCATTER, `1', %vd, %mask) call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, - %v8, %mask) + %v8, %mask) call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, - %v16, %mask) + %v16, %mask) call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, - %v32, %mask) + %v32, %mask) call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, - %vf, %mask) + %vf, %mask) call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, - %v64, %mask) + %v64, %mask) call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, - %vd, %mask) + %vd, %mask) call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, - %v8, %mask) + %v8, %mask) call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, - %v16, %mask) + %v16, %mask) call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, - %v32, %mask) + %v32, %mask) call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, - %vf, %mask) + %vf, %mask) call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, - %v64, %mask) + %v64, %mask) call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, - %vd, %mask) + %vd, %mask) ') ret void } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops From 216ac4b1a484653a56993dbb46ac233deb263134 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 14:52:14 -0700 Subject: [PATCH 04/15] Stop factoring out constant offsets for gather/scatter if instr is 
available. For KNC (gather/scatter), it's not helpful to factor base+offsets gathers and scatters into base_ptr + {1/2/4/8} * varying_offsets + const_offsets. Now, if a HW instruction is available for gather/scatter, we just factor into base + {1/2/4/8} * offsets (if possible). Not only is this simpler, but it's also what we need to pass a value along to the scale by 2/4/8 available directly in those instructions. Finishes issue #325. --- builtins/target-generic-common.ll | 19 ++- examples/intrinsics/generic-16.h | 69 ++++---- examples/intrinsics/generic-32.h | 68 ++++---- examples/intrinsics/generic-64.h | 74 ++++----- examples/intrinsics/knc.h | 100 ++++-------- examples/intrinsics/sse4.h | 258 +++++++++++++----------------- ispc.cpp | 5 + 7 files changed, 257 insertions(+), 336 deletions(-) diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index c54dd948..77c7aabe 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -32,6 +32,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32"; define(`MASK',`i1') +define(`HAVE_GATHER',`1') +define(`HAVE_SCATTER',`1') + include(`util.m4') stdlib_core() @@ -334,19 +337,19 @@ define void @__masked_store_blend_double(* nocapture, ;; gather/scatter define(`gather_scatter', ` -declare @__gather_factored_base_offsets32_$1(i8 * nocapture, , - i32, , ) nounwind readonly -declare @__gather_factored_base_offsets64_$1(i8 * nocapture, , - i32, , ) nounwind readonly +declare @__gather_base_offsets32_$1(i8 * nocapture, i32, , + ) nounwind readonly +declare @__gather_base_offsets64_$1(i8 * nocapture, i32, , + ) nounwind readonly declare @__gather32_$1(, ) nounwind readonly declare @__gather64_$1(, ) nounwind readonly -declare void @__scatter_factored_base_offsets32_$1(i8* nocapture, , - i32, , , ) nounwind -declare void 
@__scatter_factored_base_offsets64_$1(i8* nocapture, , - i32, , , ) nounwind +declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, , + , ) nounwind +declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, , + , ) nounwind declare void @__scatter32_$1(, , ) nounwind declare void @__scatter64_$1(, , diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index c18e9fbe..42978701 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1306,34 +1306,32 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec16_i1 mask) { \ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ } -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_factored_base_offsets32_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, 
__gather_factored_base_offsets32_float) -GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_factored_base_offsets64_float) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_factored_base_offsets32_double) -GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_factored_base_offsets64_double) +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ @@ -1361,32 +1359,31 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double) // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec16_i1 mask) { \ 
+#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_factored_base_offsets32_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_factored_base_offsets32_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_factored_base_offsets64_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_factored_base_offsets32_double) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_factored_base_offsets64_double) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, 
__vec16_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index c1f89cd8..94946f4a 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1374,34 +1374,32 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec32_i1 mask) { \ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec32_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 32; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ } -GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_factored_base_offsets32_i8) 
-GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_factored_base_offsets32_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_factored_base_offsets32_float) -GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_factored_base_offsets64_float) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_factored_base_offsets64_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_factored_base_offsets32_double) -GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_factored_base_offsets64_double) +GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32) +GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_base_offsets32_float) +GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_base_offsets64_float) +GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64) +GATHER_BASE_OFFSETS(__vec32_d, 
double, __vec32_i32, __gather_base_offsets32_double) +GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \ @@ -1429,32 +1427,30 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double) // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec32_i1 mask) { \ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, __vec32_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 32; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_factored_base_offsets32_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_factored_base_offsets32_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_factored_base_offsets64_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, 
__vec32_i64, __scatter_factored_base_offsets64_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_factored_base_offsets32_double) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_factored_base_offsets64_double) +SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_base_offsets32_float) +SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_base_offsets64_float) +SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64) +SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_base_offsets32_double) +SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \ diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 2a54446e..ff84fee3 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1507,40 +1507,38 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec64_i1 mask) { \ +#define 
GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec64_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + if ((mask.v & (1ull << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ } -GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_factored_base_offsets32_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_factored_base_offsets32_float) -GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_factored_base_offsets64_float) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_factored_base_offsets64_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_factored_base_offsets32_double) -GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_factored_base_offsets64_double) +GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec64_i16, 
int16_t, __vec64_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32) +GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float) +GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float) +GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64) +GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double) +GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \ VTYPE ret; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ + if ((mask.v & (1ull << i)) != 0) { \ STYPE *ptr = (STYPE *)ptrs.v[i]; \ ret.v[i] = *ptr; \ } \ @@ -1562,32 +1560,30 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double) // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec64_i1 mask) { \ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, __vec64_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + if ((mask.v & (1ull << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, 
__scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_factored_base_offsets32_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_factored_base_offsets32_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_factored_base_offsets64_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_factored_base_offsets64_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_factored_base_offsets32_double) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_factored_base_offsets64_double) +SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float) +SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float) +SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64) +SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, 
__scatter_base_offsets32_double) +SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) { \ diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index fb11db11..a0331afb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1940,60 +1940,33 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) -/* -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec16_i1 mask) { \ - VTYPE ret; \ - int8_t *base = (int8_t *)b; \ - for (int i = 0; i < 16; ++i) \ - if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ - ret.v[i] = *ptr; \ - } \ - return ret; \ -} -*/ - static FORCEINLINE __vec16_i32 -__gather_factored_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, - __vec16_i1 mask) { - __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); - __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset); - __vec16_i32 tmp; - +__gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, + __vec16_i1 mask) { // Loop is generated by intrinsic __vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base, - _MM_UPCONV_EPI32_NONE, 1, + _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); return ret; } static FORCEINLINE __vec16_f -__gather_factored_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, +__gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - __vec16_i32 vscale = 
_mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); - __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset); - __vec16_f tmp; - // Loop is generated by intrinsic - __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, - _MM_UPCONV_PS_NONE, 1, + __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, + _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); return ret; } -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) +//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /* @@ -2039,45 +2012,30 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) */ // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, 
FUNC) -/* -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec16_i1 mask) { \ - int8_t *base = (int8_t *)b; \ - for (int i = 0; i < 16; ++i) \ - if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ - *ptr = val.v[i]; \ - } \ -} -*/ - -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) +//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) static FORCEINLINE void -__scatter_factored_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, +__scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { - __vec16_i32 
offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset); - _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE void -__scatter_factored_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset, - uint32_t scale, const __vec16_i32 &constOffset, - const __vec16_f &val, const __vec16_i1 mask) +__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) { - __vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset); - _mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); } #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 088b694d..17ab8f18 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2892,54 +2892,53 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val, template static FORCEINLINE RetVec -lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { +lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale, + __vec4_i32 offsets, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... 
offsets = __select(mask, offsets, __setzero_i32()); - constOffset = __select(mask, constOffset, __setzero_i32()); - int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); + offset = scale * _mm_extract_epi32(offsets.v, 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); + offset = scale * _mm_extract_epi32(offsets.v, 2); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); + offset = scale * _mm_extract_epi32(offsets.v, 3); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); + int offset = scale * _mm_extract_epi32(offsets.v, 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); + int offset = scale * _mm_extract_epi32(offsets.v, 2); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); + int offset = scale * _mm_extract_epi32(offsets.v, 3); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -2950,54 +2949,53 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, 
__vec4_i32 offsets, template static FORCEINLINE RetVec -lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { +lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale, + __vec4_i64 offsets, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... offsets = __select(mask, offsets, __setzero_i64()); - constOffset = __select(mask, constOffset, __setzero_i64()); - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); + offset = scale * _mm_extract_epi64(offsets.v[0], 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); + offset = scale * _mm_extract_epi64(offsets.v[1], 0); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); + offset = scale * _mm_extract_epi64(offsets.v[1], 1); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); + 
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -3007,87 +3005,75 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, } static FORCEINLINE __vec4_i8 -__gather_factored_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i8(unsigned char *b, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i8 -__gather_factored_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i8(unsigned char *b, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i16 -__gather_factored_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i16(unsigned char *b, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i16 - __gather_factored_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - 
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i16(unsigned char *b, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i32 -__gather_factored_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, - __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i32(uint8_t *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i32 -__gather_factored_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i32(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_f -__gather_factored_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale, - __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_float(uint8_t *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_f(), float(), p, scale, offsets, mask); } static FORCEINLINE __vec4_f -__gather_factored_base_offsets64_float(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_float(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return 
lGatherBaseOffsets64(__vec4_f(), float(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i64 -__gather_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i64 -__gather_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_d -__gather_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_d(), double(), p, scale, offsets, mask); } static FORCEINLINE __vec4_d -__gather_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_d(), double(), p, scale, offsets, mask); } template @@ -3252,63 +3238,55 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) { #define 
SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \ static FORCEINLINE void \ -__scatter_factored_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ - uint32_t scale, __vec4_i32 constOffset, \ +__scatter_base_offsets32_##SUFFIX (unsigned char *b, uint32_t scale, \ + __vec4_i32 offsets, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \ - _mm_extract_epi32(constOffset.v, 0)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0)); \ *ptr = EXTRACT(val.v, 0); \ } \ m = _mm_extract_ps(mask.v, 1); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \ - _mm_extract_epi32(constOffset.v, 1)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1)); \ *ptr = EXTRACT(val.v, 1); \ } \ m = _mm_extract_ps(mask.v, 2); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \ - _mm_extract_epi32(constOffset.v, 2)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2)); \ *ptr = EXTRACT(val.v, 2); \ } \ m = _mm_extract_ps(mask.v, 3); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \ - _mm_extract_epi32(constOffset.v, 3)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3)); \ *ptr = EXTRACT(val.v, 3); \ } \ } \ -static FORCEINLINE void \ -__scatter_factored_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ - uint32_t scale, __vec4_i64 constOffset, \ +static FORCEINLINE void \ +__scatter_base_offsets64_##SUFFIX(unsigned char *p, uint32_t scale, \ + __vec4_i64 offsets, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \ - _mm_extract_epi64(constOffset.v[0], 0); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); \ TYPE *ptr = (TYPE *)(p + 
offset); \ *ptr = EXTRACT(val.v, 0); \ } \ m = _mm_extract_ps(mask.v, 1); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \ - _mm_extract_epi64(constOffset.v[0], 1); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 1); \ } \ m = _mm_extract_ps(mask.v, 2); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \ - _mm_extract_epi64(constOffset.v[1], 0); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 2); \ } \ m = _mm_extract_ps(mask.v, 3); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \ - _mm_extract_epi64(constOffset.v[1], 1); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 3); \ } \ @@ -3322,91 +3300,79 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float) static FORCEINLINE void -__scatter_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val, - __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) + - _mm_extract_epi32(constOffset.v, 0); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[0], 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) + - _mm_extract_epi32(constOffset.v, 1); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[0], 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) + - _mm_extract_epi32(constOffset.v, 2); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[1], 0); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int32_t offset = scale * 
_mm_extract_epi32(offsets.v, 3) + - _mm_extract_epi32(constOffset.v, 3); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[1], 1); - } -} - -static FORCEINLINE void -__scatter_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, +__scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets, __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + - _mm_extract_epi64(constOffset.v[0], 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + - _mm_extract_epi64(constOffset.v[0], 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + - _mm_extract_epi64(constOffset.v[1], 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 2); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 0); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + - _mm_extract_epi64(constOffset.v[1], 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 3); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 1); } } static FORCEINLINE void -__scatter_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_d val, - __vec4_i1 mask) { - __scatter_factored_base_offsets32_i64(p, offsets, scale, constOffset, val, mask); +__scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i64 val, __vec4_i1 mask) { 
+ uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 0); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 1); + } } static FORCEINLINE void -__scatter_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_d val, - __vec4_i1 mask) { - __scatter_factored_base_offsets64_i64(p, offsets, scale, constOffset, val, mask); +__scatter_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_d val, __vec4_i1 mask) { + __scatter_base_offsets32_i64(p, scale, offsets, val, mask); +} + +static FORCEINLINE void +__scatter_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_d val, __vec4_i1 mask) { + __scatter_base_offsets64_i64(p, scale, offsets, val, mask); } diff --git a/ispc.cpp b/ispc.cpp index 0980c3d2..8fb8f0f5 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -254,6 +254,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-8")) { t->isa = Target::GENERIC; @@ -263,6 +264,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals 
= true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-16")) { t->isa = Target::GENERIC; @@ -272,6 +274,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-32")) { t->isa = Target::GENERIC; @@ -281,6 +284,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-64")) { t->isa = Target::GENERIC; @@ -290,6 +294,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-1")) { t->isa = Target::GENERIC; From df18b2a150ce72225733428bad9eb02d917c46f3 Mon Sep 17 00:00:00 2001 From: Jean-Luc Duprat Date: Wed, 11 Jul 2012 15:43:11 -0700 Subject: [PATCH 05/15] Fixed missing tmp var needed for use with gather intrinsic --- examples/intrinsics/knc.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index a0331afb..0cfb3d31 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1589,11 +1589,6 @@ CAST_BITS_SCALAR(double, int64_t) /////////////////////////////////////////////////////////////////////////// // various math functions -/* -static FORCEINLINE void __fastmath() { -} -*/ - static FORCEINLINE float __round_uniform_float(float v) { return roundf(v); } @@ -1943,7 +1938,7 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - // Loop is generated by intrinsic + __vec16_i32 tmp = _mm512_undefined_epi32(); 
__vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); @@ -1953,7 +1948,7 @@ __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - // Loop is generated by intrinsic + __vec16_f tmp = _mm512_undefined_ps(); __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); From 2bacebb1fb792fa04fd99e65033e24453527f62f Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 19:51:28 -0700 Subject: [PATCH 06/15] Doc fixes (Crystal Lemire). --- docs/ispc.rst | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 7e671dbd..f1f959c9 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -2084,7 +2084,7 @@ can be declared: soa<8> Point pts[...]; -The in-memory layout of the ``Point``s has had the SOA transformation +The in-memory layout of the ``Point`` instances has had the SOA transformation applied, such that there are 8 ``x`` values in memory followed by 8 ``y`` values, and so forth. Here is the effective declaration of ``soa<8> Point``: @@ -2266,7 +2266,7 @@ based on C++'s ``new`` and ``delete`` operators: // use ptr... delete[] ptr; -In the above code, each program instance allocates its own ``count`-sized +In the above code, each program instance allocates its own ``count`` sized array of ``uniform int`` values, uses that memory, and then deallocates that memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are serviced by corresponding calls the system C library's ``malloc()`` and @@ -2277,9 +2277,7 @@ analogous to the corresponding rules are for pointers (as described in `Pointer Types`_.) 
Specifically, if a specific rate qualifier isn't provided with the ``new`` expression, then the default is that a "varying" ``new`` is performed, where each program instance performs a unique -allocation. The allocated type, in turn, is by default ``uniform`` for -``varying`` ``new`` expressions, and ``varying`` for ``uniform`` new -expressions. +allocation. The allocated type, in turn, is by default ``uniform``. After a pointer has been deleted, it is illegal to access the memory it points to. However, that deletion happens on a per-program-instance basis. @@ -3491,7 +3489,7 @@ generates the following output on a four-wide compilation target: :: i = 10, x = [0.000000,1.000000,2.000000,3.000000] - added to x = [1.000000,2.000000,((2.000000)),((3.000000)] + added to x = [1.000000,2.000000,((2.000000)),((3.000000))] last print of x = [1.000000,2.000000,2.000000,3.000000] When a varying variable is printed, the values for program instances that @@ -4010,8 +4008,8 @@ Systems Programming Support Atomic Operations and Memory Fences ----------------------------------- -The standard range of atomic memory operations are provided by the standard -library``ispc``, including variants to handle both uniform and varying +The standard set of atomic memory operations are provided by the standard +library, including variants to handle both uniform and varying types as well as "local" and "global" atomics. Local atomics provide atomic behavior across the program instances in a From 2c640f7e522c87808d920320221ee331cf54ad71 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 12 Jul 2012 06:07:07 -0700 Subject: [PATCH 07/15] Add support for RDRAND in IvyBridge. The standard library now provides a variety of rdrand() functions that call out to RDRAND, when available. Issue #263. 
--- builtins.cpp | 3 + builtins/target-avx1-x2.ll | 2 + builtins/target-avx1.ll | 2 + builtins/target-avx11-x2.ll | 41 ++++++- builtins/target-avx11.ll | 41 ++++++- builtins/target-avx2-x2.ll | 4 +- builtins/target-avx2.ll | 4 +- builtins/target-generic-common.ll | 1 + builtins/target-sse2-common.ll | 1 + builtins/target-sse4-common.ll | 1 + builtins/util.m4 | 45 ++++++++ docs/ispc.rst | 35 ++++++ stdlib.ispc | 185 ++++++++++++++++++++++++++++++ tests/rdrand-1.ispc | 21 ++++ tests/rdrand-2.ispc | 19 +++ tests/rdrand-3.ispc | 25 ++++ tests/rdrand-4.ispc | 33 ++++++ tests/rdrand-5.ispc | 33 ++++++ tests/rdrand-6.ispc | 35 ++++++ 19 files changed, 525 insertions(+), 6 deletions(-) create mode 100644 tests/rdrand-1.ispc create mode 100644 tests/rdrand-2.ispc create mode 100644 tests/rdrand-3.ispc create mode 100644 tests/rdrand-4.ispc create mode 100644 tests/rdrand-5.ispc create mode 100644 tests/rdrand-6.ispc diff --git a/builtins.cpp b/builtins.cpp index 00f72fc8..64f06e1f 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -476,6 +476,9 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_nt", "__rcp_uniform_float", "__rcp_varying_float", + "__rdrand_i16", + "__rdrand_i32", + "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", "__reduce_add_int32", diff --git a/builtins/target-avx1-x2.ll b/builtins/target-avx1-x2.ll index efde5d10..e06134d9 100644 --- a/builtins/target-avx1-x2.ll +++ b/builtins/target-avx1-x2.ll @@ -31,6 +31,8 @@ include(`target-avx-x2.ll') +rdrand_decls() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index 64f8ad33..1b47955a 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -31,6 +31,8 @@ include(`target-avx.ll') +rdrand_decls() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll 
index 884255df..cdb83726 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -29,9 +29,46 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -define(`NO_HALF_DECLARES', `1') +include(`target-avx-x2.ll') -include(`target-avx1-x2.ll') +rdrand_definition() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %ret +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %ret +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index 35aebe91..d3ab9f13 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -29,9 +29,46 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-define(`NO_HALF_DECLARES', `1') +include(`target-avx.ll') -include(`target-avx1.ll') +rdrand_definition() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %ret +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %ret +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 1ca3443c..1d2a2093 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. 
;; ;; Redistribution and use in source and binary forms, with or without @@ -31,6 +31,8 @@ include(`target-avx-x2.ll') +rdrand_definition() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index 7152657e..45496779 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -31,6 +31,8 @@ include(`target-avx.ll') +rdrand_definition() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 77c7aabe..7b4cfd9c 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -40,6 +40,7 @@ include(`util.m4') stdlib_core() scans() reduce_equal(WIDTH) +rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; broadcast/rotate/shuffle diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index e0b7f40c..c6a3afe2 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -33,6 +33,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() +rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 69461fcd..4b8751b5 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -33,6 +33,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() +rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/util.m4 b/builtins/util.m4 index 974c799c..614ac998 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ 
-3712,3 +3712,48 @@ define void @__scatter64_$1( %ptrs, %values, ' ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +define(`rdrand_decls', ` +declare i1 @__rdrand_i16(i16 * nocapture) +declare i1 @__rdrand_i32(i32 * nocapture) +declare i1 @__rdrand_i64(i64 * nocapture) +') + +define(`rdrand_definition', ` +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +declare {i16, i32} @llvm.x86.rdrand.16() +declare {i32, i32} @llvm.x86.rdrand.32() +declare {i64, i32} @llvm.x86.rdrand.64() + +define i1 @__rdrand_i16(i16 * %ptr) { + %v = call {i16, i32} @llvm.x86.rdrand.16() + %v0 = extractvalue {i16, i32} %v, 0 + %v1 = extractvalue {i16, i32} %v, 1 + store i16 %v0, i16 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i32(i32 * %ptr) { + %v = call {i32, i32} @llvm.x86.rdrand.32() + %v0 = extractvalue {i32, i32} %v, 0 + %v1 = extractvalue {i32, i32} %v, 1 + store i32 %v0, i32 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i64(i64 * %ptr) { + %v = call {i64, i32} @llvm.x86.rdrand.64() + %v0 = extractvalue {i64, i32} %v, 0 + %v1 = extractvalue {i64, i32} %v, 1 + store i64 %v0, i64 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} +') diff --git a/docs/ispc.rst b/docs/ispc.rst index f1f959c9..98250e39 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -140,6 +140,7 @@ Contents: * `Basic Math Functions`_ * `Transcendental Functions`_ * `Pseudo-Random Numbers`_ + * `Random Numbers`_ + `Output Functions`_ + `Assertions`_ @@ -3455,6 +3456,40 @@ be used to get a pseudo-random ``float`` value. uniform unsigned int32 random(RNGState * uniform state) uniform float frandom(uniform RNGState * uniform state) + +Random Numbers +-------------- + +Some recent CPUs (including those based on the Intel(r) Ivy Bridge +micro-architecture), provide support for generating true random numbers. 
A +few standard library functions make this functionality available: + +:: + + bool rdrand(uniform int32 * uniform ptr) + bool rdrand(varying int32 * uniform ptr) + bool rdrand(uniform int32 * varying ptr) + +If the processor doesn't have sufficient entropy to generate a random +number, then this function fails and returns ``false``. Otherwise, if the +processor is successful, the random value is stored in the given pointer +and ``true`` is returned. Therefore, this function should generally be +used as follows, called repeatedly until it is successful: + +:: + + int r; + while (rdrand(&r) == false) + ; // empty loop body + + +In addition to the ``int32`` variants of ``rdrand()`` listed above, there +are versions that return ``int16``, ``float``, and ``int64`` values as +well. + +Note that when compiling to targets other than ``avx1.1`` and ``avx2``, the +``rdrand()`` functions always return ``false``. + Output Functions ---------------- diff --git a/stdlib.ispc b/stdlib.ispc index a7499930..3774c4a4 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4068,3 +4068,188 @@ static inline void seed_rng(uniform RNGState * uniform state, static inline void fastmath() { __fastmath(); } + +/////////////////////////////////////////////////////////////////////////// +// rdrand + +static inline uniform bool rdrand(float * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + uniform int32 irand; + uniform bool success = __rdrand_i32(&irand); + if (success) { + irand &= (1<<23)-1; + *ptr = floatbits(0x3F800000 | irand)-1.0f; + } + return success; + } +} + +static inline bool rdrand(varying float * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + bool success = false; + foreach_active (index) { + uniform int32 irand; + if (__rdrand_i32(&irand)) { + // FIXME: it probably would be preferable, here and in the + // following rdrand() function, to do the int->float stuff + // in vector form. 
However, we need to be careful to not + // clobber any existing already-set values in *ptr with + // inactive lanes here... + irand &= (1<<23)-1; + *ptr = floatbits(0x3F800000 | irand)-1.0f; + success = true; + } + } + return success; + } +} + +static inline bool rdrand(float * ptr) { + if (__have_native_rand == false) + return false; + else { + float * uniform ptrs[programCount]; + ptrs[programIndex] = ptr; + + bool success = false; + foreach_active (index) { + uniform int32 irand; + if (__rdrand_i32(&irand)) { + irand &= (1<<23)-1; + *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f; + success = true; + } + } + return success; + } +} + +static inline uniform bool rdrand(int16 * uniform ptr) { + if (__have_native_rand == false) + return false; + else + return __rdrand_i16(ptr); +} + +static inline bool rdrand(varying int16 * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + bool success = false; + foreach_active (index) { + uniform int16 irand; + if (__rdrand_i16(&irand)) { + *ptr = irand; + success = true; + } + } + return success; + } +} + +static inline bool rdrand(int16 * ptr) { + if (__have_native_rand == false) + return false; + else { + int16 * uniform ptrs[programCount]; + ptrs[programIndex] = ptr; + bool success = false; + + foreach_active (index) { + uniform int16 irand; + if (__rdrand_i16(&irand)) { + *ptrs[index] = irand; + success = true; + } + } + return success; + } +} + +static inline uniform bool rdrand(int32 * uniform ptr) { + if (__have_native_rand == false) + return false; + else + return __rdrand_i32(ptr); +} + +static inline bool rdrand(varying int32 * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + bool success = false; + foreach_active (index) { + uniform int32 irand; + if (__rdrand_i32(&irand)) { + *ptr = irand; + success = true; + } + } + return success; + } +} + +static inline bool rdrand(int32 * ptr) { + if (__have_native_rand == false) + return false; + else { + int32 * uniform 
ptrs[programCount]; + ptrs[programIndex] = ptr; + bool success = false; + + foreach_active (index) { + uniform int32 irand; + if (__rdrand_i32(&irand)) { + *ptrs[index] = irand; + success = true; + } + } + return success; + } +} + +static inline uniform bool rdrand(int64 * uniform ptr) { + if (__have_native_rand == false) + return false; + else + return __rdrand_i64(ptr); +} + +static inline bool rdrand(varying int64 * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + bool success = false; + foreach_active (index) { + uniform int64 irand; + if (__rdrand_i64(&irand)) { + *ptr = irand; + success = true; + } + } + return success; + } +} + +static inline bool rdrand(int64 * ptr) { + if (__have_native_rand == false) + return false; + else { + int64 * uniform ptrs[programCount]; + ptrs[programIndex] = ptr; + bool success = false; + + foreach_active (index) { + uniform int64 irand; + if (__rdrand_i64(&irand)) { + *ptrs[index] = irand; + success = true; + } + } + return success; + } +} diff --git a/tests/rdrand-1.ispc b/tests/rdrand-1.ispc new file mode 100644 index 00000000..53ca6121 --- /dev/null +++ b/tests/rdrand-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 1; +#else + + uniform float r = -1; + uniform int count = 0; + while (!rdrand(&r)) { + ++count; + } + RET[programIndex] = (r >= 0 && r < 1); + +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/rdrand-2.ispc b/tests/rdrand-2.ispc new file mode 100644 index 00000000..7021a271 --- /dev/null +++ b/tests/rdrand-2.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 1; +#else + + float r = -1; 
+ while (!rdrand(&r)) + ; + RET[programIndex] = (r >= 0 && r < 1); + +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/rdrand-3.ispc b/tests/rdrand-3.ispc new file mode 100644 index 00000000..a9fc93a3 --- /dev/null +++ b/tests/rdrand-3.ispc @@ -0,0 +1,25 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 1; +#else + + int lessHalf = 0, moreHalf = 0; + for (uniform int i = 0; i < 1024*1024; ++i) { + float r = -1; + while (!rdrand(&r)) + ; + if (r < 0.5) ++lessHalf; + else ++moreHalf; + } + + float r = (double)lessHalf / (double)(lessHalf + moreHalf); + RET[programIndex] = (r >= .49 && r < .51); +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/rdrand-4.ispc b/tests/rdrand-4.ispc new file mode 100644 index 00000000..3b38b7b1 --- /dev/null +++ b/tests/rdrand-4.ispc @@ -0,0 +1,33 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 0; +#else + + uniform int set[64] = { 0 }; + uniform int count = 1024*1024; + for (uniform int i = 0; i < count; ++i) { + uniform int64 r; + while (!rdrand(&r)) + ; + for (uniform int b = 0; b < 64; ++b) + if (((unsigned int64)r >> b) & 1) + ++set[b]; + } + + RET[programIndex] = 0; + for (uniform int b = 0; b < 64; ++b) { + float r = (double)set[b] / (double)(count); + if (!(r >= .49 && r < .51)) { + print("% % - %\n", b, r, set[b]); + ++RET[programIndex]; + } + } +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/rdrand-5.ispc b/tests/rdrand-5.ispc new file mode 100644 index 00000000..cbf59a97 --- /dev/null +++ b/tests/rdrand-5.ispc @@ -0,0 +1,33 @@ + +export uniform int width() { 
return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 0; +#else + + int set[32] = { 0 }; + uniform int count = 1024*1024; + for (uniform int i = 0; i < count; ++i) { + int32 r; + while (!rdrand(&r)) + ; + for (uniform int b = 0; b < 32; ++b) + if (((unsigned int32)r >> b) & 1) + ++set[b]; + } + + RET[programIndex] = 0; + for (uniform int b = 0; b < 32; ++b) { + float r = (double)set[b] / (double)(count); + if (!(r >= .49 && r < .51)) { + print("% % - %\n", b, r, set[b]); + ++RET[programIndex]; + } + } +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/rdrand-6.ispc b/tests/rdrand-6.ispc new file mode 100644 index 00000000..93137625 --- /dev/null +++ b/tests/rdrand-6.ispc @@ -0,0 +1,35 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 0; +#else + + int set[32] = { 0 }; + uniform int count = 1024*1024; + for (uniform int i = 0; i < count; ++i) { + uniform int32 rr[programCount]; + int * ptr = rr + programIndex; + while (!rdrand(ptr)) + ; + int32 r = rr[programIndex]; + for (uniform int b = 0; b < 32; ++b) + if (((unsigned int32)r >> b) & 1) + ++set[b]; + } + + RET[programIndex] = 0; + for (uniform int b = 0; b < 32; ++b) { + float r = (double)set[b] / (double)(count); + if (!(r >= .49 && r < .51)) { + print("% % - %\n", b, r, set[b]); + ++RET[programIndex]; + } + } +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} From e09e953bbbc55809fc4358f7194b3f85d52192fc Mon Sep 17 00:00:00 2001 From: Jean-Luc Duprat Date: Thu, 12 Jul 2012 10:32:38 -0700 Subject: [PATCH 08/15] Added a few functions: __setzero_i64() __cast_sext(__vec16_i64, __vec16_i32), __cast_zext(__vec16_i32) __min_varying_in32(), __min_varying_uint32(), 
__max_varying_int32(), __max_varying_uint32() Fixed the signature of __smear_i64() to match current codegen --- examples/intrinsics/knc.h | 59 +++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 0cfb3d31..404cd24f 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -803,6 +803,13 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { // int64 +static FORCEINLINE __vec16_i64 __setzero_i64() { + __vec16_i64 ret; + ret.v_lo = _mm512_setzero_epi32(); + ret.v_hi = _mm512_setzero_epi32(); + return ret; +} + static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b) { __mmask16 carry = 0; @@ -878,7 +885,7 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index) return src[index+16] | (int64_t(src[index]) << 32); } -static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) { +static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) { const int *i = (const int*)&l; return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1])); } @@ -1373,6 +1380,11 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) +static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); +} + #define CAST_SEXT_I1(TYPE) /* static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ @@ -1389,11 +1401,6 @@ CAST_SEXT_I1(__vec16_i8) CAST_SEXT_I1(__vec16_i16) CAST_SEXT_I1(__vec16_i32) -static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) -{ - return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); -} - // zero extension CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) CAST(__vec16_i64, uint64_t, 
__vec16_i16, uint16_t, __cast_zext) @@ -1421,6 +1428,14 @@ CAST_ZEXT_I1(__vec16_i16) CAST_ZEXT_I1(__vec16_i32) CAST_ZEXT_I1(__vec16_i64) +static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val.m, one); +} + + // truncations CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) @@ -1654,14 +1669,25 @@ static FORCEINLINE __vec16_f __min_varying_float(__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2); } +static FORCEINLINE __vec16_i32 __max_varying_int32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_max_epi32(v1, v2); +} + +static FORCEINLINE __vec16_i32 __min_varying_int32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_min_epi32(v1, v2); +} + +static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_max_epu32(v1, v2); +} + +static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_min_epu32(v1, v2); +} + BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double) BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double) -BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32) - BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64) @@ -2033,6 +2059,17 @@ __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, _MM_HINT_NONE); } +/* +static FORCEINLINE void +__scatter_base_offsets64_float(void *base, const 
__vec16_i64 &varyingOffset, + uint32_t scale, const __vec16_i64 &constOffset, + const __vec16_f &val, const __vec16_i1 mask) +{ + __vec16_i64 offsets = __add(__mul(varyingOffset,__vec16_i64(scale)), constOffset); + _mm512_mask_i64extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE); +} +*/ + #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /* static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ From d180031ef0014611f17692935da36bdd75ad4315 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 12 Jul 2012 13:56:58 -0700 Subject: [PATCH 09/15] Add more tests of basic gather functionality. --- tests/gather-double-1.ispc | 17 +++++++++++++++++ tests/gather-double-2.ispc | 17 +++++++++++++++++ tests/gather-double-3.ispc | 21 +++++++++++++++++++++ tests/gather-double-4.ispc | 21 +++++++++++++++++++++ tests/gather-double-5.ispc | 19 +++++++++++++++++++ tests/gather-double-6.ispc | 19 +++++++++++++++++++ tests/gather-double-7.ispc | 23 +++++++++++++++++++++++ tests/gather-double-8.ispc | 23 +++++++++++++++++++++++ tests/gather-float-1.ispc | 17 +++++++++++++++++ tests/gather-float-2.ispc | 17 +++++++++++++++++ tests/gather-float-3.ispc | 21 +++++++++++++++++++++ tests/gather-float-4.ispc | 21 +++++++++++++++++++++ tests/gather-float-5.ispc | 19 +++++++++++++++++++ tests/gather-float-6.ispc | 19 +++++++++++++++++++ tests/gather-float-7.ispc | 23 +++++++++++++++++++++++ tests/gather-float-8.ispc | 23 +++++++++++++++++++++++ tests/gather-int16-1.ispc | 24 +++++++++++------------- tests/gather-int16-2.ispc | 17 +++++++++++++++++ tests/gather-int16-3.ispc | 21 +++++++++++++++++++++ tests/gather-int16-4.ispc | 21 +++++++++++++++++++++ tests/gather-int16-5.ispc | 19 +++++++++++++++++++ tests/gather-int16-6.ispc | 19 +++++++++++++++++++ tests/gather-int16-7.ispc | 23 +++++++++++++++++++++++ tests/gather-int16-8.ispc | 23 +++++++++++++++++++++++ tests/gather-int32-1.ispc | 17 +++++++++++++++++ tests/gather-int32-2.ispc | 
17 +++++++++++++++++ tests/gather-int32-3.ispc | 21 +++++++++++++++++++++ tests/gather-int32-4.ispc | 21 +++++++++++++++++++++ tests/gather-int32-5.ispc | 19 +++++++++++++++++++ tests/gather-int32-6.ispc | 19 +++++++++++++++++++ tests/gather-int32-7.ispc | 23 +++++++++++++++++++++++ tests/gather-int32-8.ispc | 23 +++++++++++++++++++++++ tests/gather-int64-1.ispc | 17 +++++++++++++++++ tests/gather-int64-2.ispc | 17 +++++++++++++++++ tests/gather-int64-3.ispc | 21 +++++++++++++++++++++ tests/gather-int64-4.ispc | 21 +++++++++++++++++++++ tests/gather-int64-5.ispc | 19 +++++++++++++++++++ tests/gather-int64-6.ispc | 19 +++++++++++++++++++ tests/gather-int64-7.ispc | 23 +++++++++++++++++++++++ tests/gather-int64-8.ispc | 23 +++++++++++++++++++++++ tests/gather-int8-1.ispc | 24 +++++++++++------------- tests/gather-int8-2.ispc | 17 +++++++++++++++++ tests/gather-int8-3.ispc | 21 +++++++++++++++++++++ tests/gather-int8-4.ispc | 21 +++++++++++++++++++++ tests/gather-int8-5.ispc | 19 +++++++++++++++++++ tests/gather-int8-6.ispc | 19 +++++++++++++++++++ tests/gather-int8-7.ispc | 23 +++++++++++++++++++++++ tests/gather-int8-8.ispc | 23 +++++++++++++++++++++++ 48 files changed, 948 insertions(+), 26 deletions(-) create mode 100644 tests/gather-double-1.ispc create mode 100644 tests/gather-double-2.ispc create mode 100644 tests/gather-double-3.ispc create mode 100644 tests/gather-double-4.ispc create mode 100644 tests/gather-double-5.ispc create mode 100644 tests/gather-double-6.ispc create mode 100644 tests/gather-double-7.ispc create mode 100644 tests/gather-double-8.ispc create mode 100644 tests/gather-float-1.ispc create mode 100644 tests/gather-float-2.ispc create mode 100644 tests/gather-float-3.ispc create mode 100644 tests/gather-float-4.ispc create mode 100644 tests/gather-float-5.ispc create mode 100644 tests/gather-float-6.ispc create mode 100644 tests/gather-float-7.ispc create mode 100644 tests/gather-float-8.ispc create mode 100644 tests/gather-int16-2.ispc 
create mode 100644 tests/gather-int16-3.ispc create mode 100644 tests/gather-int16-4.ispc create mode 100644 tests/gather-int16-5.ispc create mode 100644 tests/gather-int16-6.ispc create mode 100644 tests/gather-int16-7.ispc create mode 100644 tests/gather-int16-8.ispc create mode 100644 tests/gather-int32-1.ispc create mode 100644 tests/gather-int32-2.ispc create mode 100644 tests/gather-int32-3.ispc create mode 100644 tests/gather-int32-4.ispc create mode 100644 tests/gather-int32-5.ispc create mode 100644 tests/gather-int32-6.ispc create mode 100644 tests/gather-int32-7.ispc create mode 100644 tests/gather-int32-8.ispc create mode 100644 tests/gather-int64-1.ispc create mode 100644 tests/gather-int64-2.ispc create mode 100644 tests/gather-int64-3.ispc create mode 100644 tests/gather-int64-4.ispc create mode 100644 tests/gather-int64-5.ispc create mode 100644 tests/gather-int64-6.ispc create mode 100644 tests/gather-int64-7.ispc create mode 100644 tests/gather-int64-8.ispc create mode 100644 tests/gather-int8-2.ispc create mode 100644 tests/gather-int8-3.ispc create mode 100644 tests/gather-int8-4.ispc create mode 100644 tests/gather-int8-5.ispc create mode 100644 tests/gather-int8-6.ispc create mode 100644 tests/gather-int8-7.ispc create mode 100644 tests/gather-int8-8.ispc diff --git a/tests/gather-double-1.ispc b/tests/gather-double-1.ispc new file mode 100644 index 00000000..64575545 --- /dev/null +++ b/tests/gather-double-1.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-double-2.ispc b/tests/gather-double-2.ispc new file mode 100644 index 00000000..78b9423a --- /dev/null +++ 
b/tests/gather-double-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-double-3.ispc b/tests/gather-double-3.ispc new file mode 100644 index 00000000..cfa32f21 --- /dev/null +++ b/tests/gather-double-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-double-4.ispc b/tests/gather-double-4.ispc new file mode 100644 index 00000000..d7ad2f5e --- /dev/null +++ b/tests/gather-double-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-double-5.ispc b/tests/gather-double-5.ispc new file mode 100644 index 00000000..3b97816a --- /dev/null +++ b/tests/gather-double-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform 
double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-double-6.ispc b/tests/gather-double-6.ispc new file mode 100644 index 00000000..1c464bd5 --- /dev/null +++ b/tests/gather-double-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-double-7.ispc b/tests/gather-double-7.ispc new file mode 100644 index 00000000..c73f3b4e --- /dev/null +++ b/tests/gather-double-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + double *ptr = (aFOO[0] == 1234) ? 
(double * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-double-8.ispc b/tests/gather-double-8.ispc new file mode 100644 index 00000000..52da874d --- /dev/null +++ b/tests/gather-double-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-float-1.ispc b/tests/gather-float-1.ispc new file mode 100644 index 00000000..18b3fd98 --- /dev/null +++ b/tests/gather-float-1.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-float-2.ispc b/tests/gather-float-2.ispc new file mode 100644 index 00000000..4f680814 --- /dev/null +++ b/tests/gather-float-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + 
RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-float-3.ispc b/tests/gather-float-3.ispc new file mode 100644 index 00000000..9e81cd06 --- /dev/null +++ b/tests/gather-float-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-float-4.ispc b/tests/gather-float-4.ispc new file mode 100644 index 00000000..4f114fee --- /dev/null +++ b/tests/gather-float-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-float-5.ispc b/tests/gather-float-5.ispc new file mode 100644 index 00000000..16f0e81e --- /dev/null +++ b/tests/gather-float-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + float *ptr = (aFOO[0] == 1234) ? 
(float * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-float-6.ispc b/tests/gather-float-6.ispc new file mode 100644 index 00000000..d1136f9a --- /dev/null +++ b/tests/gather-float-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-float-7.ispc b/tests/gather-float-7.ispc new file mode 100644 index 00000000..f5b09dc4 --- /dev/null +++ b/tests/gather-float-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-float-8.ispc b/tests/gather-float-8.ispc new file mode 100644 index 00000000..3708f063 --- /dev/null +++ b/tests/gather-float-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + float *ptr = (aFOO[0] == 1234) ? 
(float * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int16-1.ispc b/tests/gather-int16-1.ispc index e6bedd7f..89675185 100644 --- a/tests/gather-int16-1.ispc +++ b/tests/gather-int16-1.ispc @@ -1,19 +1,17 @@ + export uniform int width() { return programCount; } -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 x[programCount]; - x[programIndex] = programIndex; - int a = aFOO[programIndex]-1; - unsigned int16 v; - if (programIndex < 2) - v = x[a]; - else - v = 2; - RET[programIndex] = v; +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; } export void result(uniform float RET[]) { - RET[programIndex] = 2; - RET[0] = 0; - RET[1] = 1; + RET[programIndex] = 1 + programIndex; } diff --git a/tests/gather-int16-2.ispc b/tests/gather-int16-2.ispc new file mode 100644 index 00000000..74fdab8c --- /dev/null +++ b/tests/gather-int16-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int16-3.ispc b/tests/gather-int16-3.ispc new file mode 100644 index 00000000..a197f754 --- /dev/null +++ b/tests/gather-int16-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 
a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int16-4.ispc b/tests/gather-int16-4.ispc new file mode 100644 index 00000000..db9a7217 --- /dev/null +++ b/tests/gather-int16-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int16-5.ispc b/tests/gather-int16-5.ispc new file mode 100644 index 00000000..8d6ced77 --- /dev/null +++ b/tests/gather-int16-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int16-6.ispc b/tests/gather-int16-6.ispc new file mode 100644 index 00000000..8d740856 --- /dev/null +++ b/tests/gather-int16-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int16 *ptr = (aFOO[0] == 1234) ? 
(int16 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int16-7.ispc b/tests/gather-int16-7.ispc new file mode 100644 index 00000000..a6236af5 --- /dev/null +++ b/tests/gather-int16-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int16-8.ispc b/tests/gather-int16-8.ispc new file mode 100644 index 00000000..66bc8e89 --- /dev/null +++ b/tests/gather-int16-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int16 *ptr = (aFOO[0] == 1234) ? 
(int16 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int32-1.ispc b/tests/gather-int32-1.ispc new file mode 100644 index 00000000..2df1dd7e --- /dev/null +++ b/tests/gather-int32-1.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int32-2.ispc b/tests/gather-int32-2.ispc new file mode 100644 index 00000000..61f5a024 --- /dev/null +++ b/tests/gather-int32-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int32-3.ispc b/tests/gather-int32-3.ispc new file mode 100644 index 00000000..e87eab33 --- /dev/null +++ b/tests/gather-int32-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int32-4.ispc 
b/tests/gather-int32-4.ispc new file mode 100644 index 00000000..8a6d7bb6 --- /dev/null +++ b/tests/gather-int32-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int32-5.ispc b/tests/gather-int32-5.ispc new file mode 100644 index 00000000..573666c7 --- /dev/null +++ b/tests/gather-int32-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int32-6.ispc b/tests/gather-int32-6.ispc new file mode 100644 index 00000000..0d59a8fc --- /dev/null +++ b/tests/gather-int32-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int *ptr = (aFOO[0] == 1234) ? 
(int * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int32-7.ispc b/tests/gather-int32-7.ispc new file mode 100644 index 00000000..ebc724e5 --- /dev/null +++ b/tests/gather-int32-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int32-8.ispc b/tests/gather-int32-8.ispc new file mode 100644 index 00000000..03cd7c8b --- /dev/null +++ b/tests/gather-int32-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int *ptr = (aFOO[0] == 1234) ? 
(int * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int64-1.ispc b/tests/gather-int64-1.ispc new file mode 100644 index 00000000..fe3d171b --- /dev/null +++ b/tests/gather-int64-1.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int64-2.ispc b/tests/gather-int64-2.ispc new file mode 100644 index 00000000..7a00439b --- /dev/null +++ b/tests/gather-int64-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int64-3.ispc b/tests/gather-int64-3.ispc new file mode 100644 index 00000000..7ddd559c --- /dev/null +++ b/tests/gather-int64-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int64-4.ispc 
b/tests/gather-int64-4.ispc new file mode 100644 index 00000000..92e004e3 --- /dev/null +++ b/tests/gather-int64-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int64-5.ispc b/tests/gather-int64-5.ispc new file mode 100644 index 00000000..76d95f2d --- /dev/null +++ b/tests/gather-int64-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int64-6.ispc b/tests/gather-int64-6.ispc new file mode 100644 index 00000000..9deaaa80 --- /dev/null +++ b/tests/gather-int64-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int64 *ptr = (aFOO[0] == 1234) ? 
(int64 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int64-7.ispc b/tests/gather-int64-7.ispc new file mode 100644 index 00000000..52df9d19 --- /dev/null +++ b/tests/gather-int64-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int64-8.ispc b/tests/gather-int64-8.ispc new file mode 100644 index 00000000..5cfa621b --- /dev/null +++ b/tests/gather-int64-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int64 *ptr = (aFOO[0] == 1234) ? 
(int64 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int8-1.ispc b/tests/gather-int8-1.ispc index 305b12ca..43961ff2 100644 --- a/tests/gather-int8-1.ispc +++ b/tests/gather-int8-1.ispc @@ -1,19 +1,17 @@ + export uniform int width() { return programCount; } -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 x[programCount]; - x[programIndex] = programIndex; - int a = aFOO[programIndex]-1; - unsigned int8 v; - if (programIndex < 2) - v = x[a]; - else - v = 2; - RET[programIndex] = v; +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; } export void result(uniform float RET[]) { - RET[programIndex] = 2; - RET[0] = 0; - RET[1] = 1; + RET[programIndex] = 1 + programIndex; } diff --git a/tests/gather-int8-2.ispc b/tests/gather-int8-2.ispc new file mode 100644 index 00000000..8e853d0e --- /dev/null +++ b/tests/gather-int8-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int8-3.ispc b/tests/gather-int8-3.ispc new file mode 100644 index 00000000..5650ab7f --- /dev/null +++ b/tests/gather-int8-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for 
(uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int8-4.ispc b/tests/gather-int8-4.ispc new file mode 100644 index 00000000..92386d5a --- /dev/null +++ b/tests/gather-int8-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int8-5.ispc b/tests/gather-int8-5.ispc new file mode 100644 index 00000000..d0440d77 --- /dev/null +++ b/tests/gather-int8-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int8-6.ispc b/tests/gather-int8-6.ispc new file mode 100644 index 00000000..840b309c --- /dev/null +++ b/tests/gather-int8-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int8 *ptr = (aFOO[0] == 1234) ? 
(int8 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int8-7.ispc b/tests/gather-int8-7.ispc new file mode 100644 index 00000000..c0190db0 --- /dev/null +++ b/tests/gather-int8-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int8-8.ispc b/tests/gather-int8-8.ispc new file mode 100644 index 00000000..3c5cd41e --- /dev/null +++ b/tests/gather-int8-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} From 371d4be8efb1b7801e7081ad1a2a995a6930ad6d Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 12 Jul 2012 14:10:59 -0700 Subject: [PATCH 10/15] Fix bugs in detection of Ivy Bridge systems. We were incorrectly characterizing them as basic AVX1 without further extensions, due to a bug in the logic to check CPU features. 
--- builtins/dispatch.ll | 70 +++++++++++++++++++++----------------------- ispc.cpp | 26 ++++++++-------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll index b9db3543..f1d5a969 100644 --- a/builtins/dispatch.ll +++ b/builtins/dispatch.ll @@ -76,18 +76,19 @@ declare void @abort() noreturn ;; /* NOTE: the values returned below must be the same as the ;; corresponding enumerant values in Target::ISA. */ ;; if ((info[2] & (1 << 28)) != 0) { -;; // AVX1 for sure. Do we have AVX2? -;; // Call cpuid with eax=7, ecx=0 -;; __cpuid_count(info, 7, 0); -;; if ((info[1] & (1 << 5)) != 0) -;; return 4; // AVX2 -;; else { -;; if ((info[2] & (1 << 29)) != 0 && // F16C -;; (info[2] & (1 << 30)) != 0) // RDRAND -;; return 3; // AVX1 on IVB -;; else -;; return 2; // AVX1 -;; } +;; if ((info[2] & (1 << 29)) != 0 && // F16C +;; (info[2] & (1 << 30)) != 0) { // RDRAND +;; // So far, so good. AVX2? +;; // Call cpuid with eax=7, ecx=0 +;; int info2[4]; +;; __cpuid_count(info2, 7, 0); +;; if ((info2[1] & (1 << 5)) != 0) +;; return 4; +;; else +;; return 3; +;; } +;; // Regular AVX +;; return 2; ;; } ;; else if ((info[2] & (1 << 19)) != 0) ;; return 1; // SSE4 @@ -104,40 +105,37 @@ entry: %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3 %and = and i32 %asmresult5.i, 268435456 %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.else14, label %if.then + br i1 %cmp, label %if.else13, label %if.then if.then: ; preds = %entry - %1 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind - %asmresult4.i29 = extractvalue { i32, i32, i32, i32 } %1, 1 - %and3 = and i32 %asmresult4.i29, 32 - %cmp4 = icmp eq i32 %and3, 0 - br i1 %cmp4, label %if.else, label %return + %1 = and i32 %asmresult5.i, 1610612736 + %2 = icmp eq i32 %1, 1610612736 + br i1 %2, label %if.then7, label %return 
-if.else: ; preds = %if.then - %asmresult5.i30 = extractvalue { i32, i32, i32, i32 } %1, 2 - %2 = and i32 %asmresult5.i30, 1610612736 - %3 = icmp eq i32 %2, 1610612736 - br i1 %3, label %return, label %if.else13 - -if.else13: ; preds = %if.else +if.then7: ; preds = %if.then + %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind + %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1 + %and10 = lshr i32 %asmresult4.i28, 5 + %4 = and i32 %and10, 1 + %5 = add i32 %4, 3 br label %return -if.else14: ; preds = %entry - %and16 = and i32 %asmresult5.i, 524288 - %cmp17 = icmp eq i32 %and16, 0 - br i1 %cmp17, label %if.else19, label %return +if.else13: ; preds = %entry + %and15 = and i32 %asmresult5.i, 524288 + %cmp16 = icmp eq i32 %and15, 0 + br i1 %cmp16, label %if.else18, label %return -if.else19: ; preds = %if.else14 - %and21 = and i32 %asmresult6.i, 67108864 - %cmp22 = icmp eq i32 %and21, 0 - br i1 %cmp22, label %if.else24, label %return +if.else18: ; preds = %if.else13 + %and20 = and i32 %asmresult6.i, 67108864 + %cmp21 = icmp eq i32 %and20, 0 + br i1 %cmp21, label %if.else23, label %return -if.else24: ; preds = %if.else19 +if.else23: ; preds = %if.else18 tail call void @abort() noreturn nounwind unreachable -return: ; preds = %if.else19, %if.else14, %if.else13, %if.else, %if.then - %retval.0 = phi i32 [ 2, %if.else13 ], [ 4, %if.then ], [ 3, %if.else ], [ 1, %if.else14 ], [ 0, %if.else19 ] +return: ; preds = %if.else18, %if.else13, %if.then7, %if.then + %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ] ret i32 %retval.0 } diff --git a/ispc.cpp b/ispc.cpp index 8fb8f0f5..15c8d4ae 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -94,20 +94,22 @@ lGetSystemISA() { int info[4]; __cpuid(info, 1); - if ((info[2] & (1 << 28)) != 0) { - // AVX1 for sure. Do we have AVX2? 
- // Call cpuid with eax=7, ecx=0 - __cpuidex(info, 7, 0); - if ((info[1] & (1 << 5)) != 0) - return "avx2"; - else { - // ivybridge? - if ((info[2] & (1 << 29)) != 0 && // F16C - (info[2] & (1 << 30)) != 0) // RDRAND - return "avx1.1"; + if ((info[2] & (1 << 28)) != 0) { // AVX + // AVX1 for sure.... + // Ivy Bridge? + if ((info[2] & (1 << 29)) != 0 && // F16C + (info[2] & (1 << 30)) != 0) { // RDRAND + // So far, so good. AVX2? + // Call cpuid with eax=7, ecx=0 + int info2[4]; + __cpuidex(info2, 7, 0); + if ((info2[1] & (1 << 5)) != 0) + return "avx2"; else - return "avx"; + return "avx1.1"; } + // Regular AVX + return "avx"; } else if ((info[2] & (1 << 19)) != 0) return "sse4"; From 9a1932eaf765d5f279d605a14138ce923f730171 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:01:04 -0700 Subject: [PATCH 11/15] Only set gcc's "-msse4.2", etc, option when compiling for generic targets. We don't need it when ispc is just generating an object file directly, and gcc on OS X doesn't recognize -mavx. 
--- run_tests.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/run_tests.py b/run_tests.py index 0cb78ed6..ea53b432 100755 --- a/run_tests.py +++ b/run_tests.py @@ -265,11 +265,9 @@ def run_test(filename): gcc_arch = '-m64' gcc_isa="" - if options.target == 'sse2' or options.target == 'sse2-x2': - gcc_isa = '-msse3' - if options.target == 'sse4' or options.target == 'sse4-x2' or options.target == 'generic-4': + if options.target == 'generic-4': gcc_isa = '-msse4.2' - if options.target == 'avx' or options.target == 'avx-x2' or options.target == 'generic-8': + if options.target == 'generic-8': gcc_isa = '-mavx' if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \ and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): From 98b2e0e426ac008151dd59cf4bf1650683015073 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:14:10 -0700 Subject: [PATCH 12/15] Fixes for intrinsics unsupported in earlier LLVM versions. Specifically, don't use the half/float conversion routines with LLVM 3.0, and don't try to use RDRAND with anything before LLVM 3.2. 
--- builtins/target-avx11-x2.ll | 10 ++++++++-- builtins/target-avx11.ll | 8 +++++++- builtins/target-avx2-x2.ll | 9 +++++++-- builtins/target-avx2.ll | 8 +++++++- ispc.cpp | 18 ++++++++++++++++++ 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll index cdb83726..1aa6345c 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -31,7 +31,9 @@ include(`target-avx-x2.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -73,6 +75,9 @@ gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... +', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -123,4 +128,5 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } - +' +) diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index d3ab9f13..fea0a7c2 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -31,7 +31,9 @@ include(`target-avx.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -73,6 +75,9 @@ gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -107,3 +112,4 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } +') diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 1d2a2093..6572783f 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -31,7 +31,9 @@ include(`target-avx-x2.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -68,6 +70,9 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -118,7 +123,7 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } - +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index 45496779..e36a74de 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -31,7 +31,9 @@ include(`target-avx.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -68,6 +70,9 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -102,6 +107,7 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/ispc.cpp b/ispc.cpp index 15c8d4ae..636dfdd4 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -328,8 +328,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; t->maskingIsFree = false; t->maskBitCount = 32; +#if !defined(LLVM_3_0) + // LLVM 3.1+ only t->hasHalf = true; + #if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; + #endif +#endif } else if (!strcasecmp(isa, "avx1.1-x2")) { t->isa = Target::AVX11; @@ -338,8 +344,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; t->maskingIsFree = false; t->maskBitCount = 32; +#if !defined(LLVM_3_0) + // LLVM 3.1+ only t->hasHalf = true; + #if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; + #endif +#endif } #ifndef LLVM_3_0 else if (!strcasecmp(isa, "avx2")) { @@ -350,7 +362,10 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = false; t->maskBitCount = 32; t->hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; +#endif } else if (!strcasecmp(isa, "avx2-x2")) { t->isa = Target::AVX2; @@ -360,7 +375,10 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = false; t->maskBitCount = 32; t->hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; +#endif } #endif // !LLVM_3_0 else { From daf5aa8e8b0a0dfcaa75753ad9be4a5823d0200a Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:14:53 -0700 Subject: [PATCH 
13/15] Run inst combine before memory optimizations. We were previously emitting 64-bit indexing for some gathers where 32-bit was actually fine, due to some adds of constant vectors that hadn't been simplified to the result. --- opt.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/opt.cpp b/opt.cpp index 824c5bc7..a623466b 100644 --- a/opt.cpp +++ b/opt.cpp @@ -447,6 +447,7 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target.vectorWidth > 1) { + optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateImproveMemoryOpsPass()); } if (!g->opt.disableMaskAllOnOptimizations) { @@ -489,6 +490,7 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target.vectorWidth > 1) { + optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateImproveMemoryOpsPass()); if (g->opt.disableCoalescing == false && @@ -507,6 +509,7 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target.vectorWidth > 1) { + optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateImproveMemoryOpsPass()); } From 984a68c3a9657737846e8fe86c117a3294961824 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:20:42 -0700 Subject: [PATCH 14/15] Rename gen_gather() macro to gen_gather_factored() --- builtins/target-avx1-x2.ll | 12 ++++++------ builtins/target-avx1.ll | 12 ++++++------ builtins/target-avx2-x2.ll | 12 ++++++------ builtins/target-avx2.ll | 12 ++++++------ builtins/target-generic-1.ll | 12 ++++++------ builtins/target-sse2-x2.ll | 12 ++++++------ builtins/target-sse2.ll | 12 ++++++------ builtins/target-sse4-x2.ll | 12 ++++++------ builtins/target-sse4.ll | 12 ++++++------ builtins/util.m4 | 2 +- 10 files changed, 55 insertions(+), 55 deletions(-) mode change 100755 => 100644 builtins/target-generic-1.ll diff --git a/builtins/target-avx1-x2.ll 
b/builtins/target-avx1-x2.ll index e06134d9..562d7ff0 100644 --- a/builtins/target-avx1-x2.ll +++ b/builtins/target-avx1-x2.ll @@ -73,9 +73,9 @@ declare @__float_to_half_varying( %v) nounwind read ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index 1b47955a..9c86cab8 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -73,9 +73,9 @@ declare @__float_to_half_varying( %v) nounwind read ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 6572783f..a2a4fd34 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -128,9 +128,9 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index e36a74de..4b4b38c5 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -112,9 +112,9 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll old mode 100755 new mode 100644 index 5e82b4f1..c5937c8e --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -34,12 +34,12 @@ masked_load(double, 8) ; define these with the macros from stdlib.m4 -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 0260971a..ad19f899 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -444,12 +444,12 @@ masked_load(double, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 5f40d1eb..6558adc8 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -575,12 +575,12 @@ masked_load(double, 8) ; define these with the macros from stdlib.m4 -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) 
+gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ef3a7746..0f7cb355 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -371,12 +371,12 @@ masked_load(double, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index ee57f6bd..b00bcfd6 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -474,12 +474,12 @@ masked_load(double, 8) ; define these with the macros from stdlib.m4 -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/util.m4 b/builtins/util.m4 index 614ac998..d29bcbca 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -3472,7 +3472,7 @@ pl_done: ;; $1: scalar type for which to generate functions to do gathers ; vec width, type -define(`gen_gather', ` +define(`gen_gather_factored', ` ;; Define the utility function to do the gather operation for a single element ;; of the type define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, From 6a410fc30e418d2f166558d95af5981c6a64abdc Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:29:05 -0700 Subject: [PATCH 15/15] Emit gather instructions for the AVX2 targets. Issue #308. 
--- builtins/target-avx2-x2.ll | 427 ++++++++++++++++++++++++++++++++++++- builtins/target-avx2.ll | 315 ++++++++++++++++++++++++++- builtins/util.m4 | 93 +++++--- ispc.cpp | 2 + 4 files changed, 808 insertions(+), 29 deletions(-) diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index a2a4fd34..053fd078 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -29,6 +29,10 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') + include(`target-avx-x2.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', @@ -128,9 +132,430 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather +declare void @llvm.trap() noreturn nounwind + +; $1: type +; $2: var base name +define(`extract_4s', ` + %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> + %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> + %$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> + %$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> +') + +; $1: type +; $2: var base name +define(`extract_8s', ` + %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, + <8 x i32> + %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, + <8 x i32> +') + +; $1: element type +; $2: ret name +; $3: v1 +; $4: v2 +define(`assemble_8s', ` + %$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4, + <16 x i32> +') + +; $1: element type +; $2: ret name +; $3: v1 +; $4: v2 +; $5: v3 +; $6: v4 +define(`assemble_4s', ` + %$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4, + <8 x i32> + %$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6, + <8 x i32> + assemble_8s($1, $2, $2_1, $2_2) +') + +ifelse(LLVM_VERSION, `LLVM_3_0', ` gen_gather_factored(i8) gen_gather_factored(i16) 
gen_gather_factored(i32) gen_gather_factored(float) gen_gather_factored(i64) -gen_gather_factored(double) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + +gen_gather(i8) +gen_gather(i16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + +declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr, + <8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + extract_8s(i32, offsets) + extract_8s(i32, vecmask) + + %v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr, + <8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8) + %v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr, + <8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8) + + assemble_8s(i32, v, v1, v2) + + ret <16 x i32> %v +} + + +define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + extract_4s(i32, vecmask) + extract_4s(i64, offsets) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8) + %v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8) + 
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8) + + assemble_4s(i32, v, v1, v2, v3, v4) + + ret <16 x i32> %v +} + + +define <16 x i32> @__gather32_i32(<16 x i32> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + extract_8s(i32, ptrs) + extract_8s(i32, vecmask) + + %v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null, + <8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1) + %v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null, + <8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1) + + assemble_8s(i32, v, v1, v2) + + ret <16 x i32> %v +} + + +define <16 x i32> @__gather64_i32(<16 x i64> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + extract_4s(i64, ptrs) + extract_4s(i32, vecmask) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1) + %v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1) + %v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1) + + assemble_4s(i32, v, v1, v2, v3, v4) + + ret <16 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr, + <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <16 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <16 x i32> %offsets, + <16 x i32> %vecmask) nounwind readonly 
alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_8s(i32, offsets) + extract_8s(float, mask) + + %v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr, + <8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8) + %v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr, + <8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8) + + assemble_8s(float, v, v1, v2) + + ret <16 x float> %v +} + + +define <16 x float> @__gather_base_offsets64_float(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_4s(i64, offsets) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8) + %v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8) + %v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8) + + assemble_4s(float, v, v1, v2, v3, v4) + + ret <16 x float> %v +} + + +define <16 x float> @__gather32_float(<16 x i32> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_8s(float, mask) + extract_8s(i32, ptrs) + + %v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null, + <8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1) + %v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null, + <8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1) + + assemble_8s(float, v, v1, v2) + + ret <16 x float> %v 
+} + + +define <16 x float> @__gather64_float(<16 x i64> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_4s(i64, ptrs) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1) + %v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1) + %v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1) + + assemble_4s(float, v, v1, v2, v3, v4) + + ret <16 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <16 x i32> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i32, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> 
undef, i8 * %ptr, + <4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + + +define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i64, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + + +define <16 x i64> @__gather32_i64(<16 x i32> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i32, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + +define <16 x i64> @__gather64_i64(<16 x i64> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <16 x i32> 
%mask32 to <16 x i64> + extract_4s(i64, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <16 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <16 x i32> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i32, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8) + %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_4, <4 
x double> %vecmask_4, i8 %scale8) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + + +define <16 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i64, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8) + %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + + +define <16 x double> @__gather32_double(<16 x i32> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i32, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1) + %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1) + %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1) + + 
assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + + +define <16 x double> @__gather64_double(<16 x i64> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i64, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1) + %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1) + %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + +') diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index 4b4b38c5..f4a0ee07 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -29,6 +29,10 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') + include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', @@ -112,9 +116,318 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather +declare void @llvm.trap() noreturn nounwind + +define(`extract_4s', ` + %$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> + %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> +') + +ifelse(LLVM_VERSION, `LLVM_3_0', ` gen_gather_factored(i8) gen_gather_factored(i16) gen_gather_factored(i32) gen_gather_factored(float) gen_gather_factored(i64) -gen_gather_factored(double) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + +gen_gather(i8) +gen_gather(i16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + +declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr, + <8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr, + <8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8) + + ret <8 x i32> %v +} + + +define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + extract_4s(i32, vecmask) + extract_4s(i64, offsets) + + %v1 = 
call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x i32> %v1, <4 x i32> %v2, + <8 x i32> + ret <8 x i32> %v +} + + +define <8 x i32> @__gather32_i32(<8 x i32> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null, + <8 x i32> %ptrs, <8 x i32> %vecmask, i8 1) + ret <8 x i32> %v +} + + +define <8 x i32> @__gather64_i32(<8 x i64> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + extract_4s(i64, ptrs) + extract_4s(i32, vecmask) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1) + + %v = shufflevector <4 x i32> %v1, <4 x i32> %v2, + <8 x i32> + ret <8 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr, + <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <8 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <8 x i32> %vecmask to <8 x float> + + %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr, + <8 x i32> %offsets, <8 x float> %mask, i8 %scale8) + + ret <8 x float> %v +} + + +define <8 x float> @__gather_base_offsets64_float(i8 
* %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <8 x i32> %vecmask to <8 x float> + extract_4s(i64, offsets) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8) + + %v = shufflevector <4 x float> %v1, <4 x float> %v2, + <8 x i32> + ret <8 x float> %v +} + + +define <8 x float> @__gather32_float(<8 x i32> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <8 x i32> %vecmask to <8 x float> + + %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null, + <8 x i32> %ptrs, <8 x float> %mask, i8 1) + + ret <8 x float> %v +} + + +define <8 x float> @__gather64_float(<8 x i64> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <8 x i32> %vecmask to <8 x float> + extract_4s(i64, ptrs) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1) + + %v = shufflevector <4 x float> %v1, <4 x float> %v2, + <8 x i32> + ret <8 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <8 x i32> 
%offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + extract_4s(i32, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + + +define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + extract_4s(i64, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + + +define <8 x i64> @__gather32_i64(<8 x i32> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + + extract_4s(i32, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + + +define <8 x i64> @__gather64_i64(<8 x i64> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + extract_4s(i64, ptrs) + extract_4s(i64, vecmask) + + %v1 
= call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <8 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i32, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + ret <8 x double> %v +} + +define <8 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i64, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x double> %vecmask_1, 
i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + ret <8 x double> %v +} + +define <8 x double> @__gather32_double(<8 x i32> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i32, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + ret <8 x double> %v +} + +define <8 x double> @__gather64_double(<8 x i64> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i64, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + + ret <8 x double> %v +} + +') diff --git a/builtins/util.m4 b/builtins/util.m4 index d29bcbca..a97336a7 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -3471,6 +3471,40 @@ pl_done: ;; ;; $1: scalar type for which to generate functions to do gathers +define(`gen_gather_general', ` +; fully general 32-bit gather, takes array of pointers encoded as vector of i32s +define @__gather32_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + 
%ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} + +; fully general 64-bit gather, takes array of pointers encoded as vector of i64s +define @__gather64_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} +') + ; vec width, type define(`gen_gather_factored', ` ;; Define the utility function to do the gather operation for a single element @@ -3582,37 +3616,42 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, %ret`'eval(WIDTH-1) } -; fully general 32-bit gather, takes array of pointers encoded as vector of i32s -define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { - %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` - %iptr_LANE_ID = extractelement %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * - %val_LANE_ID = load $1 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE - store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID - ') +gen_gather_general($1) +' +) - %ret = load * %ret_ptr - ret %ret +; vec width, type +define(`gen_gather', ` + +gen_gather_factored($1) + +define +@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale_vec = bitcast i32 %offset_scale to <1 x i32> + %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 
> + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, + zeroinitializer, %vecmask) + ret %v } -; fully general 64-bit gather, takes array of pointers encoded as vector of i32s -define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { - %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` - %iptr_LANE_ID = extractelement %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * - %val_LANE_ID = load $1 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE - store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID - ') - - %ret = load * %ret_ptr - ret %ret +define +@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale64 = zext i32 %offset_scale to i64 + %scale_vec = bitcast i64 %scale64 to <1 x i64> + %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, + i32 1, zeroinitializer, %vecmask) + ret %v } + ' ) diff --git a/ispc.cpp b/ispc.cpp index 636dfdd4..fac83bbe 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -365,6 +365,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, #if !defined(LLVM_3_1) // LLVM 3.2+ only t->hasRand = true; + t->hasGather = true; #endif } else if (!strcasecmp(isa, "avx2-x2")) { @@ -378,6 +379,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, #if !defined(LLVM_3_1) // LLVM 3.2+ only t->hasRand = true; + t->hasGather = true; #endif } #endif // !LLVM_3_0