Jean-Luc Duprat
2012-07-17 11:46:30 -07:00
82 changed files with 3708 additions and 885 deletions


@@ -476,6 +476,9 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_nt",
"__rcp_uniform_float",
"__rcp_varying_float",
"__rdrand_i16",
"__rdrand_i32",
"__rdrand_i64",
"__reduce_add_double",
"__reduce_add_float",
"__reduce_add_int32",


@@ -76,18 +76,19 @@ declare void @abort() noreturn
;; /* NOTE: the values returned below must be the same as the
;; corresponding enumerant values in Target::ISA. */
;; if ((info[2] & (1 << 28)) != 0) {
;; // AVX1 for sure. Do we have AVX2?
;; // Call cpuid with eax=7, ecx=0
;; __cpuid_count(info, 7, 0);
;; if ((info[1] & (1 << 5)) != 0)
;; return 4; // AVX2
;; else {
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) // RDRAND
;; return 3; // AVX1 on IVB
;; else
;; return 2; // AVX1
;; }
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) { // RDRAND
;; // So far, so good. AVX2?
;; // Call cpuid with eax=7, ecx=0
;; int info2[4];
;; __cpuid_count(info2, 7, 0);
;; if ((info2[1] & (1 << 5)) != 0)
;; return 4;
;; else
;; return 3;
;; }
;; // Regular AVX
;; return 2;
;; }
;; else if ((info[2] & (1 << 19)) != 0)
;; return 1; // SSE4
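;;
;; For reference, here is the same dispatch as a self-contained C sketch.
;; Hedged: the __get_cpuid/__cpuid_count used here are GCC/Clang's <cpuid.h>
;; helpers rather than the ones assumed by the pseudocode above, the function
;; name is illustrative, and the SSE4/SSE2 branches follow the IR below
;; (ecx bit 19 and edx bit 26).
;;
;;   #include <cpuid.h>
;;   #include <stdlib.h>
;;
;;   static int get_system_isa(void) {
;;       unsigned int eax, ebx, ecx, edx;
;;       __get_cpuid(1, &eax, &ebx, &ecx, &edx);
;;       if (ecx & (1u << 28)) {                        /* AVX1 for sure */
;;           if ((ecx & (1u << 29)) != 0 &&             /* F16C */
;;               (ecx & (1u << 30)) != 0) {             /* RDRAND */
;;               unsigned int a7, b7, c7, d7;
;;               __cpuid_count(7, 0, a7, b7, c7, d7);   /* eax=7, ecx=0 */
;;               return (b7 & (1u << 5)) ? 4 : 3;       /* AVX2 : AVX1 on IVB */
;;           }
;;           return 2;                                  /* regular AVX */
;;       } else if (ecx & (1u << 19)) {
;;           return 1;                                  /* SSE4 */
;;       } else if (edx & (1u << 26)) {
;;           return 0;                                  /* SSE2 */
;;       }
;;       abort();                                       /* unsupported CPU */
;;   }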
@@ -104,40 +105,37 @@ entry:
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
%and = and i32 %asmresult5.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else14, label %if.then
br i1 %cmp, label %if.else13, label %if.then
if.then: ; preds = %entry
%1 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i29 = extractvalue { i32, i32, i32, i32 } %1, 1
%and3 = and i32 %asmresult4.i29, 32
%cmp4 = icmp eq i32 %and3, 0
br i1 %cmp4, label %if.else, label %return
%1 = and i32 %asmresult5.i, 1610612736
%2 = icmp eq i32 %1, 1610612736
br i1 %2, label %if.then7, label %return
if.else: ; preds = %if.then
%asmresult5.i30 = extractvalue { i32, i32, i32, i32 } %1, 2
%2 = and i32 %asmresult5.i30, 1610612736
%3 = icmp eq i32 %2, 1610612736
br i1 %3, label %return, label %if.else13
if.else13: ; preds = %if.else
if.then7: ; preds = %if.then
%3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
%and10 = lshr i32 %asmresult4.i28, 5
%4 = and i32 %and10, 1
%5 = add i32 %4, 3
br label %return
if.else14: ; preds = %entry
%and16 = and i32 %asmresult5.i, 524288
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else13: ; preds = %entry
%and15 = and i32 %asmresult5.i, 524288
%cmp16 = icmp eq i32 %and15, 0
br i1 %cmp16, label %if.else18, label %return
if.else19: ; preds = %if.else14
%and21 = and i32 %asmresult6.i, 67108864
%cmp22 = icmp eq i32 %and21, 0
br i1 %cmp22, label %if.else24, label %return
if.else18: ; preds = %if.else13
%and20 = and i32 %asmresult6.i, 67108864
%cmp21 = icmp eq i32 %and20, 0
br i1 %cmp21, label %if.else23, label %return
if.else24: ; preds = %if.else19
if.else23: ; preds = %if.else18
tail call void @abort() noreturn nounwind
unreachable
return: ; preds = %if.else19, %if.else14, %if.else13, %if.else, %if.then
%retval.0 = phi i32 [ 2, %if.else13 ], [ 4, %if.then ], [ 3, %if.else ], [ 1, %if.else14 ], [ 0, %if.else19 ]
return: ; preds = %if.else18, %if.else13, %if.then7, %if.then
%retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
ret i32 %retval.0
}


@@ -31,6 +31,8 @@
include(`target-avx-x2.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -71,9 +73,9 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)


@@ -31,6 +31,8 @@
include(`target-avx.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -71,9 +73,9 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)


@@ -29,13 +29,55 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`NO_HALF_DECLARES', `1')
include(`target-avx-x2.ll')
include(`target-avx1-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -86,4 +128,5 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
'
)


@@ -29,13 +29,55 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`NO_HALF_DECLARES', `1')
include(`target-avx.ll')
include(`target-avx1.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -70,3 +112,4 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
include(`target-avx-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -66,6 +74,9 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -116,14 +127,435 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
declare void @llvm.trap() noreturn nounwind
; $1: type
; $2: var base name
define(`extract_4s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')
; $1: type
; $2: var base name
define(`extract_8s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
define(`assemble_8s', `
%$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
; $5: v3
; $6: v4
define(`assemble_4s', `
%$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
assemble_8s($1, $2, $2_1, $2_2)
')
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
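;; For intuition, a per-lane C model of these gather intrinsics, assuming the
;; usual vpgather semantics: a lane is loaded only when the sign bit of its
;; mask element is set, and otherwise passes the %target value through (the
;; model's names are ours, not LLVM's):
;;
;;   #include <stdint.h>
;;   static void gather_d_d_model(int32_t dst[], const int32_t target[],
;;                                const uint8_t *ptr, const int32_t idx[],
;;                                const int32_t mask[], int scale, int n) {
;;       for (int i = 0; i < n; ++i)                  /* n = 8 or 4 lanes */
;;           dst[i] = (mask[i] < 0)                   /* sign bit: active */
;;               ? *(const int32_t *)(ptr + (int64_t)scale * idx[i])
;;               : target[i];
;;   }
;;
;; This is why the sign-extended ispc execution mask below can be passed
;; directly as the intrinsics' mask operand.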
define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_8s(i32, offsets)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_8s(i32, ptrs)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(i32, offsets)
extract_8s(float, mask)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
define <16 x float> @__gather32_float(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(float, mask)
extract_8s(i32, ptrs)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather64_float(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <16 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather32_double(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather64_double(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
')


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
include(`target-avx.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -66,6 +74,9 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -100,13 +111,323 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
declare void @llvm.trap() noreturn nounwind
define(`extract_4s', `
%$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
')
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8)
ret <8 x i32> %v
}
define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
define <8 x i32> @__gather32_i32(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs, <8 x i32> %vecmask, i8 1)
ret <8 x i32> %v
}
define <8 x i32> @__gather64_i32(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <8 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x float> %mask, i8 %scale8)
ret <8 x float> %v
}
define <8 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
define <8 x float> @__gather32_float(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs, <8 x float> %mask, i8 1)
ret <8 x float> %v
}
define <8 x float> @__gather64_float(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather32_i64(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather64_i64(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <8 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather32_double(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather64_double(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
')

builtins/target-generic-1.ll Executable file → Normal file

@@ -34,12 +34,12 @@ masked_load(double, 8)
; define these with the macros from stdlib.m4
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -32,11 +32,15 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32";
define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')
include(`util.m4')
stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle
@@ -334,19 +338,19 @@ define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
;; gather/scatter
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,


@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp


@@ -444,12 +444,12 @@ masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -575,12 +575,12 @@ masked_load(double, 8)
; define these with the macros from stdlib.m4
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats


@@ -371,12 +371,12 @@ masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -474,12 +474,12 @@ masked_load(double, 8)
; define these with the macros from stdlib.m4
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -1579,7 +1579,7 @@ declare void @__pseudo_masked_store_double(<WIDTH x double> * nocapture, <WIDTH
; Declare the pseudo-gather functions. When the ispc front-end needs
; to perform a gather, it generates a call to one of these functions,
; which have signatures:
; which ideally have these signatures:
;
; varying int8 __pseudo_gather_i8(varying int8 *, mask)
; varying int16 __pseudo_gather_i16(varying int16 *, mask)
@@ -1588,24 +1588,9 @@ declare void @__pseudo_masked_store_double(<WIDTH x double> * nocapture, <WIDTH
; varying int64 __pseudo_gather_i64(varying int64 *, mask)
; varying double __pseudo_gather_double(varying double *, mask)
;
; The GatherScatterFlattenOpt optimization pass finds these calls and then
; converts them to make calls to the following functions (when appropriate);
; these represent gathers from a common base pointer with offsets. The
; offset_scale factor scales the offsets before they are added to the base
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
; Then, the offset delta_value (guaranteed to be a compile-time constant value),
; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns
; that use the free 2/4/8 scaling available in x86 addressing calculations, and
; offset_delta feeds into the free offset calculation.
;
; varying int{8,16,32,float,64,double}
; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; Then, the GSImprovementsPass optimizations finds these and either
; converts them to native gather functions or converts them to vector
; loads, if equivalent.
; However, vectors of pointers were not legal in LLVM until recently, so
; instead, it emits calls to functions that either take vectors of int32s
; or int64s, depending on the compilation target.
declare <WIDTH x i8> @__pseudo_gather32_i8(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather32_i16(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
@@ -1621,31 +1606,106 @@ declare <WIDTH x float> @__pseudo_gather64_float(<WIDTH x i64>, <WIDTH x MASK>)
declare <WIDTH x i64> @__pseudo_gather64_i64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather64_double(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_i8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_i16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_i32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather_base_offsets32_float(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_i64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather_base_offsets32_double(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
; The ImproveMemoryOps optimization pass finds these calls and then
; tries to convert them into calls to gather functions that take a uniform
; base pointer and then a varying integer offset, when possible.
;
; For targets without a native gather instruction, it is best to factor the
; integer offsets like "{1/2/4/8} * varying_offset + constant_offset",
; where varying_offset includes non-compile-time-constant values, and
; constant_offset includes compile-time constant values. (The scalar loads
; generated in turn can then take advantage of the free offsetting and scaling
; by 1/2/4/8 that the x86 addressing modes offer.)
;
; varying int{8,16,32,float,64,double}
; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; For targets with a gather instruction, it is better to just factor them into
; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the
; offsets are int32/64 vectors.
;
; varying int{8,16,32,float,64,double}
; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; uniform int32 offset_scale, int{32,64} offsets, mask)
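;
; As a scalar picture of the two factorings above, with illustrative names and
; an i32 element type (not part of the ispc runtime):
;
;   #include <stdint.h>
;   /* factored form: base + offset_scale*varying_offset + offset_delta maps
;      onto x86's free scale-by-1/2/4/8 plus constant displacement */
;   static inline int32_t gather_lane_factored(const uint8_t *base,
;           int32_t offset_scale, int32_t varying_offset, int32_t offset_delta) {
;       return *(const int32_t *)(base + (int64_t)offset_scale * varying_offset
;                                 + offset_delta);
;   }
;   /* unfactored form for native-gather targets: base + offset_scale*offset */
;   static inline int32_t gather_lane(const uint8_t *base, int32_t offset_scale,
;                                     int64_t offset) {
;       return *(const int32_t *)(base + offset_scale * offset);
;   }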
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_i8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_i16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_i32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather_base_offsets64_float(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather_base_offsets64_double(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8>
@__pseudo_gather_factored_base_offsets32_i8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_factored_base_offsets32_i16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_factored_base_offsets32_i32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_factored_base_offsets32_float(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_factored_base_offsets32_i64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_factored_base_offsets32_double(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8>
@__pseudo_gather_factored_base_offsets64_i8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_factored_base_offsets64_i16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_factored_base_offsets64_i32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_factored_base_offsets64_float(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_factored_base_offsets64_i64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_factored_base_offsets64_double(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_base_offsets32_i16(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_base_offsets32_i32(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8>
@__pseudo_gather_base_offsets64_i8(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_base_offsets64_i16(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_base_offsets64_i32(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
; Similarly to the pseudo-gathers defined above, we also declare undefined
; pseudo-scatter instructions with signatures:
@@ -1657,16 +1717,6 @@ declare <WIDTH x double> @__pseudo_gather_base_offsets64_double(i8 *, <WIDTH x i
; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask)
; void __pseudo_scatter_double(varying double *, varying double values, mask)
;
; The GatherScatterFlattenOpt optimization pass also finds these and
; transforms them to scatters like:
;
; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base,
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; And the GSImprovementsPass in turn converts these to actual native
; scatters or masked stores.
declare void @__pseudo_scatter32_i8(<WIDTH x i32>, <WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter32_i16(<WIDTH x i32>, <WIDTH x i16>, <WIDTH x MASK>) nounwind
@@ -1682,31 +1732,96 @@ declare void @__pseudo_scatter64_float(<WIDTH x i64>, <WIDTH x float>, <WIDTH x
declare void @__pseudo_scatter64_i64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_double(<WIDTH x i64>, <WIDTH x double>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
; And the ImproveMemoryOps optimization pass also finds these and
; either transforms them to scatters like:
;
; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base,
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; Or, if the target has a native scatter instruction:
;
; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base,
; uniform int32 offset_scale, varying int{32,64} offsets,
; varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
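;
; Mirroring the gather sketch earlier, one lane of the factored scatter form
; looks roughly like this in C (illustrative names; masked-off lanes must not
; touch memory):
;
;   #include <stdint.h>
;   static inline void scatter_lane_factored(uint8_t *base, int32_t offset_scale,
;           int32_t varying_offset, int32_t offset_delta, int32_t value,
;           int lane_active) {
;       if (lane_active)
;           *(int32_t *)(base + (int64_t)offset_scale * varying_offset
;                        + offset_delta) = value;
;   }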
declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_float(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare float @__log_uniform_float(float) nounwind readnone
declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone
@@ -1834,143 +1949,246 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
call void @__usedouble(<WIDTH x double> %pg64_d)
%g32_8 = call <WIDTH x i8> @__gather32_i8(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %g32_8)
%g32_16 = call <WIDTH x i16> @__gather32_i16(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %g32_16)
%g32_32 = call <WIDTH x i32> @__gather32_i32(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %g32_32)
%g32_f = call <WIDTH x float> @__gather32_float(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %g32_f)
%g32_64 = call <WIDTH x i64> @__gather32_i64(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %g32_64)
%g32_d = call <WIDTH x double> @__gather32_double(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %g32_d)
%g64_8 = call <WIDTH x i8> @__gather64_i8(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %g64_8)
%g64_16 = call <WIDTH x i16> @__gather64_i16(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %g64_16)
%g64_32 = call <WIDTH x i32> @__gather64_i32(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %g64_32)
%g64_f = call <WIDTH x float> @__gather64_float(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %g64_f)
%g64_64 = call <WIDTH x i64> @__gather64_i64(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %g64_64)
%g64_d = call <WIDTH x double> @__gather64_double(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %g64_d)
ifelse(HAVE_GATHER, `1',
`
%nfpgbo32_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfpgbo32_8)
%nfpgbo32_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfpgbo32_16)
%nfpgbo32_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfpgbo32_32)
%nfpgbo32_f = call <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfpgbo32_f)
%nfpgbo32_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfpgbo32_64)
%nfpgbo32_d = call <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfpgbo32_d)
%nfpgbo64_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfpgbo64_8)
%nfpgbo64_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfpgbo64_16)
%nfpgbo64_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfpgbo64_32)
%nfpgbo64_f = call <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfpgbo64_f)
%nfpgbo64_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfpgbo64_64)
%nfpgbo64_d = call <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfpgbo64_d)
%nfgbo32_8 = call <WIDTH x i8>
@__gather_base_offsets32_i8(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfgbo32_8)
%nfgbo32_16 = call <WIDTH x i16>
@__gather_base_offsets32_i16(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfgbo32_16)
%nfgbo32_32 = call <WIDTH x i32>
@__gather_base_offsets32_i32(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfgbo32_32)
%nfgbo32_f = call <WIDTH x float>
@__gather_base_offsets32_float(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfgbo32_f)
%nfgbo32_64 = call <WIDTH x i64>
@__gather_base_offsets32_i64(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfgbo32_64)
%nfgbo32_d = call <WIDTH x double>
@__gather_base_offsets32_double(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfgbo32_d)
%nfgbo64_8 = call <WIDTH x i8>
@__gather_base_offsets64_i8(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfgbo64_8)
%nfgbo64_16 = call <WIDTH x i16>
@__gather_base_offsets64_i16(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfgbo64_16)
%nfgbo64_32 = call <WIDTH x i32>
@__gather_base_offsets64_i32(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfgbo64_32)
%nfgbo64_f = call <WIDTH x float>
@__gather_base_offsets64_float(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfgbo64_f)
%nfgbo64_64 = call <WIDTH x i64>
@__gather_base_offsets64_i64(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfgbo64_64)
%nfgbo64_d = call <WIDTH x double>
@__gather_base_offsets64_double(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfgbo64_d)
',
`
%pgbo32_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %pgbo32_8)
%pgbo32_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %pgbo32_16)
%pgbo32_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pgbo32_32)
%pgbo32_f = call <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pgbo32_f)
%pgbo32_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pgbo32_64)
%pgbo32_d = call <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pgbo32_d)
%gbo32_8 = call <WIDTH x i8>
@__gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %gbo32_8)
%gbo32_16 = call <WIDTH x i16>
@__gather_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %gbo32_16)
%gbo32_32 = call <WIDTH x i32>
@__gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo32_32)
%gbo32_f = call <WIDTH x float>
@__gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo32_f)
%gbo32_64 = call <WIDTH x i64>
@__gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo32_64)
%gbo32_d = call <WIDTH x double>
@__gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo32_d)
%pgbo64_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %pgbo64_8)
%pgbo64_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %pgbo64_16)
%pgbo64_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pgbo64_32)
%pgbo64_f = call <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pgbo64_f)
%pgbo64_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pgbo64_64)
%pgbo64_d = call <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pgbo64_d)
%gbo32_8 = call <WIDTH x i8>
@__gather_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %gbo32_8)
%gbo32_16 = call <WIDTH x i16>
@__gather_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %gbo32_16)
%gbo32_32 = call <WIDTH x i32>
@__gather_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo32_32)
%gbo32_f = call <WIDTH x float>
@__gather_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo32_f)
%gbo32_64 = call <WIDTH x i64>
@__gather_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo32_64)
%gbo32_d = call <WIDTH x double>
@__gather_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo32_d)
%gbo64_8 = call <WIDTH x i8>
@__gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %gbo64_8)
%gbo64_16 = call <WIDTH x i16>
@__gather_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %gbo64_16)
%gbo64_32 = call <WIDTH x i32>
@__gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo64_32)
%gbo64_f = call <WIDTH x float>
@__gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo64_f)
%gbo64_64 = call <WIDTH x i64>
@__gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo64_64)
%gbo64_d = call <WIDTH x double>
@__gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo64_d)
@__gather_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo64_d)
')
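;; (When HAVE_GATHER is set, the target supplies native gather support, so the
;; non-factored __pseudo_gather_base_offsets* and __gather_base_offsets*
;; functions are the ones referenced above; otherwise the factored variants
;; are kept live instead.)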
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatters
@@ -2003,61 +2221,118 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
call void @__scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter64_double(<WIDTH x i64> %v64, <WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
ifelse(HAVE_SCATTER, `1',
`
call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_double(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
',
`
call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
')
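;; (Same pattern as the gathers above: with HAVE_SCATTER set, the native,
;; non-factored scatter functions are the ones kept live; otherwise the
;; factored variants are.)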
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector ops
@@ -3196,8 +3471,42 @@ pl_done:
;;
;; $1: scalar type for which to generate functions to do gathers
define(`gen_gather_general', `
; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')
%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
}
; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')
%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
}
')
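;; As a sketch, the per_lane expansion above amounts to the following C-style
;; pseudocode (types illustrative):
;;
;;     for (int i = 0; i < WIDTH; ++i)
;;         if (vecmask[i])
;;             result[i] = *(T *)(uintptr_t)ptrs[i];
;;
;; Each active lane treats its integer element as a pointer and loads through
;; it; inactive lanes are left untouched.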
; vec width, type
define(`gen_gather', `
define(`gen_gather_factored', `
;; Define the utility function to do the gather operation for a single element
;; of the type
define <WIDTH x $1> @__gather_elt32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
@@ -3245,7 +3554,7 @@ define <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %o
}
define <WIDTH x $1> @__gather_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
@@ -3276,7 +3585,7 @@ define <WIDTH x $1> @__gather_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offset
ret <WIDTH x $1> %ret`'eval(WIDTH-1)
}
define <WIDTH x $1> @__gather_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
@@ -3307,37 +3616,42 @@ define <WIDTH x $1> @__gather_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offset
ret <WIDTH x $1> %ret`'eval(WIDTH-1)
}
; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')
gen_gather_general($1)
'
)
%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
; vec width, type
define(`gen_gather', `
gen_gather_factored($1)
define <WIDTH x $1>
@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i32> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%scale_vec = bitcast i32 %offset_scale to <1 x i32>
%smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i32> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %scaled_offsets, i32 1,
<WIDTH x i32> zeroinitializer, <WIDTH x i32> %vecmask)
ret <WIDTH x $1> %v
}
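;; This relies on the scalar identity base + scale*offset ==
;; base + 1*(scale*offset) + 0: pre-scaling the offsets lets the factored
;; implementation be reused with offset_scale = 1 and a zero offset_delta.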
; fully general 64-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')
%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
define <WIDTH x $1>
@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i64> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%scale64 = zext i32 %offset_scale to i64
%scale_vec = bitcast i64 %scale64 to <1 x i64>
%smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i64> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %scaled_offsets,
i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x i32> %vecmask)
ret <WIDTH x $1> %v
}
'
)
@@ -3391,7 +3705,7 @@ define void @__scatter_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_s
ret void
}
define void @__scatter_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
@@ -3401,7 +3715,7 @@ define void @__scatter_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32
ret void
}
define void @__scatter_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
@@ -3437,3 +3751,48 @@ define void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,
'
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rdrand
define(`rdrand_decls', `
declare i1 @__rdrand_i16(i16 * nocapture)
declare i1 @__rdrand_i32(i32 * nocapture)
declare i1 @__rdrand_i64(i64 * nocapture)
')
define(`rdrand_definition', `
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rdrand
declare {i16, i32} @llvm.x86.rdrand.16()
declare {i32, i32} @llvm.x86.rdrand.32()
declare {i64, i32} @llvm.x86.rdrand.64()
define i1 @__rdrand_i16(i16 * %ptr) {
%v = call {i16, i32} @llvm.x86.rdrand.16()
%v0 = extractvalue {i16, i32} %v, 0
%v1 = extractvalue {i16, i32} %v, 1
store i16 %v0, i16 * %ptr
%good = icmp ne i32 %v1, 0
ret i1 %good
}
define i1 @__rdrand_i32(i32 * %ptr) {
%v = call {i32, i32} @llvm.x86.rdrand.32()
%v0 = extractvalue {i32, i32} %v, 0
%v1 = extractvalue {i32, i32} %v, 1
store i32 %v0, i32 * %ptr
%good = icmp ne i32 %v1, 0
ret i1 %good
}
define i1 @__rdrand_i64(i64 * %ptr) {
%v = call {i64, i32} @llvm.x86.rdrand.64()
%v0 = extractvalue {i64, i32} %v, 0
%v1 = extractvalue {i64, i32} %v, 1
store i64 %v0, i64 * %ptr
%good = icmp ne i32 %v1, 0
ret i1 %good
}
')
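For comparison, the retry idiom these builtins support looks like this in
plain C++ using the corresponding compiler intrinsic (a sketch, not part of
this commit; it assumes an RDRAND-capable CPU and a compiler flag such as
gcc's -mrdrnd):

    #include <immintrin.h>

    // _rdrand32_step() returns 1 on success and 0 if the hardware could not
    // supply enough entropy, mirroring the bool returned by __rdrand_i32.
    static unsigned int random_u32() {
        unsigned int value;
        while (_rdrand32_step(&value) == 0)
            ;  // retry until the hardware delivers a random value
        return value;
    }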


@@ -140,6 +140,7 @@ Contents:
* `Basic Math Functions`_
* `Transcendental Functions`_
* `Pseudo-Random Numbers`_
* `Random Numbers`_
+ `Output Functions`_
+ `Assertions`_
@@ -2084,7 +2085,7 @@ can be declared:
soa<8> Point pts[...];
The in-memory layout of the ``Point``s has had the SOA transformation
The in-memory layout of the ``Point`` instances has had the SOA transformation
applied, such that there are 8 ``x`` values in memory followed by 8 ``y``
values, and so forth. Here is the effective declaration of ``soa<8>
Point``:
@@ -2266,7 +2267,7 @@ based on C++'s ``new`` and ``delete`` operators:
// use ptr...
delete[] ptr;
In the above code, each program instance allocates its own ``count`-sized
In the above code, each program instance allocates its own ``count`` sized
array of ``uniform int`` values, uses that memory, and then deallocates
that memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are
serviced by corresponding calls to the system C library's ``malloc()`` and
@@ -2277,9 +2278,7 @@ analogous to the corresponding rules are for pointers (as described in
`Pointer Types`_.) Specifically, if a specific rate qualifier isn't
provided with the ``new`` expression, then the default is that a "varying"
``new`` is performed, where each program instance performs a unique
allocation. The allocated type, in turn, is by default ``uniform`` for
``varying`` ``new`` expressions, and ``varying`` for ``uniform`` new
expressions.
allocation. The allocated type, in turn, is by default ``uniform``.
After a pointer has been deleted, it is illegal to access the memory it
points to. However, that deletion happens on a per-program-instance basis.
@@ -3457,6 +3456,40 @@ be used to get a pseudo-random ``float`` value.
uniform unsigned int32 random(RNGState * uniform state)
uniform float frandom(uniform RNGState * uniform state)
Random Numbers
--------------
Some recent CPUs (including those based on the Intel(r) Ivy Bridge
micro-architecture) provide hardware support for generating true random
numbers. A few standard library functions make this functionality available:
::
bool rdrand(uniform int32 * uniform ptr)
bool rdrand(varying int32 * uniform ptr)
bool rdrand(uniform int32 * varying ptr)
If the processor doesn't have sufficient entropy to generate a random
number, this function fails and returns ``false``. Otherwise, the random
value is stored at the given pointer and ``true`` is returned. This
function should therefore generally be called repeatedly until it
succeeds, as follows:
::
int r;
while (rdrand(&r) == false)
; // empty loop body
In addition to the ``int32`` variants of ``rdrand()`` listed above, there
are versions that return ``int16``, ``float``, and ``int64`` values as
well.
Note that when compiling to targets other than ``avx1.1`` and ``avx2``, the
``rdrand()`` functions always return ``false``.
Output Functions
----------------
@@ -3491,7 +3524,7 @@ generates the following output on a four-wide compilation target:
::
i = 10, x = [0.000000,1.000000,2.000000,3.000000]
added to x = [1.000000,2.000000,((2.000000)),((3.000000)]
added to x = [1.000000,2.000000,((2.000000)),((3.000000))]
last print of x = [1.000000,2.000000,2.000000,3.000000]
When a varying variable is printed, the values for program instances that
@@ -4010,8 +4043,8 @@ Systems Programming Support
Atomic Operations and Memory Fences
-----------------------------------
The standard range of atomic memory operations are provided by the standard
library``ispc``, including variants to handle both uniform and varying
The standard set of atomic memory operations are provided by the standard
library, including variants to handle both uniform and varying
types as well as "local" and "global" atomics.
Local atomics provide atomic behavior across the program instances in a


@@ -1307,15 +1307,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1362,14 +1360,13 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, \
__vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}
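Each instantiation of these macros binds a vector type, its scalar element
type, and an offset vector type to one named builtin, along the lines of the
instantiations elsewhere in this file, e.g.:

    SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)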


@@ -1375,15 +1375,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec32_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec32_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 32; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1430,14 +1428,12 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec32_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, __vec32_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 32; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}


@@ -1508,15 +1508,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec64_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec64_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 64; ++i) \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1540,7 +1538,7 @@ GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_doub
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \
VTYPE ret; \
for (int i = 0; i < 64; ++i) \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)ptrs.v[i]; \
ret.v[i] = *ptr; \
} \
@@ -1563,14 +1561,12 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec64_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, __vec64_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 64; ++i) \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}


@@ -803,6 +803,13 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
// int64
static FORCEINLINE __vec16_i64 __setzero_i64() {
__vec16_i64 ret;
ret.v_lo = _mm512_setzero_epi32();
ret.v_hi = _mm512_setzero_epi32();
return ret;
}
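// (The 16 x i64 vector is carried as two 512-bit vectors of 32-bit words, so
// zeroing both halves zeroes every 64-bit lane.)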
static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
{
__mmask16 carry = 0;
@@ -878,7 +885,7 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
return src[index+16] | (int64_t(src[index]) << 32);
}
static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) {
const int *i = (const int*)&l;
return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
}
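// (Assumes a little-endian host: i[0] is the low 32 bits of l and i[1] the
// high 32 bits, each broadcast across one of the two 512-bit halves.)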
@@ -1373,6 +1380,11 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext)
CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext)
CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext)
static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}
#define CAST_SEXT_I1(TYPE)
/*
static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \
@@ -1389,11 +1401,6 @@ CAST_SEXT_I1(__vec16_i8)
CAST_SEXT_I1(__vec16_i16)
CAST_SEXT_I1(__vec16_i32)
static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}
// zero extension
CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext)
@@ -1421,6 +1428,14 @@ CAST_ZEXT_I1(__vec16_i16)
CAST_ZEXT_I1(__vec16_i32)
CAST_ZEXT_I1(__vec16_i64)
static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val)
{
__vec16_i32 ret = _mm512_setzero_epi32();
__vec16_i32 one = _mm512_set1_epi32(1);
return _mm512_mask_mov_epi32(ret, val.m, one);
}
// truncations
CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc)
CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc)
@@ -1589,11 +1604,6 @@ CAST_BITS_SCALAR(double, int64_t)
///////////////////////////////////////////////////////////////////////////
// various math functions
/*
static FORCEINLINE void __fastmath() {
}
*/
static FORCEINLINE float __round_uniform_float(float v) {
return roundf(v);
}
@@ -1659,14 +1669,25 @@ static FORCEINLINE __vec16_f __min_varying_float(__vec16_f v1, __vec16_f v2) {
return _mm512_gmin_ps(v1, v2);
}
static FORCEINLINE __vec16_i32 __max_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
return _mm512_max_epi32(v1, v2);
}
static FORCEINLINE __vec16_i32 __min_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
return _mm512_min_epi32(v1, v2);
}
static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
return _mm512_max_epu32(v1, v2);
}
static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
return _mm512_min_epu32(v1, v2);
}
BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double)
BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double)
BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32)
BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32)
BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32)
BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32)
BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64)
@@ -1940,60 +1961,33 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
/*
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
}
*/
static FORCEINLINE __vec16_i32
__gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__vec16_i1 mask) {
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
__vec16_i32 tmp;
// Loop is generated by intrinsic
__gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
__vec16_i1 mask) {
__vec16_i32 tmp = _mm512_undefined_epi32();
__vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base,
_MM_UPCONV_EPI32_NONE, 1,
_MM_UPCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
return ret;
}
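// (Assumption worth noting: the intrinsic's scale operand encodes the usual
// 1/2/4/8 address scalings, so this relies on 'scale' being one of those
// values rather than an arbitrary multiplier.)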
static FORCEINLINE __vec16_f
__gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
__vec16_i1 mask) {
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
__vec16_f tmp;
// Loop is generated by intrinsic
__vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base,
_MM_UPCONV_PS_NONE, 1,
__vec16_f tmp = _mm512_undefined_ps();
__vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base,
_MM_UPCONV_PS_NONE, scale,
_MM_HINT_NONE);
return ret;
}
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/*
@@ -2039,47 +2033,43 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask)
*/
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
/*
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
*ptr = val.v[i]; \
} \
}
*/
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
static FORCEINLINE void
__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
__vec16_i32 val, __vec16_i1 mask)
{
__vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
_mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
_mm512_mask_i32extscatter_epi32(b, mask, offsets, val,
_MM_DOWNCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
}
static FORCEINLINE void
__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
uint32_t scale, const __vec16_i32 &constOffset,
const __vec16_f &val, const __vec16_i1 mask)
__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
__vec16_f val, __vec16_i1 mask)
{
__vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset);
_mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
_mm512_mask_i32extscatter_ps(base, mask, offsets, val,
_MM_DOWNCONV_PS_NONE, scale,
_MM_HINT_NONE);
}
/*
static FORCEINLINE void
__scatter_base_offsets64_float(void *base, const __vec16_i64 &varyingOffset,
uint32_t scale, const __vec16_i64 &constOffset,
const __vec16_f &val, const __vec16_i1 mask)
{
__vec16_i64 offsets = __add(__mul(varyingOffset,__vec16_i64(scale)), constOffset);
_mm512_mask_i64extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
}
*/
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/*
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \


@@ -2892,54 +2892,53 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val,
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale,
__vec4_i32 offsets, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __setzero_i32());
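// With inactive lanes forced to a zero offset, every lane loads from a
// valid address (the base pointer itself); the values fetched for
// inactive lanes are never used, so no per-lane branching is needed.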
constOffset = __select(mask, constOffset, __setzero_i32());
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
offset = scale * _mm_extract_epi32(offsets.v, 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
offset = scale * _mm_extract_epi32(offsets.v, 2);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
offset = scale * _mm_extract_epi32(offsets.v, 3);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
int offset = scale * _mm_extract_epi32(offsets.v, 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
int offset = scale * _mm_extract_epi32(offsets.v, 2);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
int offset = scale * _mm_extract_epi32(offsets.v, 3);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -2950,54 +2949,53 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale,
__vec4_i64 offsets, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __setzero_i64());
constOffset = __select(mask, constOffset, __setzero_i64());
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
offset = scale * _mm_extract_epi64(offsets.v[0], 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
offset = scale * _mm_extract_epi64(offsets.v[1], 0);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
offset = scale * _mm_extract_epi64(offsets.v[1], 1);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -3007,87 +3005,75 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
}
static FORCEINLINE __vec4_i8
__gather_base_offsets32_i8(unsigned char *b, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i8
__gather_base_offsets64_i8(unsigned char *b, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i16
__gather_base_offsets32_i16(unsigned char *b, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i16
__gather_base_offsets64_i16(unsigned char *b, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i32
__gather_base_offsets32_i32(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_i32
__gather_base_offsets64_i32(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_f
__gather_base_offsets32_float(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_f(), float(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_f
__gather_base_offsets64_float(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_f(), float(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_i64
__gather_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_i64
__gather_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_d
__gather_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_d(), double(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_d
__gather_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_d(), double(), p, scale, offsets, mask);
}
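The dummy leading arguments in these wrappers (e.g. __vec4_i8(), uint8_t()) are never read; they exist only so the compiler can deduce RetVec and RetScalar for the shared gather template. A compilable sketch of that idiom (Vec4f and lGather are hypothetical stand-ins, not the header's real types):

#include <cstdio>
#include <cstring>

struct Vec4f { float v[4]; };

// The unused value parameters exist solely to select the instantiation.
template <typename RetVec, typename RetScalar>
static RetVec lGather(RetVec, RetScalar, const unsigned char *p) {
    RetVec r = {};
    RetScalar s;
    std::memcpy(&s, p, sizeof(s));  // element type drives the load width
    r.v[0] = s;
    return r;
}

int main() {
    unsigned char buf[16] = { 0 };
    Vec4f g = lGather(Vec4f(), float(), buf);  // RetVec=Vec4f, RetScalar=float
    printf("%f\n", g.v[0]);
}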
template<typename RetVec, typename RetScalar>
@@ -3252,63 +3238,55 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {
#define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \
static FORCEINLINE void \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, uint32_t scale, \
__vec4_i32 offsets, \
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0)); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1)); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2)); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3)); \
*ptr = EXTRACT(val.v, 3); \
} \
} \
static FORCEINLINE void \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, uint32_t scale, \
__vec4_i64 offsets, \
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 3); \
} \
@@ -3322,91 +3300,79 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float)
static FORCEINLINE void
__scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i64 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
}

static FORCEINLINE void
__scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i64 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
}

static FORCEINLINE void
__scatter_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_d val, __vec4_i1 mask) {
__scatter_base_offsets32_i64(p, scale, offsets, val, mask);
}
static FORCEINLINE void
__scatter_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_d val, __vec4_i1 mask) {
__scatter_base_offsets64_i64(p, scale, offsets, val, mask);
}
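The double variants can forward directly to the i64 scatters because a scatter only moves bits: reinterpreting a double as a 64-bit integer and back is an exact round trip. A quick C++ check of that assumption (illustrative only):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    double d = 3.14159;
    uint64_t bits;
    std::memcpy(&bits, &d, sizeof(bits));    // what the i64 scatter stores
    double back;
    std::memcpy(&back, &bits, sizeof(back)); // what a later load sees
    assert(back == d);                       // bit-exact round trip
    return 0;
}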


@@ -94,20 +94,22 @@ lGetSystemISA() {
int info[4];
__cpuid(info, 1);
if ((info[2] & (1 << 28)) != 0) { // AVX
// AVX1 for sure....
// Ivy Bridge?
if ((info[2] & (1 << 29)) != 0 && // F16C
(info[2] & (1 << 30)) != 0) { // RDRAND
// So far, so good. AVX2?
// Call cpuid with eax=7, ecx=0
int info2[4];
__cpuidex(info2, 7, 0);
if ((info2[1] & (1 << 5)) != 0)
return "avx2";
else
return "avx1.1";
}
// Regular AVX
return "avx";
}
else if ((info[2] & (1 << 19)) != 0)
return "sse4";
@@ -212,6 +214,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
// This is the case for most of them
t->hasHalf = t->hasRand = t->hasTranscendentals = false;
t->hasGather = t->hasScatter = false;
if (!strcasecmp(isa, "sse2")) {
t->isa = Target::SSE2;
@@ -253,6 +256,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-8")) {
t->isa = Target::GENERIC;
@@ -262,6 +266,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-16")) {
t->isa = Target::GENERIC;
@@ -271,6 +276,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-32")) {
t->isa = Target::GENERIC;
@@ -280,6 +286,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-64")) {
t->isa = Target::GENERIC;
@@ -289,6 +296,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-1")) {
t->isa = Target::GENERIC;
@@ -320,8 +328,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
t->maskingIsFree = false;
t->maskBitCount = 32;
#if !defined(LLVM_3_0)
// LLVM 3.1+ only
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
#endif
#endif
}
else if (!strcasecmp(isa, "avx1.1-x2")) {
t->isa = Target::AVX11;
@@ -330,8 +344,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
t->maskingIsFree = false;
t->maskBitCount = 32;
#if !defined(LLVM_3_0)
// LLVM 3.1+ only
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
#endif
#endif
}
#ifndef LLVM_3_0
else if (!strcasecmp(isa, "avx2")) {
@@ -342,7 +362,11 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskingIsFree = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
t->hasGather = true;
#endif
}
else if (!strcasecmp(isa, "avx2-x2")) {
t->isa = Target::AVX2;
@@ -352,7 +376,11 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskingIsFree = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
t->hasGather = true;
#endif
}
#endif // !LLVM_3_0
else {

ispc.h

@@ -252,9 +252,15 @@ struct Target {
conversions. */
bool hasHalf;
/** Indicates whether there is an ISA random number instruction. */
bool hasRand;
/** Indicates whether the target has a native gather instruction */
bool hasGather;
/** Indicates whether the target has a native scatter instruction */
bool hasScatter;
/** Indicates whether the target has support for transcendentals (beyond
sqrt, which we assume that all of them handle). */
bool hasTranscendentals;

opt.cpp

File diff suppressed because it is too large

@@ -265,11 +265,9 @@ def run_test(filename):
gcc_arch = '-m64'
gcc_isa=""
if options.target == 'sse2' or options.target == 'sse2-x2':
gcc_isa = '-msse3'
if options.target == 'generic-4':
gcc_isa = '-msse4.2'
if options.target == 'generic-8':
gcc_isa = '-mavx'
if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \
and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1):


@@ -4068,3 +4068,188 @@ static inline void seed_rng(uniform RNGState * uniform state,
static inline void fastmath() {
__fastmath();
}
///////////////////////////////////////////////////////////////////////////
// rdrand
static inline uniform bool rdrand(float * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
uniform int32 irand;
uniform bool success = __rdrand_i32(&irand);
if (success) {
irand &= (1<<23)-1;
*ptr = floatbits(0x3F800000 | irand)-1.0f;
}
return success;
}
}
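The bit manipulation above is the standard trick for turning 23 random bits into a float in [0,1): 0x3F800000 is the encoding of 1.0f, so OR-ing in random mantissa bits yields a value uniformly spaced in [1,2), and subtracting 1.0 shifts it to [0,1). A standalone C++ illustration (assumes IEEE-754 binary32; helper name hypothetical):

#include <cstdint>
#include <cstdio>
#include <cstring>

static float bitsToUnitFloat(uint32_t irand) {
    irand &= (1u << 23) - 1;              // keep 23 mantissa bits
    uint32_t bits = 0x3F800000u | irand;  // exponent of 1.0f -> value in [1,2)
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f - 1.0f;                      // shift down to [0,1)
}

int main() {
    printf("%f %f\n", bitsToUnitFloat(0), bitsToUnitFloat(0x7FFFFF));
}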
static inline bool rdrand(varying float * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
bool success = false;
foreach_active (index) {
uniform int32 irand;
if (__rdrand_i32(&irand)) {
// FIXME: it probably would be preferable, here and in the
// following rdrand() function, to do the int->float stuff
// in vector form. However, we need to be careful to not
// clobber any existing already-set values in *ptr with
// inactive lanes here...
irand &= (1<<23)-1;
*ptr = floatbits(0x3F800000 | irand)-1.0f;
success = true;
}
}
return success;
}
}
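foreach_active runs its body once per executing program instance, with index bound to that lane; that is what lets the scalar (uniform) __rdrand_i32 result be written through the varying pointer one lane at a time without touching inactive lanes. A C++ analogue of that control flow, modeling the execution mask as a plain bitmask (a sketch only; foreach_active itself is an ispc language construct):

#include <cstdint>
#include <cstdio>

int main() {
    uint32_t execMask = 0xB;               // binary 1011: lanes 0, 1, 3 active
    for (int index = 0; index < 4; ++index)
        if (execMask & (1u << index))
            printf("body runs for lane %d\n", index);
}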
static inline bool rdrand(float * ptr) {
if (__have_native_rand == false)
return false;
else {
float * uniform ptrs[programCount];
ptrs[programIndex] = ptr;
bool success = false;
foreach_active (index) {
uniform int32 irand;
if (__rdrand_i32(&irand)) {
irand &= (1<<23)-1;
*ptrs[index] = floatbits(0x3F800000 | irand)-1.0f;
success = true;
}
}
return success;
}
}
static inline uniform bool rdrand(int16 * uniform ptr) {
if (__have_native_rand == false)
return false;
else
return __rdrand_i16(ptr);
}
static inline bool rdrand(varying int16 * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
bool success = false;
foreach_active (index) {
uniform int16 irand;
if (__rdrand_i16(&irand)) {
*ptr = irand;
success = true;
}
}
return success;
}
}
static inline bool rdrand(int16 * ptr) {
if (__have_native_rand == false)
return false;
else {
int16 * uniform ptrs[programCount];
ptrs[programIndex] = ptr;
bool success = false;
foreach_active (index) {
uniform int16 irand;
if (__rdrand_i16(&irand)) {
*ptrs[index] = irand;
success = true;
}
}
return success;
}
}
static inline uniform bool rdrand(int32 * uniform ptr) {
if (__have_native_rand == false)
return false;
else
return __rdrand_i32(ptr);
}
static inline bool rdrand(varying int32 * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
bool success = false;
foreach_active (index) {
uniform int32 irand;
if (__rdrand_i32(&irand)) {
*ptr = irand;
success = true;
}
}
return success;
}
}
static inline bool rdrand(int32 * ptr) {
if (__have_native_rand == false)
return false;
else {
int32 * uniform ptrs[programCount];
ptrs[programIndex] = ptr;
bool success = false;
foreach_active (index) {
uniform int32 irand;
if (__rdrand_i32(&irand)) {
*ptrs[index] = irand;
success = true;
}
}
return success;
}
}
static inline uniform bool rdrand(int64 * uniform ptr) {
if (__have_native_rand == false)
return false;
else
return __rdrand_i64(ptr);
}
static inline bool rdrand(varying int64 * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
bool success = false;
foreach_active (index) {
uniform int64 irand;
if (__rdrand_i64(&irand)) {
*ptr = irand;
success = true;
}
}
return success;
}
}
static inline bool rdrand(int64 * ptr) {
if (__have_native_rand == false)
return false;
else {
int64 * uniform ptrs[programCount];
ptrs[programIndex] = ptr;
bool success = false;
foreach_active (index) {
uniform int64 irand;
if (__rdrand_i64(&irand)) {
*ptrs[index] = irand;
success = true;
}
}
return success;
}
}
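All of these overloads bottom out in the __rdrand_i{16,32,64} built-ins, which expose the hardware instruction's success/failure protocol: RDRAND can transiently fail (signaled via the carry flag), so callers retry. The equivalent pattern in C++ via the standard compiler intrinsic (assumes RDRAND-capable hardware and GCC/Clang built with -mrdrnd, or MSVC):

#include <immintrin.h>
#include <cstdio>

int main() {
    unsigned int r;
    while (!_rdrand32_step(&r))  // returns 0 when no random value is ready
        ;                        // retry; failures are rare and transient
    printf("%u\n", r);
}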


@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}


@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}


@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}


@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}


@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-float-1.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-float-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-float-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-float-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-float-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-float-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-float-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-float-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -1,19 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int16-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int16-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int16-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int16-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int16-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int16-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int16-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int32-1.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int32-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int32-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int32-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int32-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int32-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int32-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int32-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int64-1.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int64-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int64-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int64-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int64-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int64-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int64-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int64-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -1,19 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int8-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int8-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int8-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int8-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int8-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int8-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int8-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/rdrand-1.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 1;
#else
uniform float r = -1;
uniform int count = 0;
while (!rdrand(&r)) {
++count;
}
RET[programIndex] = (r >= 0 && r < 1);
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 1;
}

tests/rdrand-2.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 1;
#else
float r = -1;
while (!rdrand(&r))
;
RET[programIndex] = (r >= 0 && r < 1);
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 1;
}

tests/rdrand-3.ispc

@@ -0,0 +1,25 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 1;
#else
int lessHalf = 0, moreHalf = 0;
for (uniform int i = 0; i < 1024*1024; ++i) {
float r = -1;
while (!rdrand(&r))
;
if (r < 0.5) ++lessHalf;
else ++moreHalf;
}
float r = (double)lessHalf / (double)(lessHalf + moreHalf);
RET[programIndex] = (r >= .49 && r < .51);
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 1;
}

tests/rdrand-4.ispc

@@ -0,0 +1,33 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 0;
#else
uniform int set[64] = { 0 };
uniform int count = 1024*1024;
for (uniform int i = 0; i < count; ++i) {
uniform int64 r;
while (!rdrand(&r))
;
for (uniform int b = 0; b < 64; ++b)
if (((unsigned int64)r >> b) & 1)
++set[b];
}
RET[programIndex] = 0;
for (uniform int b = 0; b < 64; ++b) {
float r = (double)set[b] / (double)(count);
if (!(r >= .49 && r < .51)) {
print("% % - %\n", b, r, set[b]);
++RET[programIndex];
}
}
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}
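The .49/.51 acceptance window in this test is very conservative: each bit is set with probability 0.5 over count = 2^20 draws, so the number of set bits has standard deviation sqrt(2^20)/2 = 512, while the window tolerates a deviation of 0.01 * 2^20, roughly 10486, i.e. about 20 standard deviations. A correctly functioning RDRAND essentially never trips this check, while a stuck or heavily biased bit always does.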

tests/rdrand-5.ispc

@@ -0,0 +1,33 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 0;
#else
int set[32] = { 0 };
uniform int count = 1024*1024;
for (uniform int i = 0; i < count; ++i) {
int32 r;
while (!rdrand(&r))
;
for (uniform int b = 0; b < 32; ++b)
if (((unsigned int32)r >> b) & 1)
++set[b];
}
RET[programIndex] = 0;
for (uniform int b = 0; b < 32; ++b) {
float r = (double)set[b] / (double)(count);
if (!(r >= .49 && r < .51)) {
print("% % - %\n", b, r, set[b]);
++RET[programIndex];
}
}
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

tests/rdrand-6.ispc

@@ -0,0 +1,35 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 0;
#else
int set[32] = { 0 };
uniform int count = 1024*1024;
for (uniform int i = 0; i < count; ++i) {
uniform int32 rr[programCount];
int * ptr = rr + programIndex;
while (!rdrand(ptr))
;
int32 r = rr[programIndex];
for (uniform int b = 0; b < 32; ++b)
if (((unsigned int32)r >> b) & 1)
++set[b];
}
RET[programIndex] = 0;
for (uniform int b = 0; b < 32; ++b) {
float r = (double)set[b] / (double)(count);
if (!(r >= .49 && r < .51)) {
print("% % - %\n", b, r, set[b]);
++RET[programIndex];
}
}
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}