Merge branch 'master' into nvptx_clean_master

2014-10-14 14:27:00 +02:00
parent 83a863ea83 29c0f75306
commit 9238c72e08
25 changed files with 1947 additions and 1176 deletions
--- a/alloy.py
+++ b/alloy.py
@@ -110,8 +110,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
    if  version_LLVM == "trunk":
        SVN_PATH="trunk"
    if  version_LLVM == "3.5":
-        # SVN_PATH=tags/RELEASE_35/rc1
-        SVN_PATH="branches/release_35"
+        SVN_PATH="tags/RELEASE_350/final"
        version_LLVM = "3_5"
    if  version_LLVM == "3.4":
        SVN_PATH="tags/RELEASE_34/dot2-final"
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -555,6 +555,10 @@ lSetInternalFunctions(llvm::Module *module) {
        "__prefetch_read_uniform_2",
        "__prefetch_read_uniform_3",
        "__prefetch_read_uniform_nt",
+        "__pseudo_prefetch_read_varying_1",
+        "__pseudo_prefetch_read_varying_2",
+        "__pseudo_prefetch_read_varying_3",
+        "__pseudo_prefetch_read_varying_nt",
        "__psubs_vi8",
        "__psubs_vi16",
        "__psubus_vi8",
@@ -780,7 +784,11 @@ void
 AddBitcodeToModule(const unsigned char *bitcode, int length,
                   llvm::Module *module, SymbolTable *symbolTable) {
    llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
    llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
+#else // LLVM 3.6+
+    llvm::MemoryBufferRef bcBuf = llvm::MemoryBuffer::getMemBuffer(sb)->getMemBufferRef();
+#endif

 #if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
    llvm::ErrorOr<llvm::Module *> ModuleOrErr = llvm::parseBitcodeFile(bcBuf, *g->ctx);
@@ -910,12 +918,23 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
        // have the DW_AT_artifical attribute.  It's not clear if this
        // matters for anything though.
        llvm::DIGlobalVariable var =
+#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
+            m->diBuilder->createGlobalVariable(file,
+                                               name,
+                                               name,
+                                               file,
+                                               0 /* line */,
+                                               diType,
+                                               true /* static */,
+                                               sym->storagePtr);
+#else
            m->diBuilder->createGlobalVariable(name,
                                               file,
                                               0 /* line */,
                                               diType,
                                               true /* static */,
                                               sym->storagePtr);
+#endif
        Assert(var.Verify());
    }
 }
@@ -970,12 +989,23 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
        llvm::DIType diType = sym->type->GetDIType(file);
        Assert(diType.Verify());
        llvm::DIGlobalVariable var =
+#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
+            m->diBuilder->createGlobalVariable(file,
+                                               sym->name.c_str(),
+                                               sym->name.c_str(),
+                                               file,
+                                               0 /* line */,
+                                               diType,
+                                               false /* static */,
+                                               sym->storagePtr);
+#else
            m->diBuilder->createGlobalVariable(sym->name.c_str(),
                                               file,
                                               0 /* line */,
                                               diType,
                                               false /* static */,
                                               sym->storagePtr);
+#endif    
        Assert(var.Verify());
    }
 }
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -370,6 +370,14 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
 declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind 
 declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind 

+declare void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int8/int16 builtins

--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1584,6 +1584,50 @@ define void @__prefetch_read_uniform_nt(i8 *) alwaysinline {
  call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1)
  ret void
 }
+
+define void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
+  call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 3, i32 1)
+  ')
+  ret void
+}
+
+declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+
+define void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
+  call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 2, i32 1)
+  ')
+  ret void
+}
+
+declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+
+define void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
+  call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 1, i32 1)
+  ')
+  ret void
+}
+
+declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+
+define void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
+  call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 0, i32 1)
+  ')
+  ret void
+}
+
+declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
 ')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2535,6 +2579,31 @@ declare void
@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, <WIDTH x i64>,
                                        <WIDTH x double>, <WIDTH x MASK>) nounwind

+
+declare void @__pseudo_prefetch_read_varying_1(<WIDTH x i64>, <WIDTH x MASK>) nounwind
+
+declare void
+@__pseudo_prefetch_read_varying_1_native(i8 *, i32, <WIDTH x i32>,
+                                         <WIDTH x MASK>) nounwind
+
+declare void @__pseudo_prefetch_read_varying_2(<WIDTH x i64>, <WIDTH x MASK>) nounwind
+
+declare void
+@__pseudo_prefetch_read_varying_2_native(i8 *, i32, <WIDTH x i32>,
+                                         <WIDTH x MASK>) nounwind
+
+declare void @__pseudo_prefetch_read_varying_3(<WIDTH x i64>, <WIDTH x MASK>) nounwind
+
+declare void
+@__pseudo_prefetch_read_varying_3_native(i8 *, i32, <WIDTH x i32>,
+                                         <WIDTH x MASK>) nounwind
+
+declare void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64>, <WIDTH x MASK>) nounwind
+
+declare void
+@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, <WIDTH x i32>,
+                                         <WIDTH x MASK>) nounwind
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 declare void @__use8(<WIDTH x i8>)
@@ -3034,6 +3103,41 @@ ifelse(HAVE_SCATTER, `1',
                                                    <WIDTH x double> %vd, <WIDTH x MASK> %mask)
 ')

+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; prefetches
+
+  call void @__pseudo_prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_prefetch_read_varying_1_native(i8 * %ptr, i32 0,
+                                                     <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__prefetch_read_varying_1_native(i8 * %ptr, i32 0,
+                                              <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_prefetch_read_varying_2_native(i8 * %ptr, i32 0,
+                                                     <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__prefetch_read_varying_2_native(i8 * %ptr, i32 0,
+                                              <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_prefetch_read_varying_3_native(i8 * %ptr, i32 0,
+                                                     <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__prefetch_read_varying_3_native(i8 * %ptr, i32 0,
+                                              <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
+                                                     <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
+                                              <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
  ret void
 }

--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -4945,9 +4945,19 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
    llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None;
 #endif

+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
    std::string error;
+#else // LLVM 3.6+
+    std::error_code error;
+#endif
+
    llvm::tool_output_file *of = new llvm::tool_output_file(fn, error, flags);
+
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
    if (error.size()) {
+#else // LLVM 3.6+
+    if (error) {
+#endif
        fprintf(stderr, "Error opening output file \"%s\".\n", fn);
        return false;
    }
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -745,6 +745,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
        // that have executed a 'break' statement:
        // breakLanes = breakLanes | mask
        AssertPos(currentPos, breakLanesPtr != NULL);
+
        llvm::Value *mask = GetInternalMask();
        llvm::Value *breakMask = LoadInst(breakLanesPtr,
                                          "break_mask");
@@ -927,6 +928,16 @@ FunctionEmitContext::RestoreContinuedLanes() {
 }


+void
+FunctionEmitContext::ClearBreakLanes() {
+  if (breakLanesPtr == NULL)
+    return;
+
+  // breakLanes = 0: store the all-off mask so no lane is recorded as having hit a break
+  StoreInst(LLVMMaskAllOff, breakLanesPtr);
+}
+
+
 void
 FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
    llvm::Value *oldMask = GetInternalMask();
@@ -1636,14 +1647,16 @@ FunctionEmitContext::StartScope() {
        llvm::DILexicalBlock lexicalBlock =
            m->diBuilder->createLexicalBlock(parentScope, diFile,
                                             currentPos.first_line,
-#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
+#if defined(LLVM_3_5)
        // Revision 202736 in LLVM adds support of DWARF discriminator
        // to the last argument and revision 202737 in clang adds 0
        // for the last argument by default.
                                             currentPos.first_column, 0);
 #else
+        // Revision 216239 in LLVM removes support of DWARF discriminator
+        // as the last argument
                                             currentPos.first_column);
-#endif
+#endif // LLVM 3.2, 3.3, 3.4 and 3.6+
        AssertPos(currentPos, lexicalBlock.Verify());
        debugScopes.push_back(lexicalBlock);
    }
@@ -1683,8 +1696,14 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
                                          diType,
                                          true /* preserve through opts */);
    AssertPos(currentPos, var.Verify());
+#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
+    llvm::DIExpression E = m->diBuilder->createExpression();
+    llvm::Instruction *declareInst =
+        m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock);
+#else
    llvm::Instruction *declareInst =
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
+#endif
    AddDebugPos(declareInst, &sym->pos, &scope);
 }

@@ -1710,8 +1729,14 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym, int argNum) {
                                          flags,
                                          argNum+1);
    AssertPos(currentPos, var.Verify());
+#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
+    llvm::DIExpression E =  m->diBuilder->createExpression();
+    llvm::Instruction *declareInst =
+        m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock);
+#else
    llvm::Instruction *declareInst =
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
+#endif
    AddDebugPos(declareInst, &sym->pos, &scope);
 }

--- a/ctx.h
+++ b/ctx.h
@@ -196,6 +196,13 @@ public:
        previous iteration. */
    void RestoreContinuedLanes();
    
+    /** This method is called by code emitting IR for a loop.  It clears the
+        record of lanes that executed a break, because the mask has already been
+        updated to take them into account.  This is necessary because the bail-out
+        checks for breaks are meant to deal only with lanes breaking on the current iteration.
+     */
+    void ClearBreakLanes();
+
    /** Indicates that code generation for a "switch" statement is about to
        start.  isUniform indicates whether the "switch" value is uniform,
        and bbAfterSwitch gives the basic block immediately following the
--- a/examples/common.props
+++ b/examples/common.props
@@ -160,8 +160,8 @@
  <ItemGroup>
    <CustomBuild Include='$(ISPC_file).ispc'>
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags)</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) $(flags)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) -g $(flags)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) -g $(flags)</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Target_out)</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Target_out)</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags)</Command>
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -1540,6 +1540,15 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
 static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
 }

+#define PREFETCH_READ_VARYING(CACHE_NUM)                                                                    \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale,         \
+                                                                   __vec16_i32 offsets, __vec16_i1 mask) {} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}           \
+
+PREFETCH_READ_VARYING(1)
+PREFETCH_READ_VARYING(2)
+PREFETCH_READ_VARYING(3)
+PREFETCH_READ_VARYING(nt)
 ///////////////////////////////////////////////////////////////////////////
 // atomics

--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -1624,6 +1624,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
 static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
 }

+#define PREFETCH_READ_VARYING(CACHE_NUM)                                                                    \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale,         \
+                                                                   __vec32_i32 offsets, __vec32_i1 mask) {} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec32_i64 addr, __vec32_i1 mask) {}           \
+
+PREFETCH_READ_VARYING(1)
+PREFETCH_READ_VARYING(2)
+PREFETCH_READ_VARYING(3)
+PREFETCH_READ_VARYING(nt)
+
 ///////////////////////////////////////////////////////////////////////////
 // atomics

--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -1757,6 +1757,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
 static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
 }

+#define PREFETCH_READ_VARYING(CACHE_NUM)                                                                    \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale,         \
+                                                                   __vec64_i32 offsets, __vec64_i1 mask) {} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec64_i64 addr, __vec64_i1 mask) {}           \
+
+PREFETCH_READ_VARYING(1)
+PREFETCH_READ_VARYING(2)
+PREFETCH_READ_VARYING(3)
+PREFETCH_READ_VARYING(nt)
+
 ///////////////////////////////////////////////////////////////////////////
 // atomics

--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -1,5 +1,5 @@
 /**
-  Copyright (c) 2010-2013, Intel Corporation
+  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,7 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

+#include <limits.h> // INT_MIN
 #include <stdint.h> 
 #include <math.h>
 #include <assert.h>
@@ -525,11 +526,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
  *p = v;
 }

-template <class RetVecType> RetVecType __smear_i1(int i);
-template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
+template <class RetVecType> static RetVecType __smear_i1(int i);
+template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }

-template <class RetVecType> RetVecType __setzero_i1();
-template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
+template <class RetVecType> static RetVecType __setzero_i1();
+template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }

 template <class RetVecType> __vec16_i1 __undef_i1();
 template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); }
@@ -677,8 +678,8 @@ static FORCEINLINE __vec16_i32 __select(      bool cond, __vec16_i32 a, __vec16_
 static FORCEINLINE int32_t __extract_element(__vec16_i32  v,  int32_t index)              { return v[index];    }
 static FORCEINLINE void    __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val;  }

-template <class RetVecType> RetVecType __smear_i32(int32_t i);
-template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
+template <class RetVecType> RetVecType static __smear_i32(int32_t i);
+template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }

 static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1);
 static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0);
@@ -686,11 +687,11 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
 static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
 static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

-template <class RetVecType> RetVecType __setzero_i32();
-template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
+template <class RetVecType> static RetVecType __setzero_i32();
+template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }

-template <class RetVecType> RetVecType __undef_i32();
-template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
+template <class RetVecType> static RetVecType __undef_i32();
+template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }

 static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); }

@@ -742,11 +743,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
 }

 #if 0 /* knc::fails  ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */
-template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) 
+template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) 
 {
  return _mm512_load_epi32(p);
 }
-template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) 
+template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) 
 {
  _mm512_store_epi32(p, v);
 }
@@ -1017,21 +1018,21 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
 }

 #if 0 /* knc::fails  as with _i32 this may generate fails ... so commetining it out */
-template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) 
+template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) 
 {
  __m512i v2 = _mm512_load_epi32(p);
  __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
  return __vec16_i64(v2,v1);
 }
-template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) {    return __load<64>(p); }
-template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) 
+template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) {    return __load<64>(p); }
+template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) 
 {
  __m512i v1 = v.v2;
  __m512i v2 = v.v1;
  _mm512_store_epi64(p, v2);
  _mm512_store_epi64(((uint8_t*)p)+64, v1);
 }
-template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {    __store<64>(p, v); }
+template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {    __store<64>(p, v); }
 #endif


@@ -1067,14 +1068,14 @@ static FORCEINLINE __vec16_f __select(      bool cond, __vec16_f a, __vec16_f b)
 static FORCEINLINE float __extract_element(__vec16_f  v, uint32_t index)            { return v[index];   }
 static FORCEINLINE void   __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; }

-template <class RetVecType> RetVecType __smear_float(float f);
-template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
+template <class RetVecType> static RetVecType __smear_float(float f);
+template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }

-template <class RetVecType> RetVecType __setzero_float();
-template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
+template <class RetVecType> static RetVecType __setzero_float();
+template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }

-template <class RetVecType> RetVecType __undef_float();
-template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
+template <class RetVecType> static RetVecType __undef_float();
+template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }

 static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index) 
 {
@@ -1131,12 +1132,12 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
 }

 #if 0 /* knc::fails  ./tests/gs-improve-progindex.ispc with segfault */
-template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) 
+template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) 
 {
    return _mm512_load_ps(p);
 }
 /* this one doesn't fail but it is  commented out for completeness, no aligned load/stores */
-template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) 
+template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) 
 {
  _mm512_store_ps(p, v);
 }
@@ -1309,14 +1310,14 @@ static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b)
 static FORCEINLINE double __extract_element(__vec16_d  v, uint32_t index)             { return v[index];   }
 static FORCEINLINE void    __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; }

-template <class RetVecType> RetVecType __smear_double(double d);
-template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
+template <class RetVecType> static RetVecType __smear_double(double d);
+template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }

-template <class RetVecType> RetVecType __setzero_double();
-template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
+template <class RetVecType> static RetVecType __setzero_double();
+template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }

-template <class RetVecType> RetVecType __undef_double();
-template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
+template <class RetVecType> static RetVecType __undef_double();
+template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }

 #define CASTD2F(_v_, _v_hi_, _v_lo_) \
  __vec16_f _v_hi_, _v_lo_;  \
@@ -1390,17 +1391,17 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)


 #if 0 /* knc::fails  as with _f this may generate fails ... so commetining it out */
-template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) 
+template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) 
 {
  return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
 }
-template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) 
+template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) 
 {
  _mm512_store_pd(p, v.v1);
  _mm512_store_pd(((uint8_t*)p)+64, v.v2);
 }
-template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p)        { return __load<64>(p); }
-template <> static FORCEINLINE      void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v);    }
+template <> FORCEINLINE __vec16_d __load <128>(const __vec16_d *p)        { return __load<64>(p); }
+template <> FORCEINLINE      void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v);    }
 #endif

 ///////////////////////////////////////////////////////////////////////////
@@ -2162,6 +2163,7 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t
 static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) 
 { 
  const __vec16_i64 offsets = _offsets.cvt2hilo();
+  const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
  __vec16_i1 still_to_do = mask;
  __vec16_i32 tmp;
  while (still_to_do) {
@@ -2172,8 +2174,8 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_
        _MM_CMPINT_EQ);

    void * base = (void*)((unsigned long)_base  +
-        ((scale*(unsigned long)hi32) << 32));    
-    tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base,
+        ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
+    tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base,
        _MM_UPCONV_EPI32_SINT8, scale,
        _MM_HINT_NONE);
    still_to_do = _mm512_kxor(match,still_to_do);
@@ -2197,6 +2199,7 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32
 static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets,  __vec16_i1 mask) 
 {
  const __vec16_i64 offsets = _offsets.cvt2hilo();
+  const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
  // There is no gather instruction with 64-bit offsets in KNC.
  // We have to manually iterate over the upper 32 bits ;-)
  __vec16_i1  still_to_do = mask;
@@ -2209,8 +2212,8 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3
        _MM_CMPINT_EQ);
         
    void * base = (void*)((unsigned long)_base  +
-        ((scale*(unsigned long)hi32) << 32));
-    ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base,
+        ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
+    ret = _mm512_mask_i32extgather_epi32(ret, match, signed_offsets, base,
        _MM_UPCONV_EPI32_NONE, scale,
        _MM_HINT_NONE);
    still_to_do = _mm512_kxor(match, still_to_do);
@@ -2230,6 +2233,7 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32
 static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets,  __vec16_i1 mask) 
 {
  const __vec16_i64 offsets = _offsets.cvt2hilo();
+  const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
  // There is no gather instruction with 64-bit offsets in KNC.
  // We have to manually iterate over the upper 32 bits ;-)
  __vec16_i1 still_to_do = mask;
@@ -2242,8 +2246,8 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3
        _MM_CMPINT_EQ);

    void * base = (void*)((unsigned long)_base  +
-        ((scale*(unsigned long)hi32) << 32));
-    ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base,
+        ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
+    ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base,
        _MM_UPCONV_PS_NONE, scale,
        _MM_HINT_NONE);
    still_to_do = _mm512_kxor(match, still_to_do);
@@ -2339,6 +2343,7 @@ static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale,
 static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) 
 {
  const __vec16_i64 offsets = _offsets.cvt2hilo();
+  const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
  
  __vec16_i1 still_to_do = mask;
  while (still_to_do) {
@@ -2349,8 +2354,8 @@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc
        _MM_CMPINT_EQ);

    void * base = (void*)((unsigned long)_base  +
-        ((scale*(unsigned long)hi32) << 32));    
-    _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, 
+        ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));  
+    _mm512_mask_i32extscatter_epi32(base, match, signed_offsets, 
        value,
        _MM_DOWNCONV_EPI32_NONE, scale,
        _MM_HINT_NONE);
@@ -2370,6 +2375,7 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal
 static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) 
 { 
  const __vec16_i64 offsets = _offsets.cvt2hilo();
+  const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
  
  __vec16_i1 still_to_do = mask;
  while (still_to_do) {
@@ -2380,8 +2386,9 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t
        _MM_CMPINT_EQ);

    void * base = (void*)((unsigned long)_base  +
-        ((scale*(unsigned long)hi32) << 32));    
-    _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, 
+        ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));   
+
+    _mm512_mask_i32extscatter_ps(base, match, signed_offsets, 
        value,
        _MM_DOWNCONV_PS_NONE, scale,
        _MM_HINT_NONE);
@@ -2543,6 +2550,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
    // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
 }

+#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) /* HINT is used for both prefetch passes */                 \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale,         \
+                                                                   __vec16_i32 offsets, __vec16_i1 mask) {  \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); /* lanes as passed in */          \
+    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
+    __vec16_i1 copy_mask = _mm512_kmov(mask);                                                               \
+    _mm512_kswapb(mask, copy_mask); /* NOTE(review): return value unused; confirm mask is updated */        \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); /* permuted offsets, same hint */ \
+}                                                                                                           \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}           \
+
+PREFETCH_READ_VARYING(1, _MM_HINT_T0)
+PREFETCH_READ_VARYING(2, _MM_HINT_T1)
+PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
+
+static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
+                                                         __vec16_i32 offsets, __vec16_i1 mask) {} // empty body: this variant issues no prefetch
+
+static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {} // empty body: this variant issues no prefetch
+
 ///////////////////////////////////////////////////////////////////////////
 // atomics
 ///////////////////////////////////////////////////////////////////////////
--- a/examples/intrinsics/knc-i1x8.h
+++ b/examples/intrinsics/knc-i1x8.h
@@ -2606,6 +2606,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
    // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
 }

+#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) /* HINT is used for both prefetch passes */                 \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale,         \
+                                                                   __vec16_i32 offsets, __vec16_i1 mask) {  \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); /* lanes as passed in */          \
+    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
+    __vec16_i1 copy_mask = _mm512_kmov(mask);                                                               \
+    _mm512_kswapb(mask, copy_mask); /* NOTE(review): return value unused; confirm mask is updated */        \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); /* permuted offsets, same hint */ \
+}                                                                                                           \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}           \
+
+PREFETCH_READ_VARYING(1, _MM_HINT_T0)
+PREFETCH_READ_VARYING(2, _MM_HINT_T1)
+PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
+
+static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
+                                                         __vec16_i32 offsets, __vec16_i1 mask) {} // empty body: this variant issues no prefetch
+
+static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {} // empty body: this variant issues no prefetch
+
 ///////////////////////////////////////////////////////////////////////////
 // atomics

--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -3898,6 +3898,15 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) {
    _mm_prefetch((char *)ptr, _MM_HINT_NTA);
 }

+#define PREFETCH_READ_VARYING(CACHE_NUM) /* emits two empty (no-op) functions per CACHE_NUM */              \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale,         \
+                                                                   __vec4_i32 offsets, __vec4_i1 mask) {}   \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec4_i64 addr, __vec4_i1 mask) {}             \
+
+PREFETCH_READ_VARYING(1)
+PREFETCH_READ_VARYING(2)
+PREFETCH_READ_VARYING(3)
+PREFETCH_READ_VARYING(nt)
 ///////////////////////////////////////////////////////////////////////////
 // atomics

--- a/fail_db.txt
+++ b/fail_db.txt
@@ -257,6 +257,32 @@
 ./tests/reduce-equal-5.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O0 *
 ./tests/reduce-equal-6.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O0 *
 ./tests/reduce-equal-8.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O0 *
+./tests/atomics-6.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/atomics-uniform-8.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/atomics-uniform-9.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/paddus_vi16.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/paddus_vi8.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmuls_i64.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmulus_i16.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmulus_i32.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmulus_i64.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmulus_i8.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/atomics-6.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O0 *
+./tests/atomics-uniform-8.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O0 *
+./tests/atomics-uniform-9.ispc compfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O0 *
+./tests/atomics-6.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/atomics-uniform-8.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/atomics-uniform-9.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/paddus_vi16.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/paddus_vi8.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmuls_i64.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmulus_i16.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmulus_i32.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmulus_i64.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/pmulus_i8.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O2 *
+./tests/atomics-6.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O0 *
+./tests/atomics-uniform-8.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O0 *
+./tests/atomics-uniform-9.ispc compfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O0 *
 .\tests\foreach-double-1.ispc runfail     x86     avx2-i32x8 Windows LLVM 3.5         cl -O2 *
 .\tests\foreach-double-1.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.5         cl -O2 *
 .\tests\foreach-double-1.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.5         cl -O2 *
@@ -267,7 +293,6 @@
 ./tests/ptr-22.ispc runfail  x86-64      generic-4   Linux LLVM 3.5 clang++3.4 -O0 *
 ./tests/ptr-22.ispc runfail  x86-64     generic-16   Linux LLVM 3.5 clang++3.4 -O0 *
 ./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64            knc   Linux LLVM 3.4   icpc13.1 -O2 *
-./tests/ptr-22.ispc runfail  x86-64            knc   Linux LLVM 3.4   icpc13.1 -O0 *
 ./tests/atomics-1.ispc compfail  x86-64            knc   Linux LLVM 3.4   icpc13.1 -O0 *
 ./tests/atomics-10.ispc compfail  x86-64            knc   Linux LLVM 3.4   icpc13.1 -O0 *
 ./tests/atomics-11.ispc compfail  x86-64            knc   Linux LLVM 3.4   icpc13.1 -O0 *
@@ -454,3 +479,35 @@
 ./tests/reduce-equal-5.ispc compfail  x86-64            knc   Linux LLVM 3.6   icpc13.1 -O0 *
 ./tests/reduce-equal-6.ispc compfail  x86-64            knc   Linux LLVM 3.6   icpc13.1 -O0 *
 ./tests/reduce-equal-8.ispc compfail  x86-64            knc   Linux LLVM 3.6   icpc13.1 -O0 *
+./tests/atomics-6.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/atomics-uniform-8.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/atomics-uniform-9.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/paddus_vi16.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/paddus_vi8.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmuls_i64.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmulus_i16.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmulus_i32.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmulus_i64.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmulus_i8.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/atomics-6.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O0 *
+./tests/atomics-uniform-8.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O0 *
+./tests/atomics-uniform-9.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O0 *
+./tests/atomics-6.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/atomics-uniform-8.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/atomics-uniform-9.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/paddus_vi16.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/paddus_vi8.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmuls_i64.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmulus_i16.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmulus_i32.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmulus_i64.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/pmulus_i8.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/atomics-6.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O0 *
+./tests/atomics-uniform-8.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O0 *
+./tests/atomics-uniform-9.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O0 *
+./tests/psubus_vi16.ispc compfail  x86-64            knc   Linux LLVM 3.6   icpc13.1 -O2 *
+./tests/psubus_vi8.ispc compfail  x86-64            knc   Linux LLVM 3.6   icpc13.1 -O2 *
+./tests/psubus_vi16.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/psubus_vi8.ispc compfail  x86-64      generic-4   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/psubus_vi16.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
+./tests/psubus_vi8.ispc compfail  x86-64     generic-16   Linux LLVM 3.6 clang++3.4 -O2 *
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -199,7 +199,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    m_hasTranscendentals(false),
    m_hasTrigonometry(false),
    m_hasRsqrtd(false),
-    m_hasRcpd(false)
+    m_hasRcpd(false),
+    m_hasVecPrefetch(false)
 {
    if (isa == NULL) {
        if (cpu != NULL) {
@@ -386,6 +387,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_hasTrigonometry = false;
        this->m_hasGather = this->m_hasScatter = true;
        this->m_hasRsqrtd = this->m_hasRcpd = true;
+        // Set to true because MIC has a hardware vector prefetch instruction.
+        this->m_hasVecPrefetch = true;
    }
    else if (!strcasecmp(isa, "generic-32") ||
             !strcasecmp(isa, "generic-x32")) {
--- a/ispc.h
+++ b/ispc.h
@@ -283,6 +283,8 @@ public:
    
    bool hasRcpd() const {return m_hasRcpd;}

+    bool hasVecPrefetch() const {return m_hasVecPrefetch;}
+
 private:

    /** llvm Target object representing this target. */
@@ -385,6 +387,9 @@ private:
    
    /** Indicates whether there is an ISA double precision rcp. */
    bool m_hasRcpd;
+
+    /** Indicates whether the target has a hardware instruction for vector prefetch. */
+    bool m_hasVecPrefetch;
 };


--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -403,6 +403,7 @@
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'">LLVMMCDisassembler.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'">LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
@@ -424,6 +425,8 @@
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'AND'$(LLVM_VERSION)'!='LLVM_3_5'">LLVMProfileData.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'">LLVMMCDisassembler.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'">LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
--- a/module.cpp
+++ b/module.cpp
@@ -604,12 +604,23 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
    if (diBuilder) {
        llvm::DIFile file = pos.GetDIFile();
        llvm::DIGlobalVariable var =
+#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
+            diBuilder->createGlobalVariable(file,
+                                            name,
+                                            name,
+                                            file,
+                                            pos.first_line,
+                                            sym->type->GetDIType(file),
+                                            (sym->storageClass == SC_STATIC),
+                                            sym->storagePtr);
+#else
            diBuilder->createGlobalVariable(name,
                                            file,
                                            pos.first_line,
                                            sym->type->GetDIType(file),
                                            (sym->storageClass == SC_STATIC),
                                            sym->storagePtr);
+#endif
        Assert(var.Verify());
    }
 }
@@ -1304,18 +1315,33 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,

 #endif

+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
    std::string error;
+#else // LLVM 3.6+
+    std::error_code error;
+#endif
+
    llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);
+
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
    if (error.size()) {
+#else // LLVM 3.6+
+    if (error) {
+#endif
+
        fprintf(stderr, "Error opening output file \"%s\".\n", outFileName);
        return false;
    }

    llvm::PassManager pm;
-#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
-    pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
-#else
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
    pm.add(new llvm::DataLayout(*g->target->getDataLayout()));
+#elif defined(LLVM_3_5)
+    pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
+#else // LLVM 3.6+
+    llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
+    dlp->doInitialization(*module);
+    pm.add(dlp);
 #endif

    llvm::formatted_raw_ostream fos(of->os());
--- a/opt.cpp
+++ b/opt.cpp
@@ -479,10 +479,14 @@ Optimize(llvm::Module *module, int optLevel) {
        new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
    optPM.add(targetLibraryInfo);

-#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
-    optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
-#else
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
    optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
+#elif defined(LLVM_3_5)
+    optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
+#else // LLVM 3.6+
+    llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
+    dlp->doInitialization(*module);
+    optPM.add(dlp);
 #endif

    llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
@@ -2117,8 +2121,8 @@ static bool
 lGSToGSBaseOffsets(llvm::CallInst *callInst) {
    struct GSInfo {
        GSInfo(const char *pgFuncName, const char *pgboFuncName,
-               const char *pgbo32FuncName, bool ig)
-            : isGather(ig) {
+               const char *pgbo32FuncName, bool ig, bool ip)
+            : isGather(ig), isPrefetch(ip) {
            func = m->module->getFunction(pgFuncName);
            baseOffsetsFunc = m->module->getFunction(pgboFuncName);
            baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
@@ -2126,6 +2130,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
        llvm::Function *func;
        llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
        const bool isGather;
+        const bool isPrefetch;
    };

    GSInfo gsFuncs[] = {
@@ -2134,148 +2139,176 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
               "__pseudo_gather_factored_base_offsets32_i8",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
               "__pseudo_gather_factored_base_offsets32_i8",
-               true),
+               true, false),
        GSInfo("__pseudo_gather32_i16",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
               "__pseudo_gather_factored_base_offsets32_i16",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
               "__pseudo_gather_factored_base_offsets32_i16",
-               true),
+               true, false),
        GSInfo("__pseudo_gather32_i32",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
               "__pseudo_gather_factored_base_offsets32_i32",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
               "__pseudo_gather_factored_base_offsets32_i32",
-               true),
+               true, false),
        GSInfo("__pseudo_gather32_float",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
               "__pseudo_gather_factored_base_offsets32_float",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
               "__pseudo_gather_factored_base_offsets32_float",
-               true),
+               true, false),
        GSInfo("__pseudo_gather32_i64",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
               "__pseudo_gather_factored_base_offsets32_i64",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
               "__pseudo_gather_factored_base_offsets32_i64",
-               true),
+               true, false),
        GSInfo("__pseudo_gather32_double",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
               "__pseudo_gather_factored_base_offsets32_double",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
               "__pseudo_gather_factored_base_offsets32_double",
-               true),
+               true, false),

        GSInfo("__pseudo_scatter32_i8",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
               "__pseudo_scatter_factored_base_offsets32_i8",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
               "__pseudo_scatter_factored_base_offsets32_i8",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter32_i16",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
               "__pseudo_scatter_factored_base_offsets32_i16",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
               "__pseudo_scatter_factored_base_offsets32_i16",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter32_i32",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
               "__pseudo_scatter_factored_base_offsets32_i32",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
               "__pseudo_scatter_factored_base_offsets32_i32",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter32_float",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
               "__pseudo_scatter_factored_base_offsets32_float",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
               "__pseudo_scatter_factored_base_offsets32_float",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter32_i64",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
               "__pseudo_scatter_factored_base_offsets32_i64",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
               "__pseudo_scatter_factored_base_offsets32_i64",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter32_double",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
               "__pseudo_scatter_factored_base_offsets32_double",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
               "__pseudo_scatter_factored_base_offsets32_double",
-               false),
+               false, false),

        GSInfo("__pseudo_gather64_i8",
               g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
               "__pseudo_gather_factored_base_offsets64_i8",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
               "__pseudo_gather_factored_base_offsets32_i8",
-               true),
+               true, false),
        GSInfo("__pseudo_gather64_i16",
               g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
               "__pseudo_gather_factored_base_offsets64_i16",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
               "__pseudo_gather_factored_base_offsets32_i16",
-               true),
+               true, false),
        GSInfo("__pseudo_gather64_i32",
               g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
               "__pseudo_gather_factored_base_offsets64_i32",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
               "__pseudo_gather_factored_base_offsets32_i32",
-               true),
+               true, false),
        GSInfo("__pseudo_gather64_float",
               g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
               "__pseudo_gather_factored_base_offsets64_float",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
               "__pseudo_gather_factored_base_offsets32_float",
-               true),
+               true, false),
        GSInfo("__pseudo_gather64_i64",
               g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
               "__pseudo_gather_factored_base_offsets64_i64",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
               "__pseudo_gather_factored_base_offsets32_i64",
-               true),
+               true, false),
        GSInfo("__pseudo_gather64_double",
               g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
               "__pseudo_gather_factored_base_offsets64_double",
               g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
               "__pseudo_gather_factored_base_offsets32_double",
-               true),
+               true, false),

        GSInfo("__pseudo_scatter64_i8",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
               "__pseudo_scatter_factored_base_offsets64_i8",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
               "__pseudo_scatter_factored_base_offsets32_i8",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter64_i16",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
               "__pseudo_scatter_factored_base_offsets64_i16",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
               "__pseudo_scatter_factored_base_offsets32_i16",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter64_i32",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
               "__pseudo_scatter_factored_base_offsets64_i32",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
               "__pseudo_scatter_factored_base_offsets32_i32",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter64_float",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
               "__pseudo_scatter_factored_base_offsets64_float",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
               "__pseudo_scatter_factored_base_offsets32_float",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter64_i64",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
               "__pseudo_scatter_factored_base_offsets64_i64",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
               "__pseudo_scatter_factored_base_offsets32_i64",
-               false),
+               false, false),
        GSInfo("__pseudo_scatter64_double",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
               "__pseudo_scatter_factored_base_offsets64_double",
               g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
               "__pseudo_scatter_factored_base_offsets32_double",
-               false),
+               false, false),
+        
+        GSInfo("__pseudo_prefetch_read_varying_1",
+               g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : 
+               "__prefetch_read_varying_1",
+               g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : 
+               "__prefetch_read_varying_1",
+               false, true),
+
+        GSInfo("__pseudo_prefetch_read_varying_2",
+               g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
+               "__prefetch_read_varying_2",
+               g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
+               "__prefetch_read_varying_2",
+               false, true),
+
+        GSInfo("__pseudo_prefetch_read_varying_3",
+               g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
+               "__prefetch_read_varying_3",
+               g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
+               "__prefetch_read_varying_3",
+               false, true),
+
+        GSInfo("__pseudo_prefetch_read_varying_nt",
+               g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
+               "__prefetch_read_varying_nt",
+               g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
+               "__prefetch_read_varying_nt",
+               false, true),
    };

    int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
@@ -2301,7 +2334,8 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
    llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector,
                                                 callInst);

-    if (basePtr == NULL || offsetVector == NULL)
+    if (basePtr == NULL || offsetVector == NULL || 
+        (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
        // It's actually a fully general gather/scatter with a varying
        // set of base pointers, so leave it as is and continune onward
        // to the next instruction...
@@ -2316,7 +2350,9 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
    llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;

    if ((info->isGather == true && g->target->hasGather()) ||
-        (info->isGather == false && g->target->hasScatter())) {
+        (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
+        (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
+
        // See if the offsets are scaled by 2, 4, or 8.  If so,
        // extract that scale factor and rewrite the offsets to remove
        // it.
@@ -2330,7 +2366,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
            gatherScatterFunc = info->baseOffsets32Func;
        }

-        if (info->isGather) {
+        if (info->isGather || info->isPrefetch) {
            llvm::Value *mask = callInst->getArgOperand(1);

            // Generate a new function call to the next pseudo gather
@@ -2387,7 +2423,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
            gatherScatterFunc = info->baseOffsets32Func;
        }

-        if (info->isGather) {
+        if (info->isGather || info->isPrefetch) {
            llvm::Value *mask = callInst->getArgOperand(1);

            // Generate a new function call to the next pseudo gather
@@ -2429,13 +2465,14 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
 static bool
 lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
    struct GSBOInfo {
-        GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig)
-            : isGather(ig) {
+        GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)  // looks up both named functions in m->module; ig/ip tag the entry
+            : isGather(ig), isPrefetch(ip) {  // ig -> isGather, ip -> isPrefetch
            baseOffsetsFunc = m->module->getFunction(pgboFuncName);
            baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
        }
        llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
        const bool isGather;
+        const bool isPrefetch;  // the prefetch rows of the gsFuncs table below are built with (isGather = false, isPrefetch = true)
    };

    GSBOInfo gsFuncs[] = {
@@ -2443,63 +2480,87 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
                                       "__pseudo_gather_factored_base_offsets32_i8",
                 g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
                                       "__pseudo_gather_factored_base_offsets32_i8",
-                 true),
+                 true, false),
        GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
                                       "__pseudo_gather_factored_base_offsets32_i16",
                 g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
                                       "__pseudo_gather_factored_base_offsets32_i16",
-                 true),
+                 true, false),
        GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
                                       "__pseudo_gather_factored_base_offsets32_i32",
                 g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
                                       "__pseudo_gather_factored_base_offsets32_i32",
-                 true),
+                 true, false),
        GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
                                       "__pseudo_gather_factored_base_offsets32_float",
                 g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
                                       "__pseudo_gather_factored_base_offsets32_float",
-                 true),
+                 true, false),
        GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
                                       "__pseudo_gather_factored_base_offsets32_i64",
                 g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
                                       "__pseudo_gather_factored_base_offsets32_i64",
-                 true),
+                 true, false),
        GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
                                       "__pseudo_gather_factored_base_offsets32_double",
                 g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
                                       "__pseudo_gather_factored_base_offsets32_double",
-                 true),
+                 true, false),

        GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
                                         "__pseudo_scatter_factored_base_offsets32_i8",
                  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
                                         "__pseudo_scatter_factored_base_offsets32_i8",
-                  false),
+                  false, false),
        GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
                                        "__pseudo_scatter_factored_base_offsets32_i16",
                 g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
                                        "__pseudo_scatter_factored_base_offsets32_i16",
-                 false),
+                 false, false),
        GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
                                        "__pseudo_scatter_factored_base_offsets32_i32",
                 g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
                                        "__pseudo_scatter_factored_base_offsets32_i32",
-                 false),
+                 false, false),
        GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
                                        "__pseudo_scatter_factored_base_offsets32_float",
                 g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
                                        "__pseudo_scatter_factored_base_offsets32_float",
-                 false),
+                 false, false),
        GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
                                        "__pseudo_scatter_factored_base_offsets32_i64",
                 g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
                                        "__pseudo_scatter_factored_base_offsets32_i64",
-                 false),
+                 false, false),
        GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
                                        "__pseudo_scatter_factored_base_offsets32_double",
                 g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
                                        "__pseudo_scatter_factored_base_offsets32_double",
-                 false),
+                 false, false),
+
+        GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
+                                            "__prefetch_read_varying_1",
+                 g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
+                                            "__prefetch_read_varying_1",
+                 false, true),
+
+        GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
+                                            "__prefetch_read_varying_2",
+                 g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
+                                            "__prefetch_read_varying_2",
+                 false, true),
+
+        GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
+                                            "__prefetch_read_varying_3",
+                 g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
+                                            "__prefetch_read_varying_3",
+                 false, true),
+
+        GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
+                                            "__prefetch_read_varying_nt",
+                 g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
+                                            "__prefetch_read_varying_nt",
+                 false, true),
    };

    int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
@@ -4290,149 +4351,170 @@ lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
 static bool
 lReplacePseudoGS(llvm::CallInst *callInst) {
    struct LowerGSInfo {
-        LowerGSInfo(const char *pName, const char *aName, bool ig)
-            : isGather(ig) {
+        LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip)  // pName/aName: pseudo and actual function names, both resolved in m->module
+            : isGather(ig), isPrefetch(ip) {  // ig -> isGather, ip -> isPrefetch
            pseudoFunc = m->module->getFunction(pName);
            actualFunc = m->module->getFunction(aName);
        }
        llvm::Function *pseudoFunc;
        llvm::Function *actualFunc;
        const bool isGather;
+        const bool isPrefetch;  // checked at the end of lReplacePseudoGS so prefetch entries skip the "Scatter required" performance warning
    };

    LowerGSInfo lgsInfo[] = {
-        LowerGSInfo("__pseudo_gather32_i8",  "__gather32_i8",  true),
-        LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true),
-        LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true),
-        LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true),
-        LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true),
-        LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true),
+        LowerGSInfo("__pseudo_gather32_i8",  "__gather32_i8",  true, false),
+        LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
+        LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
+        LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
+        LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
+        LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),

-        LowerGSInfo("__pseudo_gather64_i8",  "__gather64_i8",  true),
-        LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true),
-        LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true),
-        LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true),
-        LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true),
-        LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true),
+        LowerGSInfo("__pseudo_gather64_i8",  "__gather64_i8",  true, false),
+        LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
+        LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
+        LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
+        LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
+        LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),

        LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8",
-                    "__gather_factored_base_offsets32_i8",  true),
+                    "__gather_factored_base_offsets32_i8",  true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16",
-                    "__gather_factored_base_offsets32_i16", true),
+                    "__gather_factored_base_offsets32_i16", true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32",
-                    "__gather_factored_base_offsets32_i32", true),
+                    "__gather_factored_base_offsets32_i32", true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets32_float",
-                    "__gather_factored_base_offsets32_float", true),
+                    "__gather_factored_base_offsets32_float", true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64",
-                    "__gather_factored_base_offsets32_i64", true),
+                    "__gather_factored_base_offsets32_i64", true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets32_double",
-                    "__gather_factored_base_offsets32_double", true),
+                    "__gather_factored_base_offsets32_double", true, false),

        LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8",
-                    "__gather_factored_base_offsets64_i8",  true),
+                    "__gather_factored_base_offsets64_i8",  true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16",
-                    "__gather_factored_base_offsets64_i16", true),
+                    "__gather_factored_base_offsets64_i16", true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32",
-                    "__gather_factored_base_offsets64_i32", true),
+                    "__gather_factored_base_offsets64_i32", true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets64_float",
-                    "__gather_factored_base_offsets64_float", true),
+                    "__gather_factored_base_offsets64_float", true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64",
-                    "__gather_factored_base_offsets64_i64", true),
+                    "__gather_factored_base_offsets64_i64", true, false),
        LowerGSInfo("__pseudo_gather_factored_base_offsets64_double",
-                    "__gather_factored_base_offsets64_double", true),
+                    "__gather_factored_base_offsets64_double", true, false),

        LowerGSInfo("__pseudo_gather_base_offsets32_i8",
-                    "__gather_base_offsets32_i8",  true),
+                    "__gather_base_offsets32_i8",  true, false),
        LowerGSInfo("__pseudo_gather_base_offsets32_i16",
-                    "__gather_base_offsets32_i16", true),
+                    "__gather_base_offsets32_i16", true, false),
        LowerGSInfo("__pseudo_gather_base_offsets32_i32",
-                    "__gather_base_offsets32_i32", true),
+                    "__gather_base_offsets32_i32", true, false),
        LowerGSInfo("__pseudo_gather_base_offsets32_float",
-                    "__gather_base_offsets32_float", true),
+                    "__gather_base_offsets32_float", true, false),
        LowerGSInfo("__pseudo_gather_base_offsets32_i64",
-                    "__gather_base_offsets32_i64", true),
+                    "__gather_base_offsets32_i64", true, false),
        LowerGSInfo("__pseudo_gather_base_offsets32_double",
-                    "__gather_base_offsets32_double", true),
+                    "__gather_base_offsets32_double", true, false),

        LowerGSInfo("__pseudo_gather_base_offsets64_i8",
-                    "__gather_base_offsets64_i8",  true),
+                    "__gather_base_offsets64_i8",  true, false),
        LowerGSInfo("__pseudo_gather_base_offsets64_i16",
-                    "__gather_base_offsets64_i16", true),
+                    "__gather_base_offsets64_i16", true, false),
        LowerGSInfo("__pseudo_gather_base_offsets64_i32",
-                    "__gather_base_offsets64_i32", true),
+                    "__gather_base_offsets64_i32", true, false),
        LowerGSInfo("__pseudo_gather_base_offsets64_float",
-                    "__gather_base_offsets64_float", true),
+                    "__gather_base_offsets64_float", true, false),
        LowerGSInfo("__pseudo_gather_base_offsets64_i64",
-                    "__gather_base_offsets64_i64", true),
+                    "__gather_base_offsets64_i64", true, false),
        LowerGSInfo("__pseudo_gather_base_offsets64_double",
-                    "__gather_base_offsets64_double", true),
+                    "__gather_base_offsets64_double", true, false),

-        LowerGSInfo("__pseudo_scatter32_i8",  "__scatter32_i8",  false),
-        LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false),
-        LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false),
-        LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false),
-        LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false),
-        LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false),
+        LowerGSInfo("__pseudo_scatter32_i8",  "__scatter32_i8",  false, false),
+        LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
+        LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
+        LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
+        LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
+        LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),

-        LowerGSInfo("__pseudo_scatter64_i8",  "__scatter64_i8",  false),
-        LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false),
-        LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false),
-        LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false),
-        LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false),
-        LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false),
+        LowerGSInfo("__pseudo_scatter64_i8",  "__scatter64_i8",  false, false),
+        LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
+        LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
+        LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
+        LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
+        LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),

        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8",
-                    "__scatter_factored_base_offsets32_i8",  false),
+                    "__scatter_factored_base_offsets32_i8",  false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16",
-                    "__scatter_factored_base_offsets32_i16", false),
+                    "__scatter_factored_base_offsets32_i16", false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32",
-                    "__scatter_factored_base_offsets32_i32", false),
+                    "__scatter_factored_base_offsets32_i32", false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float",
-                    "__scatter_factored_base_offsets32_float", false),
+                    "__scatter_factored_base_offsets32_float", false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64",
-                    "__scatter_factored_base_offsets32_i64", false),
+                    "__scatter_factored_base_offsets32_i64", false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double",
-                    "__scatter_factored_base_offsets32_double", false),
+                    "__scatter_factored_base_offsets32_double", false, false),

        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8",
-                    "__scatter_factored_base_offsets64_i8",  false),
+                    "__scatter_factored_base_offsets64_i8",  false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16",
-                    "__scatter_factored_base_offsets64_i16", false),
+                    "__scatter_factored_base_offsets64_i16", false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32",
-                    "__scatter_factored_base_offsets64_i32", false),
+                    "__scatter_factored_base_offsets64_i32", false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float",
-                    "__scatter_factored_base_offsets64_float", false),
+                    "__scatter_factored_base_offsets64_float", false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64",
-                    "__scatter_factored_base_offsets64_i64", false),
+                    "__scatter_factored_base_offsets64_i64", false, false),
        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double",
-                    "__scatter_factored_base_offsets64_double", false),
+                    "__scatter_factored_base_offsets64_double", false, false),


        LowerGSInfo("__pseudo_scatter_base_offsets32_i8",
-                    "__scatter_base_offsets32_i8",  false),
+                    "__scatter_base_offsets32_i8",  false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets32_i16",
-                    "__scatter_base_offsets32_i16", false),
+                    "__scatter_base_offsets32_i16", false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets32_i32",
-                    "__scatter_base_offsets32_i32", false),
+                    "__scatter_base_offsets32_i32", false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets32_float",
-                    "__scatter_base_offsets32_float", false),
+                    "__scatter_base_offsets32_float", false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets32_i64",
-                    "__scatter_base_offsets32_i64", false),
+                    "__scatter_base_offsets32_i64", false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets32_double",
-                    "__scatter_base_offsets32_double", false),
+                    "__scatter_base_offsets32_double", false, false),

        LowerGSInfo("__pseudo_scatter_base_offsets64_i8",
-                    "__scatter_base_offsets64_i8",  false),
+                    "__scatter_base_offsets64_i8",  false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets64_i16",
-                    "__scatter_base_offsets64_i16", false),
+                    "__scatter_base_offsets64_i16", false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets64_i32",
-                    "__scatter_base_offsets64_i32", false),
+                    "__scatter_base_offsets64_i32", false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets64_float",
-                    "__scatter_base_offsets64_float", false),
+                    "__scatter_base_offsets64_float", false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets64_i64",
-                    "__scatter_base_offsets64_i64", false),
+                    "__scatter_base_offsets64_i64", false, false),
        LowerGSInfo("__pseudo_scatter_base_offsets64_double",
-                    "__scatter_base_offsets64_double", false),
+                    "__scatter_base_offsets64_double", false, false),
+
+        LowerGSInfo("__pseudo_prefetch_read_varying_1",
+                    "__prefetch_read_varying_1", false, true),
+        LowerGSInfo("__pseudo_prefetch_read_varying_1_native",
+                    "__prefetch_read_varying_1_native", false, true),
+
+        LowerGSInfo("__pseudo_prefetch_read_varying_2",
+                    "__prefetch_read_varying_2", false, true),
+        LowerGSInfo("__pseudo_prefetch_read_varying_2_native",
+                    "__prefetch_read_varying_2_native", false, true),
+
+        LowerGSInfo("__pseudo_prefetch_read_varying_3",
+                    "__prefetch_read_varying_3", false, true),
+        LowerGSInfo("__pseudo_prefetch_read_varying_3_native",
+                    "__prefetch_read_varying_3_native", false, true),
+
+        LowerGSInfo("__pseudo_prefetch_read_varying_nt",
+                    "__prefetch_read_varying_nt", false, true),
+        LowerGSInfo("__pseudo_prefetch_read_varying_nt_native",
+                    "__prefetch_read_varying_nt_native", false, true),
    };

    llvm::Function *calledFunc = callInst->getCalledFunction();
@@ -4459,7 +4541,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) {
    if (gotPosition && g->target->getVectorWidth() > 1) {
        if (info->isGather)
            PerformanceWarning(pos, "Gather required to load value.");
-        else
+        else if (!info->isPrefetch)
            PerformanceWarning(pos, "Scatter required to store value.");
    }
    return true;
@@ -4740,6 +4822,8 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
        "__scatter64_i8", "__scatter64_i16",
        "__scatter64_i32", "__scatter64_i64",
        "__scatter64_float", "__scatter64_double",
+        "__prefetch_read_varying_1", "__prefetch_read_varying_2",
+        "__prefetch_read_varying_3", "__prefetch_read_varying_nt",
        "__keep_funcs_live",
    };

--- a/run_tests.py
+++ b/run_tests.py
@@ -647,8 +647,8 @@ def run_tests(options1, args, print_version):
            options.include_file = "examples/intrinsics/generic-64.h"
            options.target = "generic-64"
        elif options.target == "knc":
-            error("No knc #include specified; using examples/intrinsics/knc-i1x16.h\n", 2)
-            options.include_file = "examples/intrinsics/knc-i1x16.h"
+            error("No knc #include specified; using examples/intrinsics/knc.h\n", 2)
+            options.include_file = "examples/intrinsics/knc.h"
 
    if options.compiler_exe == None:
        if (options.target == "knc"): 
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -847,43 +847,19 @@ static inline void prefetch_nt(const void * uniform ptr) {
 }

 static inline void prefetch_l1(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_l1(p);
-    }
+    __pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask);  // pseudo call carrying pointers + mask; the opt.cpp tables in this commit map it to __prefetch_read_varying_1 or, when hasVecPrefetch(), the _native variant
 }

 static inline void prefetch_l2(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_l2(p);
-    }
+    __pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);  // pseudo call carrying pointers + mask; the opt.cpp tables in this commit map it to __prefetch_read_varying_2 or, when hasVecPrefetch(), the _native variant
 }

 static inline void prefetch_l3(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_l3(p);
-    }
+    __pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);  // pseudo call carrying pointers + mask; the opt.cpp tables in this commit map it to __prefetch_read_varying_3 or, when hasVecPrefetch(), the _native variant
 }

 static inline void prefetch_nt(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_nt(p);
-    }
+    __pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);  // pseudo call carrying pointers + mask; the opt.cpp tables in this commit map it to __prefetch_read_varying_nt or, when hasVecPrefetch(), the _native variant
 }

 ///////////////////////////////////////////////////////////////////////////
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -712,7 +712,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
    }
 }

-
 /** Emit code for an if test that checks the mask and the test values and
    tries to be smart about jumping over code that doesn't need to be run.
 */
@@ -1101,8 +1100,10 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
    // the code for the test.  This is only necessary for varying loops;
    // 'uniform' loops just jump when they hit a continue statement and
    // don't mess with the mask.
-    if (!uniformTest)
+    if (!uniformTest) {
        ctx->RestoreContinuedLanes();
+        ctx->ClearBreakLanes();
+    }
    llvm::Value *testValue = testExpr->GetValue(ctx);
    if (!testValue)
        return;
@@ -1310,6 +1311,8 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
    // test code.
    ctx->SetCurrentBasicBlock(bstep);
    ctx->RestoreContinuedLanes();
+    ctx->ClearBreakLanes();
+
    if (step)
        step->EmitCode(ctx);
    ctx->BranchInst(btest);
--- a/tests/prefetch-varying.ispc
+++ b/tests/prefetch-varying.ispc
@@ -0,0 +1,22 @@
+
+export uniform int width() { return programCount; }  // exported helper: returns programCount
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {  // issues every prefetch flavour on a per-lane pointer, then reads through it
+ uniform int64 a[programCount];
+ for (uniform int i = 0; i < programCount; ++i)
+ a[i] = aFOO[i];  // copy the float inputs into the int64 array (converted on assignment)
+
+ int64 *ptr = &(a[programIndex+zero]);  // per-lane address; NOTE(review): "+zero" (a non-const global) presumably keeps the index from being constant-folded - confirm
+ prefetch_l1(ptr);  // ptr depends on programIndex, so these four calls use the varying-pointer overloads
+ prefetch_l2(ptr);
+ prefetch_l3(ptr);
+ prefetch_nt(ptr);
+ int g = *ptr;  // load through the same pointer that was just prefetched
+ RET[programIndex] = g;  // write the loaded value out for comparison with result()
+}
+
+export void result(uniform float RET[]) {  // writes the expected per-lane values for this test
+ RET[programIndex] = 1 + programIndex;  // NOTE(review): implies the harness fills aFOO[i] with i + 1 - confirm against run_tests.py
+}