diff --git a/alloy.py b/alloy.py index 080033c3..f5c8a7ca 100755 --- a/alloy.py +++ b/alloy.py @@ -110,8 +110,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.5": - # SVN_PATH=tags/RELEASE_35/rc1 - SVN_PATH="branches/release_35" + SVN_PATH="tags/RELEASE_350/final" version_LLVM = "3_5" if version_LLVM == "3.4": SVN_PATH="tags/RELEASE_34/dot2-final" diff --git a/builtins.cpp b/builtins.cpp index 35cf115c..2a1df0eb 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -555,6 +555,10 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_2", "__prefetch_read_uniform_3", "__prefetch_read_uniform_nt", + "__pseudo_prefetch_read_varying_1", + "__pseudo_prefetch_read_varying_2", + "__pseudo_prefetch_read_varying_3", + "__pseudo_prefetch_read_varying_nt", "__psubs_vi8", "__psubs_vi16", "__psubus_vi8", @@ -780,7 +784,11 @@ void AddBitcodeToModule(const unsigned char *bitcode, int length, llvm::Module *module, SymbolTable *symbolTable) { llvm::StringRef sb = llvm::StringRef((char *)bitcode, length); +#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5) llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb); +#else // LLVM 3.6+ + llvm::MemoryBufferRef bcBuf = llvm::MemoryBuffer::getMemBuffer(sb)->getMemBufferRef(); +#endif #if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+ llvm::ErrorOr ModuleOrErr = llvm::parseBitcodeFile(bcBuf, *g->ctx); @@ -910,12 +918,23 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module, // have the DW_AT_artifical attribute. It's not clear if this // matters for anything though. llvm::DIGlobalVariable var = +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + m->diBuilder->createGlobalVariable(file, + name, + name, + file, + 0 /* line */, + diType, + true /* static */, + sym->storagePtr); +#else m->diBuilder->createGlobalVariable(name, file, 0 /* line */, diType, true /* static */, sym->storagePtr); +#endif Assert(var.Verify()); } } @@ -970,12 +989,23 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) { llvm::DIType diType = sym->type->GetDIType(file); Assert(diType.Verify()); llvm::DIGlobalVariable var = +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + m->diBuilder->createGlobalVariable(file, + sym->name.c_str(), + sym->name.c_str(), + file, + 0 /* line */, + diType, + false /* static */, + sym->storagePtr); +#else m->diBuilder->createGlobalVariable(sym->name.c_str(), file, 0 /* line */, diType, false /* static */, sym->storagePtr); +#endif Assert(var.Verify()); } } diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index d50f4947..dd9bd428 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -370,6 +370,14 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +declare void @__prefetch_read_varying_1( %addr, %mask) nounwind +declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind +declare void @__prefetch_read_varying_2( %addr, %mask) nounwind +declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind +declare void @__prefetch_read_varying_3( %addr, %mask) 
nounwind +declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind +declare void @__prefetch_read_varying_nt( %addr, %mask) nounwind +declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int8/int16 builtins diff --git a/builtins/util.m4 b/builtins/util.m4 index 58f5a5cd..b265add8 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1584,6 +1584,50 @@ define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) ret void } + +define void @__prefetch_read_varying_1( %addr, %mask) alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %addr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8* + call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 3, i32 1) + ') + ret void +} + +declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind + +define void @__prefetch_read_varying_2( %addr, %mask) alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %addr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8* + call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 2, i32 1) + ') + ret void +} + +declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind + +define void @__prefetch_read_varying_3( %addr, %mask) alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %addr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8* + call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 1, i32 1) + ') + ret void +} + +declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind + +define void @__prefetch_read_varying_nt( %addr, %mask) alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %addr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8* + call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 0, i32 1) + ') + ret void +} + +declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2535,6 +2579,31 @@ declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , , ) nounwind + +declare void @__pseudo_prefetch_read_varying_1(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_1_native(i8 *, i32, , + ) nounwind + +declare void @__pseudo_prefetch_read_varying_2(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_2_native(i8 *, i32, , + ) nounwind + +declare void @__pseudo_prefetch_read_varying_3(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_3_native(i8 *, i32, , + ) nounwind + +declare void @__pseudo_prefetch_read_varying_nt(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, , + ) nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; declare void @__use8() @@ -3034,6 +3103,41 @@ ifelse(HAVE_SCATTER, `1', %vd, %mask) ') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; prefetchs + + call void @__pseudo_prefetch_read_varying_1( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_1_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_1_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_1( %v64, %mask) + + call void 
@__pseudo_prefetch_read_varying_2( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_2_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_2_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_2( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_3( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_3_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_3_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_3( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_nt( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_nt_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_nt_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_nt( %v64, %mask) + ret void } diff --git a/cbackend.cpp b/cbackend.cpp index 867385a4..5de99366 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -4945,9 +4945,19 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None; #endif +#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5) std::string error; +#else // LLVM 3.6+ + std::error_code error; +#endif + llvm::tool_output_file *of = new llvm::tool_output_file(fn, error, flags); + +#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5) if (error.size()) { +#else // LLVM 3.6+ + if (error) { +#endif fprintf(stderr, "Error opening output file \"%s\".\n", fn); return false; } diff --git a/ctx.cpp b/ctx.cpp index 04ec22f6..f09002c1 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -745,6 +745,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) { // that have executed a 'break' statement: // breakLanes = breakLanes | mask AssertPos(currentPos, breakLanesPtr != NULL); + llvm::Value *mask = GetInternalMask(); llvm::Value *breakMask = LoadInst(breakLanesPtr, "break_mask"); @@ -883,7 +884,7 @@ FunctionEmitContext::jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target) { finishedLanes = BinaryOperator(llvm::Instruction::Or, finishedLanes, continued, "returned|breaked|continued"); } - + finishedLanes = BinaryOperator(llvm::Instruction::And, finishedLanes, GetFunctionMask(), "finished&func"); @@ -927,6 +928,16 @@ FunctionEmitContext::RestoreContinuedLanes() { } +void +FunctionEmitContext::ClearBreakLanes() { + if (breakLanesPtr == NULL) + return; + + // breakLanes = 0 + StoreInst(LLVMMaskAllOff, breakLanesPtr); +} + + void FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) { llvm::Value *oldMask = GetInternalMask(); @@ -1636,14 +1647,16 @@ FunctionEmitContext::StartScope() { llvm::DILexicalBlock lexicalBlock = m->diBuilder->createLexicalBlock(parentScope, diFile, currentPos.first_line, -#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+ +#if defined(LLVM_3_5) // Revision 202736 in LLVM adds support of DWARF discriminator // to the last argument and revision 202737 in clang adds 0 // for the last argument by default. 
currentPos.first_column, 0); #else + // Revision 216239 in LLVM removes support of DWARF discriminator + // as the last argument currentPos.first_column); -#endif +#endif // LLVM 3.2, 3.3, 3.4 and 3.6+ AssertPos(currentPos, lexicalBlock.Verify()); debugScopes.push_back(lexicalBlock); } @@ -1683,8 +1696,14 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) { diType, true /* preserve through opts */); AssertPos(currentPos, var.Verify()); +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + llvm::DIExpression E = m->diBuilder->createExpression(); + llvm::Instruction *declareInst = + m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock); +#else llvm::Instruction *declareInst = m->diBuilder->insertDeclare(sym->storagePtr, var, bblock); +#endif AddDebugPos(declareInst, &sym->pos, &scope); } @@ -1710,8 +1729,14 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym, int argNum) { flags, argNum+1); AssertPos(currentPos, var.Verify()); +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + llvm::DIExpression E = m->diBuilder->createExpression(); + llvm::Instruction *declareInst = + m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock); +#else llvm::Instruction *declareInst = m->diBuilder->insertDeclare(sym->storagePtr, var, bblock); +#endif AddDebugPos(declareInst, &sym->pos, &scope); } diff --git a/ctx.h b/ctx.h index c4f9a6aa..cd4db7e8 100644 --- a/ctx.h +++ b/ctx.h @@ -195,6 +195,13 @@ public: 'continue' statement when going through the loop body in the previous iteration. */ void RestoreContinuedLanes(); + + /** This method is called by code emitting IR for a loop. It clears + any lanes that contained a break since the mask has been updated to take + them into account. This is necessary as all the bail out checks for + breaks are meant to only deal with lanes breaking on the current iteration. + */ + void ClearBreakLanes(); /** Indicates that code generation for a "switch" statement is about to start. 
isUniform indicates whether the "switch" value is uniform, diff --git a/examples/common.props b/examples/common.props index 5cfad4fc..08f40f65 100644 --- a/examples/common.props +++ b/examples/common.props @@ -160,8 +160,8 @@ Document - $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags) - $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) $(flags) + $(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) -g $(flags) + $(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) -g $(flags) $(Target_out) $(Target_out) $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags) diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index c8f2cf08..f44c581e 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1540,6 +1540,15 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) { static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) { } +#define PREFETCH_READ_VARYING(CACHE_NUM) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec16_i32 offsets, __vec16_i1 mask) {} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ + +PREFETCH_READ_VARYING(1) +PREFETCH_READ_VARYING(2) +PREFETCH_READ_VARYING(3) +PREFETCH_READ_VARYING(nt) /////////////////////////////////////////////////////////////////////////// // atomics diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index f5bb233c..26505615 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1624,6 +1624,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) { static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) { } +#define PREFETCH_READ_VARYING(CACHE_NUM) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec32_i32 offsets, __vec32_i1 mask) {} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec32_i64 addr, __vec32_i1 mask) {} \ + +PREFETCH_READ_VARYING(1) +PREFETCH_READ_VARYING(2) +PREFETCH_READ_VARYING(3) +PREFETCH_READ_VARYING(nt) + /////////////////////////////////////////////////////////////////////////// // atomics diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index a7148c8b..b5caa008 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1757,6 +1757,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) { static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) { } +#define PREFETCH_READ_VARYING(CACHE_NUM) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec64_i32 offsets, __vec64_i1 mask) {} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec64_i64 addr, __vec64_i1 mask) {} \ + +PREFETCH_READ_VARYING(1) +PREFETCH_READ_VARYING(2) +PREFETCH_READ_VARYING(3) +PREFETCH_READ_VARYING(nt) + /////////////////////////////////////////////////////////////////////////// // atomics diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 2e6afed5..b09958fa 100644 --- a/examples/intrinsics/knc-i1x16.h +++ 
b/examples/intrinsics/knc-i1x16.h @@ -1,5 +1,5 @@ /** - Copyright (c) 2010-2013, Intel Corporation + Copyright (c) 2010-2014, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,7 +31,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include +#include // INT_MIN +#include #include #include #include @@ -525,11 +526,11 @@ template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v *p = v; } -template RetVecType __smear_i1(int i); -template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } +template static RetVecType __smear_i1(int i); +template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } -template RetVecType __setzero_i1(); -template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } +template static RetVecType __setzero_i1(); +template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } template __vec16_i1 __undef_i1(); template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); } @@ -677,8 +678,8 @@ static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_ static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; } static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; } -template RetVecType __smear_i32(int32_t i); -template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } +template RetVecType static __smear_i32(int32_t i); +template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0); @@ -686,11 +687,11 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -template RetVecType __setzero_i32(); -template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } +template static RetVecType __setzero_i32(); +template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } -template RetVecType __undef_i32(); -template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } +template static RetVecType __undef_i32(); +template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); } @@ -742,11 +743,11 @@ template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 } #if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */ -template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) +template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { return _mm512_load_epi32(p); } -template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) +template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { _mm512_store_epi32(p, v); } @@ -1017,21 +1018,21 @@ template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 } #if 0 /* knc::fails as with _i32 this may generate fails 
... so commetining it out */ -template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { __m512i v2 = _mm512_load_epi32(p); __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); return __vec16_i64(v2,v1); } -template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } -template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) +template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } +template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) { __m512i v1 = v.v2; __m512i v2 = v.v1; _mm512_store_epi64(p, v2); _mm512_store_epi64(((uint8_t*)p)+64, v1); } -template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } #endif @@ -1067,14 +1068,14 @@ static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b) static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; } static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; } -template RetVecType __smear_float(float f); -template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } +template static RetVecType __smear_float(float f); +template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } -template RetVecType __setzero_float(); -template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } +template static RetVecType __setzero_float(); +template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } -template RetVecType __undef_float(); -template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } +template static RetVecType __undef_float(); +template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index) { @@ -1131,12 +1132,12 @@ template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) } #if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ -template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) +template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { return _mm512_load_ps(p); } /* this one doesn't fail but it is commented out for completeness, no aligned load/stores */ -template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) +template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { _mm512_store_ps(p, v); } @@ -1309,14 +1310,14 @@ static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; } static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; } -template RetVecType __smear_double(double d); -template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); } +template static RetVecType __smear_double(double d); +template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); } -template RetVecType __setzero_double(); -template <> static FORCEINLINE __vec16_d 
__setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); } +template static RetVecType __setzero_double(); +template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); } -template RetVecType __undef_double(); -template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } +template static RetVecType __undef_double(); +template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } #define CASTD2F(_v_, _v_hi_, _v_lo_) \ __vec16_f _v_hi_, _v_lo_; \ @@ -1390,17 +1391,17 @@ template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) #if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */ -template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) +template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); } -template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) +template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { _mm512_store_pd(p, v.v1); _mm512_store_pd(((uint8_t*)p)+64, v.v2); } -template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } -template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } +template <> FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } +template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } #endif /////////////////////////////////////////////////////////////////////////// @@ -2162,6 +2163,7 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); __vec16_i1 still_to_do = mask; __vec16_i32 tmp; while (still_to_do) { @@ -2172,8 +2174,8 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base, _MM_UPCONV_EPI32_SINT8, scale, _MM_HINT_NONE); still_to_do = _mm512_kxor(match,still_to_do); @@ -2197,6 +2199,7 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32 static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); // There is no gather instruction with 64-bit offsets in KNC. 
// We have to manually iterate over the upper 32 bits ;-) __vec16_i1 still_to_do = mask; @@ -2207,10 +2210,10 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3 __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, __smear_i32<__vec16_i32>((int32_t)hi32), _MM_CMPINT_EQ); - + void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base, + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + ret = _mm512_mask_i32extgather_epi32(ret, match, signed_offsets, base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); still_to_do = _mm512_kxor(match, still_to_do); @@ -2230,6 +2233,7 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32 static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); // There is no gather instruction with 64-bit offsets in KNC. // We have to manually iterate over the upper 32 bits ;-) __vec16_i1 still_to_do = mask; @@ -2242,8 +2246,8 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3 _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); still_to_do = _mm512_kxor(match, still_to_do); @@ -2339,7 +2343,8 @@ static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); - + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; while (still_to_do) { int first_active_lane = _mm_tzcnt_32((int)still_to_do); @@ -2349,8 +2354,8 @@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + _mm512_mask_i32extscatter_epi32(base, match, signed_offsets, value, _MM_DOWNCONV_EPI32_NONE, scale, _MM_HINT_NONE); @@ -2370,7 +2375,8 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); - + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; while (still_to_do) { int first_active_lane = _mm_tzcnt_32((int)still_to_do); @@ -2380,8 +2386,9 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, + 
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + + _mm512_mask_i32extscatter_ps(base, match, signed_offsets, value, _MM_DOWNCONV_PS_NONE, scale, _MM_HINT_NONE); @@ -2543,6 +2550,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint } +#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec16_i32 offsets, __vec16_i1 mask) { \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \ + offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\ + __vec16_i1 copy_mask = _mm512_kmov(mask); \ + _mm512_kswapb(mask, copy_mask); \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \ +} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ + +PREFETCH_READ_VARYING(1, _MM_HINT_T0) +PREFETCH_READ_VARYING(2, _MM_HINT_T1) +PREFETCH_READ_VARYING(nt, _MM_HINT_T2) + +static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale, + __vec16_i32 offsets, __vec16_i1 mask) {} + +static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {} + /////////////////////////////////////////////////////////////////////////// // atomics /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 478ad75a..5eb8ea05 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -2606,6 +2606,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint } +#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec16_i32 offsets, __vec16_i1 mask) { \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \ + offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\ + __vec16_i1 copy_mask = _mm512_kmov(mask); \ + _mm512_kswapb(mask, copy_mask); \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \ +} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ + +PREFETCH_READ_VARYING(1, _MM_HINT_T0) +PREFETCH_READ_VARYING(2, _MM_HINT_T1) +PREFETCH_READ_VARYING(nt, _MM_HINT_T2) + +static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale, + __vec16_i32 offsets, __vec16_i1 mask) {} + +static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {} + /////////////////////////////////////////////////////////////////////////// // atomics diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index e674f409..732330b9 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1,5 +1,5 @@ -/* - Copyright (c) 2012, Intel Corporation +/** + Copyright (c) 2010-2014, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,6 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include // INT_MIN #include #include #include @@ -43,6 +44,15 @@ #include // for operator<<(m512[i]) #include // for operator<<(m512[i]) +#if 0 + #define STRING(x) #x + #define TOSTRING(x) STRING(x) + #define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl + #define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl + #define PRINT2(x,y) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << std::endl + #define PRINT3(x,y,z) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << std::endl + #define PRINT4(x,y,z,w) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << std::endl +#endif #define FORCEINLINE __forceinline #ifdef _MSC_VER @@ -57,13 +67,13 @@ #define KNC 1 extern "C" { - int printf(const unsigned char *, ...); - int puts(unsigned char *); - unsigned int putchar(unsigned int); - int fflush(void *); - uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); - uint8_t *memset(uint8_t *, uint8_t, uint64_t); - void memset_pattern16(void *, const void *, uint64_t); + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); } typedef float __vec1_f; @@ -75,118 +85,155 @@ typedef int64_t __vec1_i64; struct __vec16_i32; +#if 0 +/* (iw) actually, this *SHOULD* be the right implementation for a +vec16_i1: this one is a class that can have a constructor (which +ISPC sometimes emits for these vectors...) 
This version might +not be working with embree's ISPC bindings, probably because +embree still uses the 'wrong' implementation */ +typedef struct PRE_ALIGN(2) __vec16_i1 +{ + FORCEINLINE operator __mmask16() const { return v; } + FORCEINLINE __vec16_i1() { } + FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { } + FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + __mmask16 v; +} POST_ALIGN(2) __vec16_i1; + +#else typedef __mmask16 POST_ALIGN(2) __vec16_i1; +#endif typedef struct PRE_ALIGN(64) __vec16_f { - FORCEINLINE operator __m512() const { return v; } - FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } - FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} - FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} - FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } - FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, - float v04, float v05, float v06, float v07, - float v08, float v09, float v10, float v11, - float v12, float v13, float v14, float v15) { - v = _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); - } - __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) { + v = _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); + } + __m512 v; } POST_ALIGN(64) __vec16_f; typedef struct PRE_ALIGN(64) __vec16_d { - FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} - FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} - FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} - FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } - FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, - double v04, double v05, double v06, double v07, - double v08, double v09, double v10, double v11, - double v12, double v13, double v14, double v15) { - v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); - v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); - } - __m512d v1; - __m512d v2; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double v10, double v11, + double v12, double v13, 
double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + __m512d v1; + __m512d v2; } POST_ALIGN(64) __vec16_d; typedef struct PRE_ALIGN(64) __vec16_i32 { - FORCEINLINE operator __m512i() const { return v; } - FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {} - FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {} - FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {} - FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {} - FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; } - FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, - int32_t v04, int32_t v05, int32_t v06, int32_t v07, - int32_t v08, int32_t v09, int32_t v10, int32_t v11, - int32_t v12, int32_t v13, int32_t v14, int32_t v15) { - v = _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); - } - __m512i v; + FORCEINLINE operator __m512i() const { return v; } + FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {} + FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {} + FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {} + FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; } + FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07, + int32_t v08, int32_t v09, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) { + v = _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); + } + __m512i v; } POST_ALIGN(64) __vec16_i32; typedef struct PRE_ALIGN(64) __vec16_i64 { - FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} - FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} - FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} - FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return *this; } - FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, - int64_t v04, int64_t v05, int64_t v06, int64_t v07, - int64_t v08, int64_t v09, int64_t v10, int64_t v11, - int64_t v12, int64_t v13, int64_t v14, int64_t v15) { - __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); - __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - } - __m512i v_hi; - __m512i v_lo; + FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} + FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return *this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + 
int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + } + __m512i v_hi; + __m512i v_lo; } POST_ALIGN(64) __vec16_i64; template struct vec16 { - FORCEINLINE vec16() { } - FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { - v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3; - v[4] = v4; v[5] = v5; v[6] = v6; v[7] = v7; - v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11; - v[12] = v12; v[13] = v13; v[14] = v14; v[15] = v15; - } - T v[16]; + FORCEINLINE vec16() { } + FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3; + v[4] = v4; v[5] = v5; v[6] = v6; v[7] = v7; + v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11; + v[12] = v12; v[13] = v13; v[14] = v14; v[15] = v15; + } + T v[16]; }; PRE_ALIGN(16) struct __vec16_i8 : public vec16 { - FORCEINLINE __vec16_i8() { } - FORCEINLINE __vec16_i8(const __vec16_i8 &o); - FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o); - FORCEINLINE __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, - int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, - int8_t v12, int8_t v13, int8_t v14, int8_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } + FORCEINLINE __vec16_i8() { } + FORCEINLINE __vec16_i8(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3, + const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7, + const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11, + const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + FORCEINLINE __vec16_i8(const __vec16_i8 &o); + FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o); } POST_ALIGN(16); PRE_ALIGN(32) struct __vec16_i16 : public vec16 { - FORCEINLINE __vec16_i16() { } - FORCEINLINE __vec16_i16(const __vec16_i16 &o); - FORCEINLINE __vec16_i16& operator =(const __vec16_i16 &o); - FORCEINLINE __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, - int16_t v4, int16_t v5, int16_t v6, int16_t v7, - int16_t v8, int16_t v9, int16_t v10, int16_t v11, - int16_t v12, int16_t v13, int16_t v14, int16_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } + FORCEINLINE __vec16_i16() { } + FORCEINLINE __vec16_i16(const __vec16_i16 &o); + FORCEINLINE __vec16_i16& operator =(const __vec16_i16 &o); + FORCEINLINE __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15) + : vec16(v0, 
v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } } POST_ALIGN(32); @@ -199,8 +246,8 @@ inline std::ostream &operator<<(std::ostream &out, const __m512i &v) out << "["; for (int i=0;i<16;i++) out << (i?",":"") << std::dec << std::setw(8) << ((int*)&v)[i] << std::dec; - // out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec; - + // out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec; + out << "]" << std::flush; return out; } @@ -210,11 +257,33 @@ inline std::ostream &operator<<(std::ostream &out, const __m512 &v) out << "["; for (int i=0;i<16;i++) out << (i?",":"") << ((float*)&v)[i]; - + out << "]" << std::flush; return out; } +inline std::ostream &operator<<(std::ostream &out, const __vec16_i8 &v) +{ + out << "["; + for (int i=0;i<16;i++) + out << (i?",":"") << std::dec << std::setw(8) << (int)((unsigned char*)&v)[i] << std::dec; + // out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec; + + out << "]" << std::flush; + return out; +} + +inline std::ostream &operator<<(std::ostream &out, const __vec16_i64 &v) +{ + out << "["; + uint32_t *ptr = (uint32_t*)&v; + for (int i=0;i<16;i++) { + uint64_t val = (uint64_t(ptr[i])<<32)+ptr[i+16]; + out << (i?",":"") << ((int*)val); + } + out << "]" << std::flush; + return out; +} /////////////////////////////////////////////////////////////////////////// // macros... @@ -237,103 +306,117 @@ FORCEINLINE __vec16_i8& __vec16_i8::operator=(const __vec16_i8 &o) /////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { - return _mm512_kmov(mask); + return _mm512_kmov(mask); } static FORCEINLINE bool __any(__vec16_i1 mask) { - return !_mm512_kortestz(mask, mask); + return !_mm512_kortestz(mask, mask); } static FORCEINLINE bool __all(__vec16_i1 mask) { - return _mm512_kortestc(mask, mask); + return _mm512_kortestc(mask, mask); } static FORCEINLINE bool __none(__vec16_i1 mask) { - return _mm512_kortestz(mask, mask); + return _mm512_kortestz(mask, mask); } static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_knot( _mm512_kandn(a, b)); + return _mm512_knot( _mm512_kandn(a, b)); } static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kand(a, b); + return _mm512_kand(a, b); } static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { - return _mm512_knot(a); + return _mm512_knot(a); } static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandn(a, b); + return _mm512_kandn(a, b); } static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandnr(a, b); + return _mm512_kandnr(a, b); } static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxor(a, b); + return _mm512_kxor(a, b); } static FORCEINLINE __vec16_i1 __xnor(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxnor(a, b); + return _mm512_kxnor(a, b); } static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kor(a, b); + return _mm512_kor(a, b); } static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, - __vec16_i1 b) { - return ((a & mask) | (b & ~mask)); - //return __or(__and(a, mask), __andnr(b, mask)); + __vec16_i1 b) { + return ((a & mask) | (b & ~mask)); + //return __or(__and(a, mask), __andnr(b, mask)); } static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { - return cond ? a : b; + return cond ? 
a : b; } static FORCEINLINE bool __extract_element(__vec16_i1 mask, uint32_t index) { - return (mask & (1 << index)) ? true : false; + return (mask & (1 << index)) ? true : false; } + +static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index) +{ + //uint *src = (uint *)&v; + const uint *src = (const uint *)&v; + return src[index+16] | (uint64_t(src[index]) << 32); +} + + + + + + + /* -static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, - bool val) { - if (val == false) - vec->v &= ~(1 << index); - else - vec->v |= (1 << index); -} -*/ + static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); + } + */ template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) { - const uint16_t *ptr = (const uint16_t *)p; - __vec16_i1 r; - r = *ptr; - return r; + const uint16_t *ptr = (const uint16_t *)p; + __vec16_i1 r; + r = *ptr; + return r; } template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) { - uint16_t *ptr = (uint16_t *)p; - *ptr = v; + uint16_t *ptr = (uint16_t *)p; + *ptr = v; } -template RetVecType __smear_i1(int i); -template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { - return i?0xFFFF:0x0; +template static RetVecType __smear_i1(int i); +template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { + return i?0xFFFF:0x0; } -template RetVecType __setzero_i1(); -template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { - return 0; +template static RetVecType __setzero_i1(); +template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { + return 0; } -template RetVecType __undef_i1(); -template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { - return __vec16_i1(); +template static RetVecType __undef_i1(); +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { + return __vec16_i1(); } /////////////////////////////////////////////////////////////////////////// @@ -342,7 +425,7 @@ template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { /* -TODO + TODO */ @@ -353,7 +436,7 @@ TODO /* -TODO + TODO */ @@ -362,179 +445,179 @@ TODO /////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { - return _mm512_add_epi32(a, b); + return _mm512_add_epi32(a, b); } static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sub_epi32(a, b); + return _mm512_sub_epi32(a, b); } static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) { - return _mm512_mullo_epi32(a, b); + return _mm512_mullo_epi32(a, b); } static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epu32(a, b); + return _mm512_div_epu32(a, b); } static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epi32(a, b); + return _mm512_div_epi32(a, b); } static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epu32(a, b); + return _mm512_rem_epu32(a, b); } static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epi32(a, b); + return _mm512_rem_epi32(a, b); } static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) { - return _mm512_or_epi32(a, b); + return _mm512_or_epi32(a, b); } static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) { - return _mm512_and_epi32(a, b); + return _mm512_and_epi32(a, b); } static FORCEINLINE __vec16_i32 
__xor(__vec16_i32 a, __vec16_i32 b) { - return _mm512_xor_epi32(a, b); + return _mm512_xor_epi32(a, b); } static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sllv_epi32(a, b); + return _mm512_sllv_epi32(a, b); } static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srlv_epi32(a, b); + return _mm512_srlv_epi32(a, b); } static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srav_epi32(a, b); + return _mm512_srav_epi32(a, b); } static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) { - return _mm512_slli_epi32(a, n); + return _mm512_slli_epi32(a, n); } static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { - return _mm512_srli_epi32(a, n); + return _mm512_srli_epi32(a, n); } static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { - return _mm512_srai_epi32(a, n); + return _mm512_srai_epi32(a, n); } static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { - return _mm512_cmpeq_epi32_mask(a, b); + return _mm512_cmpeq_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpneq_epi32_mask(a, b); + return _mm512_cmpneq_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epu32_mask(a, b); + return _mm512_cmple_epu32_mask(a, b); } static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epu32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epi32_mask(a, b); + return _mm512_cmple_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epu32_mask(a, b); + return _mm512_cmpge_epu32_mask(a, b); } static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epu32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epi32_mask(a, b); + return _mm512_cmpge_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epu32_mask(a, b); + return _mm512_cmplt_epu32_mask(a, b); } static FORCEINLINE __vec16_i1 
__unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epu32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epi32_mask(a, b); + return _mm512_cmplt_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epu32_mask(a, b); + return _mm512_cmpgt_epu32_mask(a, b); } static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epu32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpgt_epu32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epi32_mask(a, b); + return _mm512_cmpgt_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpgt_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, - __vec16_i32 a, __vec16_i32 b) { - return _mm512_mask_mov_epi32(b.v, mask, a.v); + __vec16_i32 a, __vec16_i32 b) { + return _mm512_mask_mov_epi32(b.v, mask, a.v); } static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) { - return cond ? a : b; + return cond ? a : b; } static FORCEINLINE int32_t __extract_element(__vec16_i32 v, uint32_t index) { - return ((int32_t *)&v)[index]; + return ((int32_t *)&v)[index]; } static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t val) { - ((int32_t *)v)[index] = val; + ((int32_t *)v)[index] = val; } -template RetVecType __smear_i32(int32_t i); -template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { - return _mm512_set1_epi32(i); +template static RetVecType __smear_i32(int32_t i); +template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { + return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); @@ -542,462 +625,560 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -template RetVecType __setzero_i32(); -template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { - return _mm512_setzero_epi32(); +template static RetVecType __setzero_i32(); +template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { + return _mm512_setzero_epi32(); } -template RetVecType __undef_i32(); -template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { - return __vec16_i32(); +template static RetVecType __undef_i32(); +template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { + return __vec16_i32(); } static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { - int32_t val = __extract_element(v, index & 0xf); - return _mm512_set1_epi32(val); + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_epi32(val); +} + +static FORCEINLINE __vec16_i32 
__cast_trunc(__vec16_i32, const __vec16_i64 i64) { + return __vec16_i32(i64.v_lo); } static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { - __vec16_i32 idx = __smear_i32<__vec16_i32>(index); - __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7)); - return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xf)); + return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); } static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { - return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); + return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); } template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_epi32(p); + return _mm512_load_epi32(p); #else - __vec16_i32 v; - v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - return v; + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; #endif } -template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { - return _mm512_load_epi32(p); +template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { + return _mm512_load_epi32(p); } template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_epi32(p, v); + _mm512_store_epi32(p, v); #else - _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } -template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { - _mm512_store_epi32(p, v); +template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { + _mm512_store_epi32(p, v); } /////////////////////////////////////////////////////////////////////////// // int64 /////////////////////////////////////////////////////////////////////////// - -static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index) + static FORCEINLINE +void __masked_store_i64(void *p, const __vec16_i64 &v, __vec16_i1 mask) { - uint *src = (uint *)&v; - return src[index+16] | (int64_t(src[index]) << 32); + __m512i v1; + __m512i v2; + v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_hi); + v1 = _mm512_mask_permutevar_epi32(v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_lo); + v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_hi); + v2 = _mm512_mask_permutevar_epi32(v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_lo); + _mm512_mask_store_epi64(p, mask, v2); + _mm512_mask_store_epi64(((uint8_t*)p)+64, mask>>8, v1); } static FORCEINLINE void __insert_element(__vec16_i64 *v, uint32_t index, int64_t val) { - ((int32_t *)&v->v_hi)[index] = 
val>>32; - ((int32_t *)&v->v_lo)[index] = val; + ((int32_t *)&v->v_hi)[index] = val>>32; + ((int32_t *)&v->v_lo)[index] = val; } -template RetVecType __setzero_i64(); -template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { - __vec16_i64 ret; - ret.v_lo = _mm512_setzero_epi32(); - ret.v_hi = _mm512_setzero_epi32(); - return ret; +template static RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { + __vec16_i64 ret; + ret.v_lo = _mm512_setzero_epi32(); + ret.v_hi = _mm512_setzero_epi32(); + return ret; } -template RetVecType __undef_i64(); -template <> static FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { - return __vec16_i64(); +template static RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { + return __vec16_i64(); } static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b) { - __mmask16 carry = 0; - __m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry); - __m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry); - return __vec16_i64(lo, hi); + __mmask16 carry = 0; + __m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry); + __m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry); + return __vec16_i64(lo, hi); } static FORCEINLINE __vec16_i64 __sub(const __vec16_i64 &a, const __vec16_i64 &b) { - __mmask16 borrow = 0; - __m512i lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); - __m512i hi = _mm512_sbb_epi32(a.v_hi, borrow, b.v_hi, &borrow); - return __vec16_i64(lo, hi); + __mmask16 borrow = 0; + __m512i lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); + __m512i hi = _mm512_sbb_epi32(a.v_hi, borrow, b.v_hi, &borrow); + return __vec16_i64(lo, hi); } /*! 64x32 bit mul -- address computations often use a scale that we - know is 32 bits; and 32x64 is faster than 64x64 */ + know is 32 bits; and 32x64 is faster than 64x64 */ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b) { - return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), - _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), - _mm512_mulhi_epi32(a.v, b.v_lo))); + return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), + _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), + _mm512_mulhi_epi32(a.v, b.v_lo))); } static FORCEINLINE __vec16_i64 __mul(const __vec16_i64 &a, const __vec16_i64 &b) { - __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); - __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); - __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); - __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); - __mmask16 carry = 0; - __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); - __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); - return __vec16_i64(lo, hi); + __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); + __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry = 0; + __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); + __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); + return __vec16_i64(lo, hi); } static FORCEINLINE __vec16_i64 __sdiv(const __vec16_i64 &a, const __vec16_i64 &b) { - __vec16_i64 ret; - for(int i=0; i<16; i++) { - int64_t dividend = __extract_element(a, i); - int64_t divisor = __extract_element(b, i); - int64_t quotient = dividend / divisor; // SVML - __insert_element(&ret, i, quotient); - } - return ret; + __vec16_i64 ret; + for(int i=0; i<16; 
i++) { + int64_t dividend = __extract_element(a, i); + int64_t divisor = __extract_element(b, i); + int64_t quotient = dividend / divisor; // SVML + __insert_element(&ret, i, quotient); + } + return ret; } static FORCEINLINE __vec16_i64 __udiv(const __vec16_i64 &a, const __vec16_i64 &b) { - __vec16_i64 ret; - for(int i=0; i<16; i++) { - uint64_t dividend = __extract_element(a, i); - uint64_t divisor = __extract_element(b, i); - uint64_t quotient = dividend / divisor; // SVML - __insert_element(&ret, i, quotient); - } - return ret; + __vec16_i64 ret; + for(int i=0; i<16; i++) { + uint64_t dividend = __extract_element(a, i); + uint64_t divisor = __extract_element(b, i); + uint64_t quotient = dividend / divisor; // SVML + __insert_element(&ret, i, quotient); + } + return ret; } static FORCEINLINE __vec16_i64 __or(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_or_epi32(a.v_lo, b.v_lo), _mm512_or_epi32(a.v_hi, b.v_hi)); + return __vec16_i64(_mm512_or_epi32(a.v_lo, b.v_lo), _mm512_or_epi32(a.v_hi, b.v_hi)); } static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_and_epi32(a.v_lo, b.v_lo), _mm512_and_epi32(a.v_hi, b.v_hi)); + return __vec16_i64(_mm512_and_epi32(a.v_lo, b.v_lo), _mm512_and_epi32(a.v_hi, b.v_hi)); } static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_xor_epi32(a.v_lo, b.v_lo), _mm512_xor_epi32(a.v_hi, b.v_hi)); + return __vec16_i64(_mm512_xor_epi32(a.v_lo, b.v_lo), _mm512_xor_epi32(a.v_hi, b.v_hi)); } static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) { - __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); - __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); - return __vec16_i64(lo, hi); + __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); + __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); + return __vec16_i64(lo, hi); +} + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, unsigned long long b) { + __vec16_i32 hi = _mm512_or_epi32(_mm512_slli_epi32(a.v_hi, b), + _mm512_srli_epi32(a.v_lo, 32-b)); + __vec16_i32 lo = _mm512_slli_epi32(a.v_lo, b); + return __vec16_i64(lo, hi); } static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) { - __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); - __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); - //__vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - // _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - // _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); - return __vec16_i64(lo, hi); + __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); + __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); + //__vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + // _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + // _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(lo, hi); } static FORCEINLINE __vec16_i64 
__ashr(__vec16_i64 a, __vec16_i64 b) { - __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); - return __vec16_i64(lo, hi); + __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(lo, hi); +} + +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, unsigned long long b) { + __vec16_i32 xfer + = _mm512_slli_epi32(_mm512_and_epi32(a.v_hi, + _mm512_set1_epi32((1< RetVecType __smear_i64(const int64_t &l); +template static RetVecType __smear_i64(const int64_t &l); template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { - const int *i = (const int*)&l; - return __vec16_i64(_mm512_set1_epi32(i[0]), _mm512_set1_epi32(i[1])); + const int *i = (const int*)&l; + return __vec16_i64(_mm512_set1_epi32(i[0]), _mm512_set1_epi32(i[1])); } template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { - __vec16_i32 v1; - __vec16_i32 v2; - v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 v1; + __vec16_i32 v2; + const uint8_t*ptr = (const uint8_t*)p; + v2 = _mm512_extloadunpacklo_epi32(v2, ptr, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v2 = _mm512_extloadunpackhi_epi32(v2, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpacklo_epi32(v1, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpackhi_epi32(v1, ptr+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __vec16_i64 ret; - ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - return ret; + __vec16_i64 ret; + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + return ret; } -template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { - __m512i v2 = _mm512_load_epi32(p); - __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); - __vec16_i64 ret; - ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, - 
_mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - return ret; +template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + __vec16_i64 ret; + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + return ret; } -template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { - return __load<64>(p); +template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { + return __load<64>(p); } template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) { - __m512i v1; - __m512i v2; - v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_hi); - v1 = _mm512_mask_permutevar_epi32(v1, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_lo); - v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_hi); - v2 = _mm512_mask_permutevar_epi32(v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_lo); - _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + __m512i v1; + __m512i v2; + v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_hi); + v1 = _mm512_mask_permutevar_epi32(v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_lo); + v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_hi); + v2 = _mm512_mask_permutevar_epi32(v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_lo); + _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); } -template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) { - __m512i v1; - __m512i v2; - v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_hi); - 
v1 = _mm512_mask_permutevar_epi32(v1, 0x5555,
-                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
-                                      v.v_lo);
-    v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
-                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
-                                      v.v_hi);
-    v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
-                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
-                                      v.v_lo);
-    _mm512_store_epi64(p, v2);
-    _mm512_store_epi64(((uint8_t*)p)+64, v1);
+template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
+    __m512i v1;
+    __m512i v2;
+    v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                      v.v_hi);
+    v1 = _mm512_mask_permutevar_epi32(v1, 0x5555,
+                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                      v.v_lo);
+    v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                      v.v_hi);
+    v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
+                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                      v.v_lo);
+    _mm512_store_epi64(p, v2);
+    _mm512_store_epi64(((uint8_t*)p)+64, v1);
 }

-template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
-    __store<64>(p, v);
+template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
+    __store<64>(p, v);
 }
+
+/*! gather vector of 64-bit ints from addresses pointing to uniform ints
+
+    (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
+    ***UNIFORM*** INT64's/POINTERS. (The problem is that ispc doesn't
+    expose whether the gather is from an array of uniform or an array of
+    varying pointers, so in here there's no way to tell - the only thing
+    we can do is pick one...)
+ */
+static FORCEINLINE __vec16_i64
+__gather_base_offsets32_i64(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
+                            __vec16_i1 mask) {
+    __vec16_i64 ret;
+    ret.v_lo = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
+                                              base, _MM_UPCONV_EPI32_NONE, scale,
+                                              _MM_HINT_NONE);
+    ret.v_hi = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
+                                              base+4, _MM_UPCONV_EPI32_NONE, scale,
+                                              _MM_HINT_NONE);
+    return ret;
+}
+
+/*! gather vector of 64-bit ints from addresses pointing to uniform ints
+
+    (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
+    ***UNIFORM*** INT64's/POINTERS. (The problem is that ispc doesn't
+    expose whether the gather is from an array of uniform or an array of
+    varying pointers, so in here there's no way to tell - the only thing
+    we can do is pick one...)
+ */
+static FORCEINLINE __vec16_i64
+__gather64_i64(__vec16_i64 addr, __vec16_i1 mask)
+{
+    __vec16_i64 ret;
+
+    // There is no gather instruction with 64-bit offsets in KNC.
+ // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint32_t &hi32 = ((uint*)&addr.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN)); + ret.v_lo = _mm512_mask_i32extgather_epi32(ret.v_lo, match, signed_offsets, + base, _MM_UPCONV_EPI32_NONE, 1, + _MM_HINT_NONE); + ret.v_hi = _mm512_mask_i32extgather_epi32(ret.v_hi, match, signed_offsets, + base+4, _MM_UPCONV_EPI32_NONE, 1, + _MM_HINT_NONE); + + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} + + + /////////////////////////////////////////////////////////////////////////// // float /////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { - return _mm512_add_ps(a, b); + return _mm512_add_ps(a, b); } static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { - return _mm512_sub_ps(a, b); + return _mm512_sub_ps(a, b); } static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { - return _mm512_mul_ps(a, b); + return _mm512_mul_ps(a, b); } static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { - return _mm512_div_ps(a, b); + return _mm512_div_ps(a, b); } static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpeq_ps_mask(a, b); + return _mm512_cmpeq_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpneq_ps_mask(a, b); + return _mm512_cmpneq_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { - return _mm512_cmplt_ps_mask(a, b); + return _mm512_cmplt_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmplt_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmple_ps_mask(a, b); + return _mm512_cmple_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmple_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpnle_ps_mask(a, b); + return _mm512_cmpnle_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpnle_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpnle_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpnlt_ps_mask(a, b); + return _mm512_cmpnlt_ps_mask(a, b); 
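The __vec16_i64 arithmetic earlier in this hunk (__add, __sub, __shl, __lshr, __ashr) works on the v_lo/v_hi pair of 32-bit vectors rather than on native 64-bit lanes. A minimal scalar sketch of that decomposition, using a hypothetical two-word struct in place of the KNC vector types, shows the carry propagation behind _mm512_addsetc_epi32/_mm512_adc_epi32 and the cross-half transfer behind the shifts; it is an illustration, not part of the patch.

    #include <cstdint>

    // Hypothetical scalar stand-in for one lane of __vec16_i64: a 64-bit value
    // kept as two 32-bit halves, as the KNC code above does vector-wide.
    struct u64_pair { uint32_t lo, hi; };

    // Mirrors __add(): add the low halves, then add the high halves plus the
    // carry (what the addsetc/adc intrinsic pair does across all 16 lanes).
    static u64_pair add64(u64_pair a, u64_pair b) {
        u64_pair r;
        r.lo = a.lo + b.lo;
        uint32_t carry = (r.lo < a.lo) ? 1u : 0u;  // unsigned wrap-around detects the carry
        r.hi = a.hi + b.hi + carry;
        return r;
    }

    // Mirrors __shl() for shift counts with 0 < n < 32: bits shifted out of the
    // low half are OR-ed into the high half (the "xfer" vector above).
    static u64_pair shl64(u64_pair a, uint32_t n) {
        u64_pair r;
        uint32_t xfer = a.lo >> (32u - n);
        r.hi = (a.hi << n) | xfer;
        r.lo = a.lo << n;
        return r;
    }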
} static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpnlt_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpnlt_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpord_ps_mask(a, b); + return _mm512_cmpord_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpunord_ps_mask(a, b); + return _mm512_cmpunord_ps_mask(a, b); } static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { - return _mm512_mask_mov_ps(b, mask, a); + return _mm512_mask_mov_ps(b, mask, a); } static FORCEINLINE __vec16_f __select(bool cond, __vec16_f a, __vec16_f b) { - return cond ? a : b; + return cond ? a : b; } static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { - return ((float *)&v)[index]; + return ((float *)&v)[index]; } static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { - ((float *)v)[index] = val; + ((float *)v)[index] = val; } -template RetVecType __smear_float(float f); -template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { - return _mm512_set_1to16_ps(f); +template static RetVecType __smear_float(float f); +template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { + return _mm512_set_1to16_ps(f); } -template RetVecType __setzero_float(); -template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { - return _mm512_setzero_ps(); +template static RetVecType __setzero_float(); +template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { + return _mm512_setzero_ps(); } -template RetVecType __undef_float(); -template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { - return __vec16_f(); +template static RetVecType __undef_float(); +template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { + return __vec16_f(); } static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) { - int32_t val = __extract_element(v, index & 0xf); - return _mm512_set1_ps(val); + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_ps(val); } static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { - return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); } template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_ps(p); + return _mm512_load_ps(p); #else - __vec16_f v; - v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - return v; + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; #endif } -template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { - return _mm512_load_ps(p); +template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { + return _mm512_load_ps(p); } template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_ps(p, v); + _mm512_store_ps(p, v); #else - _mm512_extpackstorelo_ps(p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, 
v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_ps(p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } -template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { - _mm512_store_ps(p, v); +template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { + _mm512_store_ps(p, v); } @@ -1006,241 +1187,241 @@ template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { /////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_add_pd(a.v1, b.v1); - ret.v2 = _mm512_add_pd(a.v2, b.v2); - return ret; + __vec16_d ret; + ret.v1 = _mm512_add_pd(a.v1, b.v1); + ret.v2 = _mm512_add_pd(a.v2, b.v2); + return ret; } static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_sub_pd(a.v1, b.v1); - ret.v2 = _mm512_sub_pd(a.v2, b.v2); - return ret; + __vec16_d ret; + ret.v1 = _mm512_sub_pd(a.v1, b.v1); + ret.v2 = _mm512_sub_pd(a.v2, b.v2); + return ret; } static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_mul_pd(a.v1, b.v1); - ret.v2 = _mm512_mul_pd(a.v2, b.v2); - return ret; + __vec16_d ret; + ret.v1 = _mm512_mul_pd(a.v1, b.v1); + ret.v2 = _mm512_mul_pd(a.v2, b.v2); + return ret; } static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_div_pd(a.v1, b.v1); - ret.v2 = _mm512_div_pd(a.v2, b.v2); - return ret; + __vec16_d ret; + ret.v1 = _mm512_div_pd(a.v1, b.v1); + ret.v2 = _mm512_div_pd(a.v2, b.v2); + return ret; } static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); - __vec16_i1 tmp_m = m; - ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); + __vec16_i1 tmp_m = m; + ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); + ret2 = 
_mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, - 
__vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) { - __vec16_d ret; - __vec16_i1 tmp_m = mask; - ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); - ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); - return ret; + __vec16_d ret; + __vec16_i1 tmp_m = mask; + ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); + return ret; } static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) { - return cond ? a : b; + return cond ? 
a : b; } static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { - return ((double *)&v)[index]; + return ((double *)&v)[index]; } static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { - ((double *)v)[index] = val; + ((double *)v)[index] = val; } -template RetVecType __smear_double(double d); -template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { - __vec16_d ret; - ret.v1 = _mm512_set1_pd(d); - ret.v2 = _mm512_set1_pd(d); - return ret; +template static RetVecType __smear_double(double d); +template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { + __vec16_d ret; + ret.v1 = _mm512_set1_pd(d); + ret.v2 = _mm512_set1_pd(d); + return ret; } -template RetVecType __setzero_double(); -template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { - __vec16_d ret; - ret.v1 = _mm512_setzero_pd(); - ret.v2 = _mm512_setzero_pd(); - return ret; +template static RetVecType __setzero_double(); +template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { + __vec16_d ret; + ret.v1 = _mm512_setzero_pd(); + ret.v2 = _mm512_setzero_pd(); + return ret; } -template RetVecType __undef_double(); -template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { - return __vec16_d(); +template static RetVecType __undef_double(); +template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { + return __vec16_d(); } static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) { - __vec16_d ret; - double val = __extract_element(v, index & 0xf); - ret.v1 = _mm512_set1_pd(val); - ret.v2 = _mm512_set1_pd(val); - return ret; + __vec16_d ret; + double val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_set1_pd(val); + ret.v2 = _mm512_set1_pd(val); + return ret; } template static FORCEINLINE __vec16_d __load(const __vec16_d *p) { - __vec16_d ret; - ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - return ret; + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; } -template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { - __vec16_d ret; - ret.v1 = _mm512_load_pd(p); - ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); - return ret; +template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { + __vec16_d ret; + ret.v1 = _mm512_load_pd(p); + ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); + return ret; } -template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { - return __load<64>(p); +template <> FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { + return __load<64>(p); } - + template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { - _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - 
_mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); } -template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { - _mm512_store_pd(p, v.v1); - _mm512_store_pd(((uint8_t*)p)+64, v.v2); +template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); } -template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { - __store<64>(p, v); +template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { + __store<64>(p, v); } /////////////////////////////////////////////////////////////////////////// @@ -1249,12 +1430,12 @@ template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) { - return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); + return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); } static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) { - return __vec16_i64(val.v, _mm512_setzero_epi32()); + return __vec16_i64(val.v, _mm512_setzero_epi32()); } static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) @@ -1266,47 +1447,47 @@ static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) { - __vec16_i32 ret = _mm512_setzero_epi32(); - __vec16_i32 one = _mm512_set1_epi32(1); - return _mm512_mask_mov_epi32(ret, val, one); + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); } static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) { - return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); } static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) { - return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); } static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) { - return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); + return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); } static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) { - __vec16_i32 vi = 
_mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) { - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(val); - __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, const __vec16_i8 &v) { @@ -1328,35 +1509,64 @@ static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) { } static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { - __vec16_d ret; - ret.v2 = _mm512_cvtpslo_pd(val.v); - __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); - ret.v1 = _mm512_cvtpslo_pd(other8); - return ret; + __vec16_d ret; + ret.v1 = _mm512_cvtpslo_pd(val.v); + __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); + ret.v2 = _mm512_cvtpslo_pd(other8); + return ret; } static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { - __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); - __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); - - return _mm512_mask_permute4f128_epi32(r1i, 0xFF00, r0i, _MM_PERM_BABA); + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + return _mm512_mask_permute4f128_epi32(r1i, 0xFF00, r0i, _MM_PERM_BABA); } static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { - return _mm512_castsi512_ps(val); + return _mm512_castsi512_ps(val); } static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { - return _mm512_castps_si512(val); + return _mm512_castps_si512(val); } static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { - return *(__vec16_i64*)&val; + __vec16_i64 ret; + ret.v_hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(val.v2)); + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(val.v1)); + ret.v_lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(val.v2)); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(val.v1)); + return ret; } static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { - return *(__vec16_d*)&val; + __vec16_d ret; + ret.v2 = _mm512_castsi512_pd( + _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + val.v_hi)); + ret.v2 = _mm512_castsi512_pd( + 
_mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v2), 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + val.v_lo)); + ret.v1 = _mm512_castsi512_pd( + _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + val.v_hi)); + ret.v1 = _mm512_castsi512_pd( + _mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v1), 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + val.v_lo)); + return ret; } /////////////////////////////////////////////////////////////////////////// @@ -1364,27 +1574,27 @@ static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { /////////////////////////////////////////////////////////////////////////// static FORCEINLINE float __round_uniform_float(float v) { - return roundf(v); + return roundf(v); } static FORCEINLINE float __floor_uniform_float(float v) { - return floorf(v); + return floorf(v); } static FORCEINLINE float __ceil_uniform_float(float v) { - return ceilf(v); + return ceilf(v); } static FORCEINLINE double __round_uniform_double(double v) { - return round(v); + return round(v); } static FORCEINLINE double __floor_uniform_double(double v) { - return floor(v); + return floor(v); } static FORCEINLINE double __ceil_uniform_double(double v) { - return ceil(v); + return ceil(v); } static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) { @@ -1443,30 +1653,30 @@ static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 // sqrt/rsqrt/rcp static FORCEINLINE float __rsqrt_uniform_float(float v) { - return 1.f / sqrtf(v); + return 1.f / sqrtf(v); } static FORCEINLINE float __rcp_uniform_float(float v) { - return 1.f / v; + return 1.f / v; } static FORCEINLINE float __sqrt_uniform_float(float v) { - return sqrtf(v); + return sqrtf(v); } static FORCEINLINE double __sqrt_uniform_double(double v) { - return sqrt(v); + return sqrt(v); } static FORCEINLINE __vec16_f __sqrt_varying_float(__vec16_f v) { - return _mm512_sqrt_ps(v); + return _mm512_sqrt_ps(v); } static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { #ifdef ISPC_FAST_MATH - return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. + return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. 
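The __cast_bits overloads between __vec16_d and __vec16_i64 above no longer reinterpret the bits in memory; they reshuffle each 64-bit lane into the split v_lo/v_hi layout (low 32 bits of every lane in one vector, high 32 bits in the other). A scalar sketch of the same layout change, assuming plain arrays in place of the 512-bit registers, clarifies what those permute patterns compute.

    #include <cstdint>
    #include <cstring>

    // Hypothetical model of __cast_bits(__vec16_i64, __vec16_d): split the bit
    // pattern of each of the 16 doubles into its low and high 32-bit words.
    static void doubles_to_lo_hi(const double d[16], uint32_t lo[16], uint32_t hi[16]) {
        for (int lane = 0; lane < 16; ++lane) {
            uint64_t bits;
            std::memcpy(&bits, &d[lane], sizeof(bits));  // bit-cast, no value conversion
            lo[lane] = (uint32_t)(bits & 0xffffffffu);
            hi[lane] = (uint32_t)(bits >> 32);
        }
    }

    // The reverse direction, corresponding to __cast_bits(__vec16_d, __vec16_i64).
    static void lo_hi_to_doubles(const uint32_t lo[16], const uint32_t hi[16], double d[16]) {
        for (int lane = 0; lane < 16; ++lane) {
            uint64_t bits = ((uint64_t)hi[lane] << 32) | lo[lane];
            std::memcpy(&d[lane], &bits, sizeof(bits));
        }
    }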
#else - return _mm512_recip_ps(v); + return _mm512_recip_ps(v); #endif } static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { @@ -1483,17 +1693,19 @@ static FORCEINLINE double __rcp_uniform_double(double v) static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { #ifdef ISPC_FAST_MATH - return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy #else - return _mm512_invsqrt_ps(v); + return _mm512_invsqrt_ps(v); #endif } + static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { __vec16_d y; for (int i = 0; i < 16; i++) __insert_element(&y, i, 1.0/sqrt(__extract_element(x,i))); return y; } + static FORCEINLINE double __rsqrt_uniform_double(double v) { return 1.0/v; @@ -1505,20 +1717,20 @@ static FORCEINLINE double __rsqrt_uniform_double(double v) /////////////////////////////////////////////////////////////////////////// static FORCEINLINE int32_t __popcnt_int32(const __vec1_i32 mask) { - return _mm_countbits_32(mask); + return _mm_countbits_32(mask); } static FORCEINLINE int32_t __popcnt_int64(const __vec1_i64 mask) { - return _mm_countbits_64(mask); + return _mm_countbits_64(mask); } static FORCEINLINE int32_t __count_trailing_zeros_i32(const __vec1_i32 mask) { - return _mm_tzcnt_32(mask); + return _mm_tzcnt_32(mask); } static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { - return _mm_tzcnt_64(mask); + return _mm_tzcnt_64(mask); } /////////////////////////////////////////////////////////////////////////// @@ -1542,39 +1754,39 @@ static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { } static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { - return _mm512_reduce_add_epi32(v); + return _mm512_reduce_add_epi32(v); } static FORCEINLINE uint32_t __reduce_min_i32(__vec16_i32 v) { - return _mm512_reduce_min_epi32(v); + return _mm512_reduce_min_epi32(v); } static FORCEINLINE uint32_t __reduce_max_i32(__vec16_i32 v) { - return _mm512_reduce_max_epi32(v); + return _mm512_reduce_max_epi32(v); } static FORCEINLINE float __reduce_add_float(__vec16_f v) { - return _mm512_reduce_add_ps(v); + return _mm512_reduce_add_ps(v); } static FORCEINLINE float __reduce_min_float(__vec16_f v) { - return _mm512_reduce_min_ps(v); + return _mm512_reduce_min_ps(v); } static FORCEINLINE float __reduce_max_float(__vec16_f v) { - return _mm512_reduce_max_ps(v); + return _mm512_reduce_max_ps(v); } static FORCEINLINE float __reduce_add_double(__vec16_d v) { - return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); + return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); } static FORCEINLINE float __reduce_min_double(__vec16_d v) { - return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); + return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); } static FORCEINLINE float __reduce_max_double(__vec16_d v) { - return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); + return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); } /////////////////////////////////////////////////////////////////////////// @@ -1584,110 +1796,142 @@ static FORCEINLINE float __reduce_max_double(__vec16_d v) { // Currently, when a pseudo_gather is converted into a masked load, it has to be unaligned static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_mask_load_epi32(__vec16_i32(), mask, p); + return _mm512_mask_load_epi32(__vec16_i32(), mask, p); 
#else - __vec16_i32 tmp; - tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __vec16_i32 ret; - return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); + __vec16_i32 tmp; + tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 ret; + return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); #endif } static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); #else - __vec16_f tmp; - tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - __vec16_f ret; - return _mm512_mask_mov_ps(ret.v, mask, tmp.v); + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); #endif } static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); - ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); - return ret; + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; #else - __vec16_d tmp; - tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); - ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); - return ret; + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; #endif } +static FORCEINLINE void __masked_store_i8(void *p, const __vec16_i8 &val, __vec16_i1 mask) { + __vec16_i32 tmp = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + _mm512_mask_extstore_epi32(p, mask, tmp, 
_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); +} + +static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, __vec16_i1 mask) { + __vec16_i8 ret; + __vec16_i32 tmp = _mm512_mask_extload_epi32(_mm512_undefined_epi32(),mask,p, + _MM_UPCONV_EPI32_SINT8, + _MM_BROADCAST32_NONE, _MM_HINT_NONE); + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +template static FORCEINLINE __vec16_i8 __load(const __vec16_i8 *p) { + return *p; +} +template static FORCEINLINE void __store(__vec16_i8 *p, __vec16_i8 v) { + *p = v; +} + +static FORCEINLINE void +__scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets, + __vec16_i8 val, __vec16_i1 mask) +{ + __vec16_i32 tmp = _mm512_extload_epi32(&val,_MM_UPCONV_EPI32_SINT8, + _MM_BROADCAST32_NONE, _MM_HINT_NONE); + printf("__scatter_base_offsets32_i8\n"); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, tmp, + _MM_DOWNCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); +} + static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_epi32(p, mask, val.v); + _mm512_mask_store_epi32(p, mask, val.v); #else - __vec16_i32 tmp; - tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); - _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, - __vec16_i1 mask) { + __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_ps(p, mask, val.v); + _mm512_mask_store_ps(p, mask, val.v); #else - __vec16_f tmp; - tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); - _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, - __vec16_i1 mask) { + __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - _mm512_mask_store_pd(p, mask, val.v1); - _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + 
_mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); #else - __vec16_d tmp; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); - tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); - _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); #endif } static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, - __vec16_i1 mask) { - __masked_store_i32(p, val, mask); + __vec16_i1 mask) { + __masked_store_i32(p, val, mask); } static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, - __vec16_i1 mask) { - __masked_store_float(p, val, mask); + __vec16_i1 mask) { + __masked_store_float(p, val, mask); } /////////////////////////////////////////////////////////////////////////// @@ -1698,47 +1942,75 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { - // (iw): need to temporarily store as int because gathers can only return ints. - __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, - _MM_UPCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); - // now, downconverting to chars into temporary char vector - __vec16_i8 ret; - _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + __vec16_i1 mask) { + // (iw): need to temporarily store as int because gathers can only return ints. 
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { - return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, - base, _MM_UPCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + __vec16_i1 mask) { + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { - return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, - base, _MM_UPCONV_PS_NONE, scale, - _MM_HINT_NONE); + __vec16_i1 mask) { + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { - __vec16_d ret; - ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); - ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - return ret; + __vec16_i1 mask) { + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; } +static FORCEINLINE __vec16_f +__gather64_float(__vec16_i64 addr, __vec16_i1 mask) +{ + __vec16_f ret; + + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&addr.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN)); + + ret.v = _mm512_mask_i32extgather_ps(ret.v, match, signed_offsets, + base, _MM_UPCONV_PS_NONE, 1, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} + + /*! gather with 64-bit offsets. \todo add optimization that falls back to 32-bit offset gather if @@ -1748,148 +2020,180 @@ __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offset static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i1 mask) { - // There is no gather instruction with 64-bit offsets in KNC. 
- // We have to manually iterate over the upper 32 bits ;-) - __vec16_i1 still_to_do = mask; - __vec16_f ret; - while (still_to_do) { - int first_active_lane = _mm_tzcnt_32((int)still_to_do); - const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; - __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); - - void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, - _MM_UPCONV_PS_NONE, scale, - _MM_HINT_NONE); - still_to_do = _mm512_kxor(match, still_to_do); - } + __vec16_i1 mask) { - return ret; + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_f ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + + ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base, + _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; } static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i1 mask) + __vec16_i1 mask) { - __vec16_i1 still_to_do = mask; - __vec16_i32 tmp; - while (still_to_do) { - int first_active_lane = _mm_tzcnt_32((int)still_to_do); - const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; - __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); - - void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, - _MM_UPCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); - still_to_do = _mm512_kxor(match,still_to_do); - } - __vec16_i8 ret; - _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; + __vec16_i32 tmp; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } + __vec16_i8 ret; + _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_f value, - __vec16_i1 mask) { - __vec16_i1 still_to_do = mask; - while (still_to_do) { - int first_active_lane = _mm_tzcnt_32((int)still_to_do); - const uint &hi32 = 
((uint*)&offsets.v_hi)[first_active_lane]; - __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __vec16_f value, + __vec16_i1 mask) { - void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, - value, - _MM_DOWNCONV_PS_NONE, scale, - _MM_HINT_NONE); - still_to_do = _mm512_kxor(match,still_to_do); - } + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + _mm512_mask_i32extscatter_ps(base, match, signed_offsets, + value, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } } static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i32 value, - __vec16_i1 mask) { - __vec16_i1 still_to_do = mask; - while (still_to_do) { - int first_active_lane = _mm_tzcnt_32((int)still_to_do); - const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; - __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); - - void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, - value, - _MM_DOWNCONV_EPI32_NONE, scale, - _MM_HINT_NONE); - still_to_do = _mm512_kxor(match,still_to_do); - } + __vec16_i32 value, + __vec16_i1 mask) { + + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + _mm512_mask_i32extscatter_epi32(base, match, signed_offsets, + value, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } } +static FORCEINLINE void // TODO +__scatter_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, + __vec16_i8 value, + __vec16_i1 mask) { + __vec16_i1 still_to_do = mask; + + __vec16_i32 tmp = _mm512_extload_epi32(&value, _MM_UPCONV_EPI32_SINT8, + _MM_BROADCAST32_NONE, _MM_HINT_NONE); + // _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + tmp, + _MM_DOWNCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + 
still_to_do = _mm512_kxor(match,still_to_do); + } +} static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i1 mask) + __vec16_i1 mask) { - __vec16_f r = __gather_base_offsets64_float(_base,scale,offsets,mask); - return (__vec16_i32&)r; + __vec16_f r = __gather_base_offsets64_float(_base,scale,offsets,mask); + return (__vec16_i32&)r; } // scatter static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, - __vec16_i32 val, __vec16_i1 mask) + __vec16_i32 val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, - _MM_DOWNCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE void -__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, - __vec16_f val, __vec16_i1 mask) +__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_ps(base, mask, offsets, val, - _MM_DOWNCONV_PS_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); } /////////////////////////////////////////////////////////////////////////// // packed load/store /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, - __vec16_i1 mask) { - __vec16_i32 v; - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(_mm512_undefined_epi32(), mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) { + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); } -static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, - __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); } static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) @@ -1902,241 +2206,261 @@ static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, /////////////////////////////////////////////////////////////////////////// static FORCEINLINE void __prefetch_read_uniform_1(const char *p) { - _mm_prefetch(p, _MM_HINT_T0); // prefetch into L1$ + _mm_prefetch(p, _MM_HINT_T0); // prefetch into L1$ } static FORCEINLINE void __prefetch_read_uniform_2(const char *p) { - _mm_prefetch(p, _MM_HINT_T1); // prefetch into L2$ + _mm_prefetch(p, _MM_HINT_T1); // prefetch into L2$ 
} static FORCEINLINE void __prefetch_read_uniform_3(const char *p) { - // There is no L3$ on KNC, don't want to pollute L2$ unecessarily + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily } static FORCEINLINE void __prefetch_read_uniform_nt(const char *p) { - _mm_prefetch(p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint - // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint + _mm_prefetch(p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint } +#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec16_i32 offsets, __vec16_i1 mask) { \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \ + offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\ + __vec16_i1 copy_mask = _mm512_kmov(mask); \ + _mm512_kswapb(mask, copy_mask); \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \ +} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ + +PREFETCH_READ_VARYING(1, _MM_HINT_T0) +PREFETCH_READ_VARYING(2, _MM_HINT_T1) +PREFETCH_READ_VARYING(nt, _MM_HINT_T2) + +static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale, + __vec16_i32 offsets, __vec16_i1 mask) {} + +static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {} + /////////////////////////////////////////////////////////////////////////// // atomics /////////////////////////////////////////////////////////////////////////// static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return InterlockedAdd((LONG volatile *)p, v) - v; + return InterlockedAdd((LONG volatile *)p, v) - v; #else - return __sync_fetch_and_add(p, v); + return __sync_fetch_and_add(p, v); #endif } static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return InterlockedAdd((LONG volatile *)p, -v) + v; + return InterlockedAdd((LONG volatile *)p, -v) + v; #else - return __sync_fetch_and_sub(p, v); + return __sync_fetch_and_sub(p, v); #endif } static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return _InterlockedAnd((LONG volatile *)p, v); + return _InterlockedAnd((LONG volatile *)p, v); #else - return __sync_fetch_and_and(p, v); + return __sync_fetch_and_and(p, v); #endif } static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return _InterlockedOr((LONG volatile *)p, v); + return _InterlockedOr((LONG volatile *)p, v); #else - return __sync_fetch_and_or(p, v); + return __sync_fetch_and_or(p, v); #endif } static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return _InterlockedXor((LONG volatile *)p, v); + return _InterlockedXor((LONG volatile *)p, v); #else - return __sync_fetch_and_xor(p, v); + return __sync_fetch_and_xor(p, v); #endif } static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { - int32_t old, min; - do { - old = *((volatile int32_t *)p); - min = (old < (int32_t)v) ? old : (int32_t)v; + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? 
old : (int32_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); + } while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; + return old; } static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { - int32_t old, max; - do { - old = *((volatile int32_t *)p); - max = (old > (int32_t)v) ? old : (int32_t)v; + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? old : (int32_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); + } while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; + return old; } static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { - uint32_t old, min; - do { - old = *((volatile uint32_t *)p); - min = (old < v) ? old : v; + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); + } while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; + return old; } static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { - uint32_t old, max; - do { - old = *((volatile uint32_t *)p); - max = (old > v) ? old : v; + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? 
old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); + } while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; + return old; } static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return InterlockedExchange((LONG volatile *)p, v); + return InterlockedExchange((LONG volatile *)p, v); #else - return __sync_lock_test_and_set(p, v); + return __sync_lock_test_and_set(p, v); #endif } static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, - uint32_t newval) { + uint32_t newval) { #ifdef _MSC_VER - return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); #else - return __sync_val_compare_and_swap(p, cmpval, newval); + return __sync_val_compare_and_swap(p, cmpval, newval); #endif } static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedAdd64((LONGLONG volatile *)p, v) - v; + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_add(p, v); + return __sync_fetch_and_add(p, v); #endif } static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; #else - return __sync_fetch_and_sub(p, v); + return __sync_fetch_and_sub(p, v); #endif } static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedAnd64((LONGLONG volatile *)p, v) - v; + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_and(p, v); + return __sync_fetch_and_and(p, v); #endif } static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedOr64((LONGLONG volatile *)p, v) - v; + return InterlockedOr64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_or(p, v); + return __sync_fetch_and_or(p, v); #endif } static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedXor64((LONGLONG volatile *)p, v) - v; + return InterlockedXor64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_xor(p, v); + return __sync_fetch_and_xor(p, v); #endif } static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { - int64_t old, min; - do { - old = *((volatile int64_t *)p); - min = (old < (int64_t)v) ? old : (int64_t)v; + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); + } while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; + return old; } static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { - int64_t old, max; - do { - old = *((volatile int64_t *)p); - max = (old > (int64_t)v) ? old : (int64_t)v; + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? 
old : (int64_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); + } while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; + return old; } static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { - uint64_t old, min; - do { - old = *((volatile uint64_t *)p); - min = (old < v) ? old : v; + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); + } while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; + return old; } static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { - uint64_t old, max; - do { - old = *((volatile uint64_t *)p); - max = (old > v) ? old : v; + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); + } while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; + return old; } static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedExchange64((LONGLONG volatile *)p, v); + return InterlockedExchange64((LONGLONG volatile *)p, v); #else - return __sync_lock_test_and_set(p, v); + return __sync_lock_test_and_set(p, v); #endif } static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, - uint64_t newval) { + uint64_t newval) { #ifdef _MSC_VER - return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); #else - return __sync_val_compare_and_swap(p, cmpval, newval); + return __sync_val_compare_and_swap(p, cmpval, newval); #endif } @@ -2148,10 +2472,10 @@ static FORCEINLINE uint64_t __clock() { uint32_t low, high; #ifdef __x86_64 __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" - ::: "%rax", "%rbx", "%rcx", "%rdx" ); + ::: "%rax", "%rbx", "%rcx", "%rdx" ); #else __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" - ::: "%eax", "%ebx", "%ecx", "%edx" ); + ::: "%eax", "%ebx", "%ecx", "%edx" ); #endif __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); return (uint64_t)high << 32 | low; diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index a25af10b..765a931f 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -3898,6 +3898,15 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) { _mm_prefetch((char *)ptr, _MM_HINT_NTA); } +#define PREFETCH_READ_VARYING(CACHE_NUM) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec4_i32 offsets, __vec4_i1 mask) {} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec4_i64 addr, __vec4_i1 mask) {} \ + +PREFETCH_READ_VARYING(1) +PREFETCH_READ_VARYING(2) +PREFETCH_READ_VARYING(3) +PREFETCH_READ_VARYING(nt) /////////////////////////////////////////////////////////////////////////// // 
atomics diff --git a/fail_db.txt b/fail_db.txt index 9def9b6c..b7c1ad74 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -257,6 +257,32 @@ ./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * ./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * ./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * .\tests\foreach-double-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.5 cl -O2 * .\tests\foreach-double-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.5 cl -O2 * .\tests\foreach-double-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.5 cl -O2 * @@ -267,7 +293,6 @@ ./tests/ptr-22.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 * ./tests/ptr-22.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O2 * -./tests/ptr-22.ispc runfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 * ./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 * ./tests/atomics-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 * ./tests/atomics-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 * @@ -454,3 +479,35 @@ ./tests/reduce-equal-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 * ./tests/reduce-equal-6.ispc compfail x86-64 knc 
Linux LLVM 3.6 icpc13.1 -O0 * ./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 * +./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 * +./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 * +./tests/psubus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/psubus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/psubus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/psubus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * diff --git a/ispc.cpp b/ispc.cpp index 00e0faec..6b7d3fce 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -199,7 +199,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_hasTranscendentals(false), m_hasTrigonometry(false), m_hasRsqrtd(false), - m_hasRcpd(false) + m_hasRcpd(false), + m_hasVecPrefetch(false) { if (isa == NULL) { if (cpu != NULL) { @@ -386,6 +387,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTrigonometry = false; this->m_hasGather = this->m_hasScatter = true; this->m_hasRsqrtd = this->m_hasRcpd = true; + // It's set to true, because MIC has hardware vector prefetch instruction + this->m_hasVecPrefetch = true; } else if (!strcasecmp(isa, 
"generic-32") || !strcasecmp(isa, "generic-x32")) { diff --git a/ispc.h b/ispc.h index 01aae562..e11840a6 100644 --- a/ispc.h +++ b/ispc.h @@ -283,6 +283,8 @@ public: bool hasRcpd() const {return m_hasRcpd;} + bool hasVecPrefetch() const {return m_hasVecPrefetch;} + private: /** llvm Target object representing this target. */ @@ -385,6 +387,9 @@ private: /** Indicates whether there is an ISA double precision rcp. */ bool m_hasRcpd; + + /** Indicates whether the target has hardware instruction for vector prefetch. */ + bool m_hasVecPrefetch; }; diff --git a/ispc.vcxproj b/ispc.vcxproj index 4308715a..c4dc4373 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -131,7 +131,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp $(Configuration)/gen-bitcode-sse4-32bit.cpp; $(Configuration)/gen-bitcode-sse4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll @@ -141,7 +141,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; $(Configuration)/gen-bitcode-sse4-8-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll @@ -151,7 +151,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; $(Configuration)/gen-bitcode-sse4-16-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll @@ -161,7 +161,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; m4 
-Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll @@ -171,7 +171,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp $(Configuration)/gen-bitcode-sse2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll @@ -181,7 +181,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll @@ -191,7 +191,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp $(Configuration)/gen-bitcode-avx1-32bit.cpp; $(Configuration)/gen-bitcode-avx1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll @@ -201,7 +201,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; 
$(Configuration)/gen-bitcode-avx1-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll @@ -211,7 +211,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll @@ -221,7 +221,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp $(Configuration)/gen-bitcode-avx11-32bit.cpp; $(Configuration)/gen-bitcode-avx11-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll @@ -231,7 +231,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll @@ -241,7 +241,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp 
builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll @@ -251,7 +251,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp $(Configuration)/gen-bitcode-avx2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll @@ -261,7 +261,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll @@ -271,7 +271,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll @@ -281,7 +281,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp $(Configuration)/gen-bitcode-generic-1-32bit.cpp; $(Configuration)/gen-bitcode-generic-1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ 
-291,7 +291,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp $(Configuration)/gen-bitcode-generic-4-32bit.cpp; $(Configuration)/gen-bitcode-generic-4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -301,7 +301,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp $(Configuration)/gen-bitcode-generic-8-32bit.cpp; $(Configuration)/gen-bitcode-generic-8-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -311,7 +311,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp $(Configuration)/gen-bitcode-generic-16-32bit.cpp; $(Configuration)/gen-bitcode-generic-16-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -321,7 +321,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp $(Configuration)/gen-bitcode-generic-32-32bit.cpp; $(Configuration)/gen-bitcode-generic-32-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -331,7 +331,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 
builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp $(Configuration)/gen-bitcode-generic-64-32bit.cpp; $(Configuration)/gen-bitcode-generic-64-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -403,6 +403,7 @@ true $(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories) clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies) + LLVMMCDisassembler.lib;%(AdditionalDependencies) LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies) @@ -424,10 +425,12 @@ true $(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories) clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies) + LLVMProfileData.lib;%(AdditionalDependencies) + LLVMMCDisassembler.lib;%(AdditionalDependencies) LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies) - + \ No newline at end of file diff --git a/module.cpp b/module.cpp index 2ef89bbd..a20d2297 100644 --- a/module.cpp +++ b/module.cpp @@ -604,12 +604,23 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE if (diBuilder) { llvm::DIFile file = pos.GetDIFile(); llvm::DIGlobalVariable var = +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + diBuilder->createGlobalVariable(file, + name, + name, + file, + pos.first_line, + sym->type->GetDIType(file), + (sym->storageClass == SC_STATIC), + sym->storagePtr); +#else diBuilder->createGlobalVariable(name, file, pos.first_line, sym->type->GetDIType(file), (sym->storageClass == SC_STATIC), sym->storagePtr); +#endif Assert(var.Verify()); } } @@ -1304,18 +1315,33 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, #endif +#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5) std::string error; 
+#else // LLVM 3.6+
+    std::error_code error;
+#endif
+
     llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);
+
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
     if (error.size()) {
+#else // LLVM 3.6+
+    if (error) {
+#endif
+
         fprintf(stderr, "Error opening output file \"%s\".\n", outFileName);
         return false;
     }
 
     llvm::PassManager pm;
-#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
-    pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
-#else
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
     pm.add(new llvm::DataLayout(*g->target->getDataLayout()));
+#elif defined(LLVM_3_5)
+    pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
+#else // LLVM 3.6+
+    llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
+    dlp->doInitialization(*module);
+    pm.add(dlp);
 #endif
 
     llvm::formatted_raw_ostream fos(of->os());
diff --git a/opt.cpp b/opt.cpp
index a28b5758..135a7c8c 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -479,10 +479,14 @@ Optimize(llvm::Module *module, int optLevel) {
         new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
     optPM.add(targetLibraryInfo);
 
-#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
-    optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
-#else
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
     optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
+#elif defined(LLVM_3_5)
+    optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
+#else // LLVM 3.6+
+    llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
+    dlp->doInitialization(*module);
+    optPM.add(dlp);
 #endif
 
     llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
@@ -2117,8 +2121,8 @@ static bool
 lGSToGSBaseOffsets(llvm::CallInst *callInst) {
     struct GSInfo {
         GSInfo(const char *pgFuncName, const char *pgboFuncName,
-               const char *pgbo32FuncName, bool ig)
-            : isGather(ig) {
+               const char *pgbo32FuncName, bool ig, bool ip)
+            : isGather(ig), isPrefetch(ip) {
             func = m->module->getFunction(pgFuncName);
             baseOffsetsFunc = m->module->getFunction(pgboFuncName);
             baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
@@ -2126,6 +2130,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
         llvm::Function *func;
         llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
         const bool isGather;
+        const bool isPrefetch;
     };
 
     GSInfo gsFuncs[] = {
@@ -2134,148 +2139,176 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
                                         "__pseudo_gather_factored_base_offsets32_i8",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
                                         "__pseudo_gather_factored_base_offsets32_i8",
-               true),
+               true, false),
         GSInfo("__pseudo_gather32_i16",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
                                         "__pseudo_gather_factored_base_offsets32_i16",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
                                         "__pseudo_gather_factored_base_offsets32_i16",
-               true),
+               true, false),
         GSInfo("__pseudo_gather32_i32",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
                                         "__pseudo_gather_factored_base_offsets32_i32",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
                                         "__pseudo_gather_factored_base_offsets32_i32",
-               true),
+               true, false),
         GSInfo("__pseudo_gather32_float",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
                                         "__pseudo_gather_factored_base_offsets32_float",
                g->target->hasGather() ?
"__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", - true), + true, false), GSInfo("__pseudo_gather32_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", - true), + true, false), GSInfo("__pseudo_gather32_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", - true), + true, false), GSInfo("__pseudo_scatter32_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", - false), + false, false), GSInfo("__pseudo_scatter32_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", - false), + false, false), GSInfo("__pseudo_scatter32_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", - false), + false, false), GSInfo("__pseudo_scatter32_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", - false), + false, false), GSInfo("__pseudo_scatter32_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", - false), + false, false), GSInfo("__pseudo_scatter32_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", - false), + false, false), GSInfo("__pseudo_gather64_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" : "__pseudo_gather_factored_base_offsets64_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8", - true), + true, false), GSInfo("__pseudo_gather64_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" : "__pseudo_gather_factored_base_offsets64_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", - true), + true, false), GSInfo("__pseudo_gather64_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" : "__pseudo_gather_factored_base_offsets64_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", - true), + true, false), GSInfo("__pseudo_gather64_float", g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" : "__pseudo_gather_factored_base_offsets64_float", g->target->hasGather() ? 
"__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", - true), + true, false), GSInfo("__pseudo_gather64_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" : "__pseudo_gather_factored_base_offsets64_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", - true), + true, false), GSInfo("__pseudo_gather64_double", g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" : "__pseudo_gather_factored_base_offsets64_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", - true), + true, false), GSInfo("__pseudo_scatter64_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" : "__pseudo_scatter_factored_base_offsets64_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", - false), + false, false), GSInfo("__pseudo_scatter64_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" : "__pseudo_scatter_factored_base_offsets64_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", - false), + false, false), GSInfo("__pseudo_scatter64_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" : "__pseudo_scatter_factored_base_offsets64_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", - false), + false, false), GSInfo("__pseudo_scatter64_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" : "__pseudo_scatter_factored_base_offsets64_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", - false), + false, false), GSInfo("__pseudo_scatter64_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" : "__pseudo_scatter_factored_base_offsets64_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", - false), + false, false), GSInfo("__pseudo_scatter64_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" : "__pseudo_scatter_factored_base_offsets64_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", - false), + false, false), + + GSInfo("__pseudo_prefetch_read_varying_1", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + false, true), + + GSInfo("__pseudo_prefetch_read_varying_2", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + false, true), + + GSInfo("__pseudo_prefetch_read_varying_3", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + false, true), + + GSInfo("__pseudo_prefetch_read_varying_nt", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + g->target->hasVecPrefetch() ? 
"__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + false, true), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -2301,7 +2334,8 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector, callInst); - if (basePtr == NULL || offsetVector == NULL) + if (basePtr == NULL || offsetVector == NULL || + (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false)) // It's actually a fully general gather/scatter with a varying // set of base pointers, so leave it as is and continune onward // to the next instruction... @@ -2316,7 +2350,9 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Function *gatherScatterFunc = info->baseOffsetsFunc; if ((info->isGather == true && g->target->hasGather()) || - (info->isGather == false && g->target->hasScatter())) { + (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) || + (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) { + // See if the offsets are scaled by 2, 4, or 8. If so, // extract that scale factor and rewrite the offsets to remove // it. @@ -2330,7 +2366,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { gatherScatterFunc = info->baseOffsets32Func; } - if (info->isGather) { + if (info->isGather || info->isPrefetch) { llvm::Value *mask = callInst->getArgOperand(1); // Generate a new function call to the next pseudo gather @@ -2387,7 +2423,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { gatherScatterFunc = info->baseOffsets32Func; } - if (info->isGather) { + if (info->isGather || info->isPrefetch) { llvm::Value *mask = callInst->getArgOperand(1); // Generate a new function call to the next pseudo gather @@ -2429,13 +2465,14 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { static bool lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { struct GSBOInfo { - GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig) - : isGather(ig) { + GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip) + : isGather(ig), isPrefetch(ip) { baseOffsetsFunc = m->module->getFunction(pgboFuncName); baseOffsets32Func = m->module->getFunction(pgbo32FuncName); } llvm::Function *baseOffsetsFunc, *baseOffsets32Func; const bool isGather; + const bool isPrefetch; }; GSBOInfo gsFuncs[] = { @@ -2443,63 +2480,87 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { "__pseudo_gather_factored_base_offsets32_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", - true), + true, false), GSBOInfo(g->target->hasGather() ? 
"__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", - true), + true, false), GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", - false), + false, false), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + false, true), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + false, true), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + false, true), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + g->target->hasVecPrefetch() ? 
"__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + false, true), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -4290,149 +4351,170 @@ lReplacePseudoMaskedStore(llvm::CallInst *callInst) { static bool lReplacePseudoGS(llvm::CallInst *callInst) { struct LowerGSInfo { - LowerGSInfo(const char *pName, const char *aName, bool ig) - : isGather(ig) { + LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip) + : isGather(ig), isPrefetch(ip) { pseudoFunc = m->module->getFunction(pName); actualFunc = m->module->getFunction(aName); } llvm::Function *pseudoFunc; llvm::Function *actualFunc; const bool isGather; + const bool isPrefetch; }; LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), - LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), - LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true), - LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true), - LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true), - LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true), + LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false), + LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false), + LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false), + LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false), + LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false), + LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false), - LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true), - LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true), - LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true), - LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true), - LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), - LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true), + LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false), + LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false), + LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false), + LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false), + LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false), + LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", - "__gather_factored_base_offsets32_i8", true), + "__gather_factored_base_offsets32_i8", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", - "__gather_factored_base_offsets32_i16", true), + "__gather_factored_base_offsets32_i16", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", - "__gather_factored_base_offsets32_i32", true), + "__gather_factored_base_offsets32_i32", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", - "__gather_factored_base_offsets32_float", true), + "__gather_factored_base_offsets32_float", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", - "__gather_factored_base_offsets32_i64", true), + "__gather_factored_base_offsets32_i64", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", - "__gather_factored_base_offsets32_double", true), + "__gather_factored_base_offsets32_double", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", - "__gather_factored_base_offsets64_i8", true), + "__gather_factored_base_offsets64_i8", true, false), 
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", - "__gather_factored_base_offsets64_i16", true), + "__gather_factored_base_offsets64_i16", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", - "__gather_factored_base_offsets64_i32", true), + "__gather_factored_base_offsets64_i32", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", - "__gather_factored_base_offsets64_float", true), + "__gather_factored_base_offsets64_float", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", - "__gather_factored_base_offsets64_i64", true), + "__gather_factored_base_offsets64_i64", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", - "__gather_factored_base_offsets64_double", true), + "__gather_factored_base_offsets64_double", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i8", - "__gather_base_offsets32_i8", true), + "__gather_base_offsets32_i8", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i16", - "__gather_base_offsets32_i16", true), + "__gather_base_offsets32_i16", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i32", - "__gather_base_offsets32_i32", true), + "__gather_base_offsets32_i32", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_float", - "__gather_base_offsets32_float", true), + "__gather_base_offsets32_float", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i64", - "__gather_base_offsets32_i64", true), + "__gather_base_offsets32_i64", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_double", - "__gather_base_offsets32_double", true), + "__gather_base_offsets32_double", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i8", - "__gather_base_offsets64_i8", true), + "__gather_base_offsets64_i8", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i16", - "__gather_base_offsets64_i16", true), + "__gather_base_offsets64_i16", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i32", - "__gather_base_offsets64_i32", true), + "__gather_base_offsets64_i32", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_float", - "__gather_base_offsets64_float", true), + "__gather_base_offsets64_float", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i64", - "__gather_base_offsets64_i64", true), + "__gather_base_offsets64_i64", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_double", - "__gather_base_offsets64_double", true), + "__gather_base_offsets64_double", true, false), - LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), - LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), - LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false), - LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false), - LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false), - LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false), + LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false), + LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false), + LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false), + LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false), + LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false), + LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false), - LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false), - LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false), - LowerGSInfo("__pseudo_scatter64_i32", 
"__scatter64_i32", false), - LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false), - LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false), - LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false), + LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false), + LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false), + LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false), + LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false), + LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false), + LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", - "__scatter_factored_base_offsets32_i8", false), + "__scatter_factored_base_offsets32_i8", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", - "__scatter_factored_base_offsets32_i16", false), + "__scatter_factored_base_offsets32_i16", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", - "__scatter_factored_base_offsets32_i32", false), + "__scatter_factored_base_offsets32_i32", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", - "__scatter_factored_base_offsets32_float", false), + "__scatter_factored_base_offsets32_float", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", - "__scatter_factored_base_offsets32_i64", false), + "__scatter_factored_base_offsets32_i64", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", - "__scatter_factored_base_offsets32_double", false), + "__scatter_factored_base_offsets32_double", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", - "__scatter_factored_base_offsets64_i8", false), + "__scatter_factored_base_offsets64_i8", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", - "__scatter_factored_base_offsets64_i16", false), + "__scatter_factored_base_offsets64_i16", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", - "__scatter_factored_base_offsets64_i32", false), + "__scatter_factored_base_offsets64_i32", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", - "__scatter_factored_base_offsets64_float", false), + "__scatter_factored_base_offsets64_float", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", - "__scatter_factored_base_offsets64_i64", false), + "__scatter_factored_base_offsets64_i64", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", - "__scatter_factored_base_offsets64_double", false), + "__scatter_factored_base_offsets64_double", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i8", - "__scatter_base_offsets32_i8", false), + "__scatter_base_offsets32_i8", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i16", - "__scatter_base_offsets32_i16", false), + "__scatter_base_offsets32_i16", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i32", - "__scatter_base_offsets32_i32", false), + "__scatter_base_offsets32_i32", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_float", - "__scatter_base_offsets32_float", false), + "__scatter_base_offsets32_float", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i64", - "__scatter_base_offsets32_i64", false), + "__scatter_base_offsets32_i64", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_double", - "__scatter_base_offsets32_double", false), 
+ "__scatter_base_offsets32_double", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i8", - "__scatter_base_offsets64_i8", false), + "__scatter_base_offsets64_i8", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i16", - "__scatter_base_offsets64_i16", false), + "__scatter_base_offsets64_i16", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i32", - "__scatter_base_offsets64_i32", false), + "__scatter_base_offsets64_i32", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_float", - "__scatter_base_offsets64_float", false), + "__scatter_base_offsets64_float", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i64", - "__scatter_base_offsets64_i64", false), + "__scatter_base_offsets64_i64", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_double", - "__scatter_base_offsets64_double", false), + "__scatter_base_offsets64_double", false, false), + + LowerGSInfo("__pseudo_prefetch_read_varying_1", + "__prefetch_read_varying_1", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_1_native", + "__prefetch_read_varying_1_native", false, true), + + LowerGSInfo("__pseudo_prefetch_read_varying_2", + "__prefetch_read_varying_2", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_2_native", + "__prefetch_read_varying_2_native", false, true), + + LowerGSInfo("__pseudo_prefetch_read_varying_3", + "__prefetch_read_varying_3", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_3_native", + "__prefetch_read_varying_3_native", false, true), + + LowerGSInfo("__pseudo_prefetch_read_varying_nt", + "__prefetch_read_varying_nt", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_nt_native", + "__prefetch_read_varying_nt_native", false, true), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -4459,7 +4541,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) { if (gotPosition && g->target->getVectorWidth() > 1) { if (info->isGather) PerformanceWarning(pos, "Gather required to load value."); - else + else if (!info->isPrefetch) PerformanceWarning(pos, "Scatter required to store value."); } return true; @@ -4740,6 +4822,8 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__scatter64_i8", "__scatter64_i16", "__scatter64_i32", "__scatter64_i64", "__scatter64_float", "__scatter64_double", + "__prefetch_read_varying_1", "__prefetch_read_varying_2", + "__prefetch_read_varying_3", "__prefetch_read_varying_nt", "__keep_funcs_live", }; diff --git a/run_tests.py b/run_tests.py index d48b1a2f..c540c020 100755 --- a/run_tests.py +++ b/run_tests.py @@ -647,8 +647,8 @@ def run_tests(options1, args, print_version): options.include_file = "examples/intrinsics/generic-64.h" options.target = "generic-64" elif options.target == "knc": - error("No knc #include specified; using examples/intrinsics/knc-i1x16.h\n", 2) - options.include_file = "examples/intrinsics/knc-i1x16.h" + error("No knc #include specified; using examples/intrinsics/knc.h\n", 2) + options.include_file = "examples/intrinsics/knc.h" if options.compiler_exe == None: if (options.target == "knc"): diff --git a/stdlib.ispc b/stdlib.ispc index 2ec3859e..01aae815 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -847,43 +847,19 @@ static inline void prefetch_nt(const void * uniform ptr) { } static inline void prefetch_l1(const void * varying ptr) { - const void * uniform ptrArray[programCount]; - ptrArray[programIndex] = ptr; - - foreach_active (i) { - const void * uniform p = ptrArray[i]; - prefetch_l1(p); - } + 
__pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask);
 }
 
 static inline void prefetch_l2(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_l2(p);
-    }
+    __pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);
 }
 
 static inline void prefetch_l3(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_l3(p);
-    }
+    __pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);
 }
 
 static inline void prefetch_nt(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_nt(p);
-    }
+    __pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
 }
 
 ///////////////////////////////////////////////////////////////////////////
diff --git a/stmt.cpp b/stmt.cpp
index a551b3f7..586cb0fe 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -712,7 +712,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
     }
 }
 
-
 /** Emit code for an if test that checks the mask and the test values and
     tries to be smart about jumping over code that doesn't need to be run.
  */
@@ -1101,8 +1100,10 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
     // the code for the test. This is only necessary for varying loops;
     // 'uniform' loops just jump when they hit a continue statement and
     // don't mess with the mask.
-    if (!uniformTest)
+    if (!uniformTest) {
         ctx->RestoreContinuedLanes();
+        ctx->ClearBreakLanes();
+    }
     llvm::Value *testValue = testExpr->GetValue(ctx);
     if (!testValue)
         return;
@@ -1310,6 +1311,8 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
     // test code.
     ctx->SetCurrentBasicBlock(bstep);
     ctx->RestoreContinuedLanes();
+    ctx->ClearBreakLanes();
+
     if (step)
         step->EmitCode(ctx);
     ctx->BranchInst(btest);
diff --git a/tests/prefetch-varying.ispc b/tests/prefetch-varying.ispc
new file mode 100644
index 00000000..02df84c9
--- /dev/null
+++ b/tests/prefetch-varying.ispc
@@ -0,0 +1,22 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int64 *ptr = &(a[programIndex+zero]);
+    prefetch_l1(ptr);
+    prefetch_l2(ptr);
+    prefetch_l3(ptr);
+    prefetch_nt(ptr);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
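
Note on the varying prefetch path: with this patch, prefetch_l1/l2/l3/nt on a varying pointer turn into __pseudo_prefetch_read_varying_1/2/3/nt calls carrying the pointer vector and the current execution mask, and the optimizer later rewrites those either to the __pseudo_..._native form (targets with hasVecPrefetch()) or to the plain __prefetch_read_varying_* built-ins. The C++ below is only a rough model of what a scalar, per-lane fallback for such a built-in looks like; the function name, the bitmask encoding of the mask, and the cache-level-to-locality mapping are illustrative assumptions, not ispc source, and __builtin_prefetch stands in for the llvm.prefetch intrinsic.

    // Illustrative sketch, not ispc source: per-lane fallback for a varying
    // read prefetch.  Inactive lanes are skipped; each active lane's address
    // is prefetched with a locality hint chosen from the requested cache level.
    #include <cstdint>

    static void prefetch_read_varying(const int64_t *addrs, int width,
                                      uint64_t activeMask, int cacheLevel) {
        for (int lane = 0; lane < width; ++lane) {
            if ((activeMask >> lane) & 1) {        // skip inactive lanes
                const void *p = reinterpret_cast<const void *>(addrs[lane]);
                // __builtin_prefetch(addr, rw, locality) requires constant
                // rw/locality arguments, hence the switch over literals.
                switch (cacheLevel) {
                case 1:  __builtin_prefetch(p, 0, 3); break;  // keep closest (L1)
                case 2:  __builtin_prefetch(p, 0, 2); break;
                case 3:  __builtin_prefetch(p, 0, 1); break;
                default: __builtin_prefetch(p, 0, 0); break;  // non-temporal
                }
            }
        }
    }

Targets that do report hasVecPrefetch() keep the _native pseudo form instead, so that a single hardware vector-prefetch instruction can be emitted rather than this lane-by-lane loop.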
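
The isPrefetch flag threaded through GSInfo and GSBOInfo changes two decisions in lGSToGSBaseOffsets(): whether a prefetch pseudo-call may be decomposed into the base-plus-offsets form at all, and whether the undecorated (non-factored) offsets variant is selected. The sketch below restates those two conditions as standalone C++ predicates; the struct and function names are invented for illustration and do not appear in the patch.

    // Illustrative sketch, not ispc source: the two predicates added to the
    // gather/scatter/prefetch base+offsets transformation.
    struct TargetCaps {
        bool hasGather;
        bool hasScatter;
        bool hasVecPrefetch;   // target provides a vector prefetch instruction
    };

    // Leave the call in its fully general form when no base/offset split was
    // found, or when it is a prefetch but the target has no vector prefetch
    // (the per-lane fallback handles it later).
    static bool keepGeneralForm(bool isGather, bool isPrefetch,
                                bool haveBaseAndOffsets, const TargetCaps &t) {
        return !haveBaseAndOffsets ||
               (!isGather && isPrefetch && !t.hasVecPrefetch);
    }

    // Use the non-factored base+offsets variant only when the target has a
    // native instruction for this kind of access.
    static bool useNativeBaseOffsets(bool isGather, bool isPrefetch,
                                     const TargetCaps &t) {
        return (isGather && t.hasGather) ||
               (!isGather && !isPrefetch && t.hasScatter) ||
               (!isGather && isPrefetch && t.hasVecPrefetch);
    }

In the rewrite that follows, prefetches take the gather-shaped branch (info->isGather || info->isPrefetch), since like gathers they carry the mask as an argument rather than a value to store.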
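
lReplacePseudoGS() stays table-driven: each __pseudo_* symbol maps to the concrete built-in it is rewritten to, and the new isPrefetch flag's only effect there is to suppress the "Scatter required to store value" performance warning for prefetches. A simplified, LLVM-free C++ sketch of that lookup pattern follows; the names, the string matching, and the fprintf stand-ins for PerformanceWarning are illustrative only, since the real code works on llvm::Function pointers and rewrites a llvm::CallInst in place.

    // Illustrative sketch, not ispc source: table-driven lowering of pseudo
    // gather/scatter/prefetch calls to their concrete built-ins.
    #include <cstdio>
    #include <cstring>

    struct LowerInfo {
        const char *pseudoName;   // placeholder emitted during code generation
        const char *actualName;   // built-in substituted at lowering time
        bool isGather;
        bool isPrefetch;          // prefetches never trigger a scatter warning
    };

    static const LowerInfo kTable[] = {
        { "__pseudo_gather32_i32",            "__gather32_i32",            true,  false },
        { "__pseudo_scatter32_i32",           "__scatter32_i32",           false, false },
        { "__pseudo_prefetch_read_varying_1", "__prefetch_read_varying_1", false, true  },
    };

    // Returns the replacement symbol for a pseudo call, or nullptr if the
    // called function is not one of the known pseudo built-ins.
    static const char *lowerPseudoCall(const char *calledName) {
        for (const LowerInfo &info : kTable) {
            if (std::strcmp(calledName, info.pseudoName) != 0)
                continue;
            if (info.isGather)
                std::fprintf(stderr, "perf: gather required to load value\n");
            else if (!info.isPrefetch)
                std::fprintf(stderr, "perf: scatter required to store value\n");
            return info.actualName;
        }
        return nullptr;
    }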