From 9c79d4d182ca14072583128e6b59d48a80b93102 Mon Sep 17 00:00:00 2001
From: egaburov
Date: Wed, 11 Sep 2013 12:58:02 +0200
Subject: [PATCH 001/159] added avxh with vectorWidth=4 support, use
 --target=avxh to enable it

---
 Makefile                 |   2 +-
 builtins.cpp             |   8 +
 builtins/target-avx-h.ll | 554 +++++++++++++++++++++++++++++++++++++++
 builtins/target-avxh.ll  |  81 ++++++
 ispc.cpp                 |   9 +
 5 files changed, 653 insertions(+), 1 deletion(-)
 create mode 100644 builtins/target-avx-h.ll
 create mode 100644 builtins/target-avxh.ll

diff --git a/Makefile b/Makefile
index 09ec302d..b5bb3472 100644
--- a/Makefile
+++ b/Makefile
@@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+TARGETS=avxh avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
 	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
 	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 ifneq ($(ARM_ENABLED), 0)
diff --git a/builtins.cpp b/builtins.cpp
index 886eec15..63c90337 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -920,6 +920,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     }
     case Target::AVX: {
         switch (g->target->getVectorWidth()) {
+        case 4:
+            if (runtime32) {
+                EXPORT_MODULE(builtins_bitcode_avxh_32bit);
+            }
+            else {
+                EXPORT_MODULE(builtins_bitcode_avxh_64bit);
+            }
+            break;
         case 8:
             if (runtime32) {
                 EXPORT_MODULE(builtins_bitcode_avx1_32bit);
diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll
new file mode 100644
index 00000000..d56a63b9
--- /dev/null
+++ b/builtins/target-avx-h.ll
@@ -0,0 +1,554 @@
+;; Copyright (c) 2010-2012, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;     * Redistributions of source code must retain the above copyright
+;;       notice, this list of conditions and the following disclaimer.
+;;
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;
+;;     * Neither the name of Intel Corporation nor the names of its
+;;       contributors may be used to endorse or promote products derived from
+;;       this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
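+
+;; Editorial note in comment form: this file implements the 4-wide
+;; "half width" AVX target. Single-precision operations are issued as
+;; 128-bit SSE/SSE4.1 intrinsics, while double-precision operations use
+;; the full 256-bit AVX registers (<4 x double> at a time).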
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 4-wide definitions
+
+define(`WIDTH',`4')
+define(`MASK',`i32')
+include(`util.m4')
+
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; float iv = __rcp_v(v);
+  ; return iv * (2. - v * iv);
+
+  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
+  ; do one N-R iteration
+  %v_iv = fmul <4 x float> %0, %call
+  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
+  %iv_mul = fmul <4 x float> %call, %two_minus
+  ret <4 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+;; avx intrinsic
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
+  ret <4 x double> %call
+}
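+
+;; For reference, the encoding of the rounding immediates above: bits 0-1
+;; select the mode (00 nearest, 01 down, 10 up, 11 toward zero) and bit 3
+;; suppresses precision exceptions. A truncating variant, sketched here but
+;; not wired up in this target, would therefore use immediate 0b1011 = 11:
+;;
+;;   define <4 x double> @__trunc_varying_double(<4 x double>) nounwind readonly alwaysinline {
+;;     %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 11)
+;;     ret <4 x double> %call
+;;   }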
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  ; float is = __rsqrt_v(v);
+  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
+  ; Newton-Raphson iteration to improve precision
+  ; return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <4 x float> %v, %is
+  %v_is_is = fmul <4 x float> %v_is, %is
+  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <4 x float> %is, %three_sub
+  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <4 x float> %half_scale
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+;; avx intrinsic
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
+  ret <4 x double> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones twice with our 8-wide
+; vectors...
+
+;;declare <4 x double> @__svml_sin4(<4 x double>)
+;;declare <4 x double> @__svml_cos4(<4 x double>)
+;;declare void @__svml_sincos4(<4 x double>, <4 x double> *, <4 x double> *)
+;;declare <4 x double> @__svml_tan4(<4 x double>)
+;;declare <4 x double> @__svml_atan4(<4 x double>)
+;;declare <4 x double> @__svml_atan24(<4 x double>, <4 x double>)
+;;declare <4 x double> @__svml_exp4(<4 x double>)
+;;declare <4 x double> @__svml_log4(<4 x double>)
+;;declare <4 x double> @__svml_pow4(<4 x double>, <4 x double>)
+declare <4 x float> @__svml_sin(<4 x float>)
+declare <4 x float> @__svml_cos(<4 x float>)
+declare void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *)
+declare <4 x float> @__svml_tan(<4 x float>)
+declare <4 x float> @__svml_atan(<4 x float>)
+declare <4 x float> @__svml_atan2(<4 x float>, <4 x float>)
+declare <4 x float> @__svml_exp(<4 x float>)
+declare <4 x float> @__svml_log(<4 x float>)
+declare <4 x float> @__svml_pow(<4 x float>, <4 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+;; sse intrinsics
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+;; sse intrinsic
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
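+
+;; movmskps packs the sign bit of each of the four lanes into the low four
+;; bits of its scalar result, so an all-on <4 x i32> mask yields 0xf = 15;
+;; the __any/__all/__none helpers below just compare that value with 0 and 15.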
+
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
+}
+
+define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 15
+  ret i1 %cmp
+}
+
+define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
+  %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
+  %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
+  %scalar = extractelement <4 x float> %v2, i32 0
+  ret float %scalar
+}
+
+define float @__reduce_min_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__min_varying_float, @__min_uniform_float)
+}
+
+define float @__reduce_max_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__max_varying_float, @__max_uniform_float)
+}
+
+reduce_equal(4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int8 ops
+
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline
+{
+  ; widen to <16 x i8>; indices 4..15 select lanes of the zeroinitializer operand
+  %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
+                                              <16 x i8> zeroinitializer)
+  %r0 = extractelement <2 x i64> %rv, i32 0
+  %r1 = extractelement <2 x i64> %rv, i32 1
+  %r = add i64 %r0, %r1
+  %r16 = trunc i64 %r to i16
+  ret i16 %r16
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int16 ops
+
+define internal <4 x i16> @__add_varying_i16(<4 x i16>,
+                                             <4 x i16>) nounwind readnone alwaysinline {
+  %r = add <4 x i16> %0, %1
+  ret <4 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
+  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+define <4 x i32> @__add_varying_int32(<4 x i32>,
+                                      <4 x i32>) nounwind readnone alwaysinline {
+  %s = add <4 x i32> %0, %1
+  ret <4 x i32> %s
+}
+
+define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  %s = add i32 %0, %1
+  ret i32 %s
+}
+
+define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
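+
+;; reduce4(type, vop, uniformop), used throughout these reductions, comes
+;; from util.m4 and emits a two-step shuffle-and-combine tree. Roughly (an
+;; illustrative expansion for the int32 min case, not the exact macro output):
+;;
+;;   %hi = shufflevector <4 x i32> %0, <4 x i32> undef,
+;;         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;;   %m1 = call <4 x i32> @__min_varying_int32(<4 x i32> %0, <4 x i32> %hi)
+;;   %sh = shufflevector <4 x i32> %m1, <4 x i32> undef,
+;;         <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;;   %m2 = call <4 x i32> @__min_varying_int32(<4 x i32> %m1, <4 x i32> %sh)
+;;   %r  = extractelement <4 x i32> %m2, i32 0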
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
+  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
+                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1 = shufflevector <4 x double> <double 0., double 0., double 0., double 0.>,
+                      <4 x double> undef,
+                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;; %v1 = <4 x double> <double 0., double 0., double 0., double 0.>
+  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
+  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
+  %final0 = extractelement <4 x double> %sum1, i32 0
+  %final1 = extractelement <4 x double> %sum1, i32 2
+  %sum = fadd double %final0, %final1
+
+  ret double %sum
+}
+
+define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
+  reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
+  reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define <4 x i64> @__add_varying_int64(<4 x i64>,
+                                      <4 x i64>) nounwind readnone alwaysinline {
+  %s = add <4 x i64> %0, %1
+  ret <4 x i64> %s
+}
+
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %s = add i64 %0, %1
+  ret i64 %s
+}
+
+define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+; no masked load instruction for i8 and i16 types??
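+
+;; The i8/i16 cases fall back to the generic emulation from util.m4; the
+;; i32/i64 variants below use the AVX maskload forms instead, widening the
+;; four 32-bit mask lanes to 64-bit lanes (for the pd form) by duplicating
+;; each element of the execution mask.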
+masked_load(i8, 1)
+masked_load(i16, 2)
+
+;; avx intrinsics
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+
+define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+  %floatmask = bitcast <4 x i32> %mask to <4 x float>
+  %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
+  %retval = bitcast <4 x float> %floatval to <4 x i32>
+  ret <4 x i32> %retval
+}
+
+define <4 x i64> @__masked_load_i64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+  ; double up masks, bitcast to doubles
+  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+
+  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
+
+  %vald = shufflevector <4 x double> %val0d, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %val = bitcast <4 x double> %vald to <4 x i64>
+  ret <4 x i64> %val
+}
+
+masked_load_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+gen_masked_store(i8)
+gen_masked_store(i16)
+
+; note that mask is the 2nd parameter, not the 3rd one!!
+;; avx intrinsics
+declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
+                                <4 x i32>) nounwind alwaysinline {
+  %ptr = bitcast <4 x i32> * %0 to i8 *
+  %val = bitcast <4 x i32> %1 to <4 x float>
+  %mask = bitcast <4 x i32> %2 to <4 x float>
+  call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
+  ret void
+}
+
+define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
+                                <4 x i32> %mask) nounwind alwaysinline {
+  %ptr = bitcast <4 x i64> * %0 to i8 *
+  %val = bitcast <4 x i64> %1 to <4 x double>
+
+  ; double up the mask so that each 64-bit lane gets a full 64-bit mask
+  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+
+  %val0 = shufflevector <4 x double> %val, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+  ret void
+}
+
+masked_store_blend_8_16_by_4()
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+                                             <4 x float>) nounwind readnone
+
+define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
+                                      <4 x i32> %mask) nounwind alwaysinline {
+  %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
+  %oldValue = load <4 x i32>* %0, align 4
+  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
+  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
+  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
+                                                     <4 x float> %newAsFloat,
+                                                     <4 x float> %mask_as_float)
+  %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
+  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
+  ret void
+}
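+
+;; Unlike the maskstore-based stores above, the __masked_store_blend_*
+;; variants load the old destination value, blend it with the new value
+;; under the mask, and store all lanes back. They touch every lane, so they
+;; are only appropriate when the full vector at the destination is known to
+;; be accessible.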
+;; avx intrinsic
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+
+define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+                                      <4 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <4 x i64>* %ptr, align 8
+  %mask = bitcast <4 x i32> %i32mask to <4 x float>
+
+  ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
+  ; are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01 = bitcast <4 x i64> %oldValue to <4 x i64>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01 = bitcast <4 x i64> %new to <4 x i64>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
+            <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
+                                                            <8 x float> %new01f,
+                                                            <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  %final = bitcast <4 x i64> %result01 to <4 x i64>
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
+  ret void
+}
+
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; scatter
+
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
+  ret <4 x double> %call
+}
diff --git a/builtins/target-avxh.ll b/builtins/target-avxh.ll
new file mode 100644
index 00000000..98c9111d
--- /dev/null
+++ b/builtins/target-avxh.ll
@@ -0,0 +1,81 @@
+;; Copyright (c) 2010-2011, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;     * Redistributions of source code must retain the above copyright
+;;       notice, this list of conditions and the following disclaimer.
+;;
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;
+;;     * Neither the name of Intel Corporation nor the names of its
+;;       contributors may be used to endorse or promote products derived from
+;;       this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
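+
+;; This file is the ISA-level wrapper for the 4-wide AVX target: it pulls in
+;; the width-generic definitions from target-avx-h.ll and adds the
+;; SSE4.1-based int32 min/max, the half-conversion declarations, and the
+;; factored gather implementations.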
+
+include(`target-avx-h.ll')
+
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+ifelse(NO_HALF_DECLARES, `1', `', `
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
diff --git a/ispc.cpp b/ispc.cpp
index 6d4b063d..02c23568 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -446,6 +446,15 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
     }
+    else if (!strcasecmp(isa, "avxh")) {
+        fprintf(stderr, " ISA is avxh \n");
+        this->m_isa = Target::AVX;
+        this->m_nativeVectorWidth = 4;
+        this->m_vectorWidth = 4;
+        this->m_attributes = "+avx,+popcnt,+cmov";
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 32;
+    }
     else if (!strcasecmp(isa, "avx-x2") ||
              !strcasecmp(isa, "avx1-x2") ||
              !strcasecmp(isa, "avx1-i32x16")) {

From 320c41ffcf223f4793c39c2f445ed0aed19d6270 Mon Sep 17 00:00:00 2001
From: egaburov
Date: Wed, 11 Sep 2013 15:16:50 +0200
Subject: [PATCH 002/159] added svml support. experimental. for some reason
 all symbols are visible..
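
As an illustrative invocation (the source file name here is just an example;
Intel's libsvml must be linked into the final executable):

    ispc --target=avxh --math-lib=svml foo.ispc -o foo.o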
--- .gitignore | 4 ++ Makefile | 6 +- builtins.cpp | 13 ++++ builtins/target-avx-h.ll | 27 ++------ builtins/target-avx-x2.ll | 16 +---- builtins/target-avx.ll | 18 ++---- builtins/target-generic-1.ll | 45 +++++++++---- builtins/target-generic-common.ll | 16 ++--- builtins/target-neon-common.ll | 13 ++-- builtins/target-sse2-x2.ll | 36 +++++------ builtins/target-sse2.ll | 61 ++---------------- builtins/target-sse4-16.ll | 13 +--- builtins/target-sse4-8.ll | 12 +--- builtins/target-sse4-x2.ll | 36 +++++------ builtins/target-sse4.ll | 61 ++---------------- builtins/util.m4 | 6 ++ stdlib.ispc | 102 ++++++++++++++++++++++++------ 17 files changed, 216 insertions(+), 269 deletions(-) diff --git a/.gitignore b/.gitignore index 0469cf7d..3bec2ace 100644 --- a/.gitignore +++ b/.gitignore @@ -11,5 +11,9 @@ tests*/*run examples/*/*.png examples/*/*.ppm examples/*/objs/* +*.swp +.* +!.gitignore + diff --git a/Makefile b/Makefile index b5bb3472..43f41e09 100644 --- a/Makefile +++ b/Makefile @@ -246,15 +246,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ -objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ -objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ diff --git a/builtins.cpp b/builtins.cpp index 63c90337..139b8f04 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -582,7 +582,9 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_tan", "__stdlib_tanf", "__svml_sin", + "__svml_asin", "__svml_cos", + "__svml_acos", "__svml_sincos", "__svml_tan", "__svml_atan", @@ -590,6 +592,17 @@ lSetInternalFunctions(llvm::Module *module) { "__svml_exp", "__svml_log", "__svml_pow", + "__svml_sinf", + "__svml_asinf", + "__svml_cosf", + "__svml_acosf", + "__svml_sincosf", + "__svml_tanf", + "__svml_atanf", + "__svml_atan2f", + "__svml_expf", + "__svml_logf", + "__svml_powf", "__undef_uniform", "__undef_varying", "__vec4_add_float", diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll index d56a63b9..a06e5ab3 100644 --- a/builtins/target-avx-h.ll +++ b/builtins/target-avx-h.ll @@ -154,28 +154,11 @@ define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -;;declare <4 x double> @__svml_sin4(<4 x double>) -;;declare <4 x double> @__svml_cos4(<4 x double>) -;;declare void @__svml_sincos4(<4 x double>, <4 x double> *, <4 x double> *) -;;declare <4 x double> @__svml_tan4(<4 x double>) -;;declare <4 x double> @__svml_atan4(<4 x double>) -;;declare <4 x double> @__svml_atan24(<4 x double>, <4 x double>) -;;declare <4 x double> @__svml_exp4(<4 x double>) -;;declare <4 x double> @__svml_log4(<4 x double>) -;;declare <4 x double> @__svml_pow4(<4 x double>, <4 x double>) -declare <4 x float> @__svml_sin(<4 x float>) -declare <4 x float> @__svml_cos(<4 x float>) -declare void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) -declare <4 x float> @__svml_tan(<4 x float>) -declare <4 x float> @__svml_atan(<4 x float>) -declare <4 x float> @__svml_atan2(<4 x float>, <4 x float>) -declare <4 x float> @__svml_exp(<4 x float>) -declare <4 x float> @__svml_log(<4 x float>) -declare <4 x float> @__svml_pow(<4 x float>, <4 x float>) +include(`svml.m4') +svmlf_declare(4) +svmlf_define(4) +svmld_declare(4) +svmld_define(4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d9e0322b..d646720e 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -137,19 +137,9 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +include(`svml.m4') +svmlf_stubs(16) +svmld_stubs(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 90e2f3ac..1d33e3f9 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -137,19 +137,11 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
-
-declare <8 x float> @__svml_sin(<8 x float>)
-declare <8 x float> @__svml_cos(<8 x float>)
-declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
-declare <8 x float> @__svml_tan(<8 x float>)
-declare <8 x float> @__svml_atan(<8 x float>)
-declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
-declare <8 x float> @__svml_exp(<8 x float>)
-declare <8 x float> @__svml_log(<8 x float>)
-declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
+include(`svml.m4')
+svmlf_declare(8)
+svmlf_define(8)
+svmld_declare(4)
+svmld_stubs(8)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index 31ebcdd5..910565dd 100644
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone
 ;declare float @llvm.sqrt.f32(float %Val)
 declare double @llvm.sqrt.f64(double %Val)
 declare float @llvm.sin.f32(float %Val)
+declare float @asinf(float %Val) nounwind readnone
 declare float @llvm.cos.f32(float %Val)
 declare float @llvm.sqrt.f32(float %Val)
 declare float @llvm.exp.f32(float %Val)
@@ -651,7 +652,18 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
 
-define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
+declare <1 x double> @__svml_sind(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_asind(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_cosd(<1 x double>) nounwind readnone alwaysinline
+declare void @__svml_sincosd(<1 x double>, <1 x double> *, <1 x double> *) nounwind readnone alwaysinline
+declare <1 x double> @__svml_tand(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_atand(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_atan2d(<1 x double>, <1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_expd(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_logd(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_powd(<1 x double>, <1 x double>) nounwind readnone alwaysinline
+
+define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
   ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
   ;ret <1 x float> %ret
   ;%r = extractelement <1 x float> %0, i32 0
   ;%s = call float @llvm.sin.f32(float %r)
   ;%rv = insertelement <1 x float> undef, float %r, i32 0
   ;ret <1 x float> %rv
   unary1to1(float,@llvm.sin.f32)
 
 }
 
+define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @asinf(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float,@asinf)
+
+}
+
-define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
+define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
   ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
   ;ret <1 x float> %ret
   ;%r = extractelement <1 x float> %0, i32 0
   ;%s = call float @llvm.cos.f32(float %r)
   ;%rv = insertelement <1 x float> undef, float %r, i32 0
   ;ret <1 x float> %rv
   unary1to1(float,@llvm.cos.f32)
 
 }
 
-define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
+define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
  ; %s
= call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) ; store <1 x float> %s, <1 x float> * %1 ; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) + %sin = call <1 x float> @__svml_sinf(<1 x float> %0) + %cos = call <1 x float> @__svml_cosf(<1 x float> %0) store <1 x float> %sin, <1 x float> * %1 store <1 x float> %cos, <1 x float> * %2 ret void } -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -696,7 +719,7 @@ define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { ret <1 x float > %0 } -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline { ; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) ; ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -709,7 +732,7 @@ define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { } -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret ;%y = extractelement <1 x float> %0, i32 0 @@ -722,19 +745,19 @@ define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al ret <1 x float > %0 } -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.exp.f32) } -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.log.f32) } -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret %r = extractelement <1 x float> %0, i32 0 diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 2896c6b1..bc7db9ec 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - ; FIXME: need either to wire these up to the 8-wide SVML entrypoints, ; or, use the macro to call the 4-wide ones twice with our 8-wide ; vectors... 
-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
-declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
-declare <WIDTH x float> @__svml_log(<WIDTH x float>)
-declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+;; svml
+
+include(`svml.m4')
+svmlf_stubs(WIDTH)
+svmld_stubs(WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll
index 696b0748..92fc5ce3 100644
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -316,15 +316,10 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
 ;; yuck. We need declarations of these, even though we shouldnt ever
 ;; actually generate calls to them for the NEON target...
 
-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
-declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
-declare <WIDTH x float> @__svml_log(<WIDTH x float>)
-declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+
+include(`svml.m4')
+svmlf_stubs(WIDTH)
+svmld_stubs(WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index da22a66c..5688ebba 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -105,28 +105,28 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
 
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+include(`svml.m4')
+svmlf_declare(4)
+svmld_declare(2)
+svmld_stubs(8)
 
-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline {
   unary4to8(ret, float, @__svml_sinf4, %0)
   ret <8 x float> %ret
 }
 
+define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_asinf4, %0)
+  ret <8 x float> %ret
+}
+
-define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline {
   unary4to8(ret, float, @__svml_cosf4, %0)
   ret <8 x float> %ret
 }
 
-define void @__svml_sincos(<8 x float>, <8 x float> *,
+define void @__svml_sincosf(<8 x float>, <8 x float> *,
                            <8 x float> *) nounwind readnone alwaysinline {
   ; call svml_sincosf4 two times with the two 4-wide sub-vectors
   %a = shufflevector <8 x float> %0, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %b = shufflevector <8 x float> %0, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 
   %cospa = alloca <4 x float>
   %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
 
   %cospb = alloca <4 x float>
   %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
 
   %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x float> %sin, <8 x float> * %1
 
   %cosa = load <4 x float> * %cospa
   %cosb = load <4 x float> * %cospb
   %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x float> %cos, <8 x float> * %2
 
   ret void
 }
 
-define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline {
   unary4to8(ret, float, @__svml_tanf4, %0)
   ret <8 x float> %ret
 }
 
-define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone
alwaysinline { unary4to8(ret, float, @__svml_atanf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_atan2(<8 x float>, +define <8 x float> @__svml_atan2f(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_atan2f4, %0, %1) ret <8 x float> %ret } -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_expf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_logf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_pow(<8 x float>, +define <8 x float> @__svml_powf(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_powf4, %0, %1) ret <8 x float> %ret diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index a6b206b6..236cda33 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -496,62 +496,11 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} +include(`svml.m4') 
+svmlf_declare(4) +svmld_declare(2) +svmlf_define(4) +svmld_stubs(4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index d7f3833d..3fbbe534 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -209,16 +209,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ;; svml ; FIXME - -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) +include(`svml.m4') +svmlf_stubs(8) +svmld_stubs(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index fd4b74d7..e65077b7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -222,15 +222,9 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ; FIXME -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +include(`svml.m4') +svmlf_stubs(16) +svmld_stubs(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index a7faddb3..2a69b60a 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -105,28 +105,28 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +svmlf_declare(4) +svmld_declare(2) +svmld_stubs(8) -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_sinf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_asinf4, %0) + ret <8 x float> %ret +} + +define <8 x float> 
@__svml_cosf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_cosf4, %0) ret <8 x float> %ret } -define void @__svml_sincos(<8 x float>, <8 x float> *, +define void @__svml_sincosf(<8 x float>, <8 x float> *, <8 x float> *) nounwind readnone alwaysinline { ; call svml_sincosf4 two times with the two 4-wide sub-vectors %a = shufflevector <8 x float> %0, <8 x float> undef, @@ -155,33 +155,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *, ret void } -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_tanf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_atanf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_atan2(<8 x float>, +define <8 x float> @__svml_atan2f(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_atan2f4, %0, %1) ret <8 x float> %ret } -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_expf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_logf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_pow(<8 x float>, +define <8 x float> @__svml_powf(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_powf4, %0, %1) ret <8 x float> %ret diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index e05b865f..686b4f84 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -209,62 +209,11 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) 
nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} +include(`svml.m4') +svmlf_declare(4) +svmlf_define(4) +svmld_declare(2) +svmld_stubs(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/util.m4 b/builtins/util.m4 index 95e3844d..6c90c821 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -3160,6 +3160,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline { } declare double @sin(double) nounwind readnone +declare double @asin(double) nounwind readnone declare double @cos(double) nounwind readnone declare void @sincos(double, double *, double *) nounwind readnone declare double @tan(double) nounwind readnone @@ -3174,6 +3175,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline { ret double %r } +define double @__stdlib_asin(double) nounwind readnone alwaysinline { + %r = call double @asin(double %0) + ret double %r +} + define double @__stdlib_cos(double) nounwind readnone alwaysinline { %r = call double @cos(double %0) ret double %r diff --git a/stdlib.ispc b/stdlib.ispc index e4f8844f..db9d7f36 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -2180,7 +2180,7 @@ static inline uniform float frexp(uniform float x, uniform int * uniform pw2) { __declspec(safe) static inline float sin(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_sin(x_full); + return __svml_sinf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2313,8 +2313,10 @@ static inline float asin(float x) { bool isnan = (x > 1); float v; - if (__math_lib == __math_lib_svml || - __math_lib == __math_lib_system) { + if (__math_lib == __math_lib_svml) { + return __svml_asinf(x); + } + else if (__math_lib == __math_lib_system) { float ret; foreach_active (i) { uniform float r = __stdlib_asinf(extract(x, i)); @@ -2417,7 +2419,7 @@ static inline uniform float asin(uniform float x) { __declspec(safe) static inline float cos(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_cos(x_full); + return __svml_cosf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2545,18 +2547,28 @@ static inline float acos(float v) { return 1.57079637050628662109375 - asin(v); } +__declspec(safe) +static inline double acos(const double v) { + return 1.57079637050628662109375 - asin(v); +} + __declspec(safe) static inline uniform float acos(uniform float v) { return 1.57079637050628662109375 - asin(v); } +__declspec(safe) +static inline uniform double acos(const uniform double v) { + return 1.57079637050628662109375 - asin(v); +} + __declspec(safe) static inline void sincos(float x_full, varying float * uniform sin_result, varying float * uniform cos_result) { if (__math_lib == __math_lib_svml) { - 
__svml_sincos(x_full, sin_result, cos_result); + __svml_sincosf(x_full, sin_result, cos_result); } else if (__math_lib == __math_lib_system) { foreach_active (i) { @@ -2688,7 +2700,7 @@ static inline void sincos(uniform float x_full, uniform float * uniform sin_resu __declspec(safe) static inline float tan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_tan(x_full); + return __svml_tanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2839,7 +2851,7 @@ static inline uniform float tan(uniform float x_full) { __declspec(safe) static inline float atan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_atan(x_full); + return __svml_atanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2934,7 +2946,7 @@ static inline uniform float atan(uniform float x_full) { __declspec(safe) static inline float atan2(float y, float x) { if (__math_lib == __math_lib_svml) { - return __svml_atan2(y, x); + return __svml_atan2f(y, x); } else if (__math_lib == __math_lib_system) { float ret; @@ -2997,7 +3009,7 @@ static inline float exp(float x_full) { return __exp_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_exp(x_full); + return __svml_expf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3204,7 +3216,7 @@ static inline float log(float x_full) { return __log_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_log(x_full); + return __svml_logf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3379,7 +3391,7 @@ static inline float pow(float a, float b) { return __pow_varying_float(a, b); } else if (__math_lib == __math_lib_svml) { - return __svml_pow(a, b); + return __svml_powf(a, b); } else if (__math_lib == __math_lib_system) { float ret; @@ -3469,7 +3481,11 @@ static inline uniform double frexp(uniform double x, uniform int * uniform pw2) __declspec(safe) static inline double sin(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_sind(x); + } + else if (__math_lib == __math_lib_ispc_fast) return sin((float)x); else { double ret; @@ -3490,8 +3506,30 @@ static inline uniform double sin(uniform double x) { } __declspec(safe) -static inline double cos(double x) { - if (__math_lib == __math_lib_ispc_fast) +static inline double asin(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_asind(x); + } + else if (__math_lib == __math_lib_ispc_fast) + return asin((float)x); + else { + double ret; + foreach_active (i) { + uniform double r = __stdlib_asin(extract(x, i)); + ret = insert(ret, i, r); + } + return ret; + } +} + +__declspec(safe) +static inline double cos(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_cosd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return cos((float)x); else { double ret; @@ -3514,7 +3552,11 @@ static inline uniform double cos(uniform double x) { __declspec(safe) static inline void sincos(double x, varying double * uniform sin_result, varying double * uniform cos_result) { - if (__math_lib == __math_lib_ispc_fast) { + if (__math_lib == __math_lib_svml) + { + __svml_sincosd(x, sin_result, cos_result); + } + else if (__math_lib == __math_lib_ispc_fast) { float sr, cr; sincos((float)x, &sr, &cr); *sin_result = sr; @@ -3545,7 +3587,11 @@ static inline void sincos(uniform double x, uniform double * uniform sin_result, __declspec(safe) static inline double tan(double x) { - if (__math_lib == 
__math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_tand(x); + } + else if (__math_lib == __math_lib_ispc_fast) return tan((float)x); else { double ret; @@ -3589,7 +3635,11 @@ static inline uniform double atan(uniform double x) { __declspec(safe) static inline double atan2(double y, double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_atan2d(y,x); + } + else if (__math_lib == __math_lib_ispc_fast) return atan2((float)y, (float)x); else { double ret; @@ -3611,7 +3661,11 @@ static inline uniform double atan2(uniform double y, uniform double x) { __declspec(safe) static inline double exp(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_expd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return exp((float)x); else { double ret; @@ -3633,7 +3687,11 @@ static inline uniform double exp(uniform double x) { __declspec(safe) static inline double log(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_logd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return log((float)x); else { double ret; @@ -3655,7 +3713,11 @@ static inline uniform double log(uniform double x) { __declspec(safe) static inline double pow(double a, double b) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_powd(a,b); + } + else if (__math_lib == __math_lib_ispc_fast) return pow((float)a, (float)b); else { double ret; From 7a326995735293a25fb44d5f7243521a57df719a Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 15:18:03 +0200 Subject: [PATCH 003/159] added svml.m4 --- builtins/svml.m4 | 176 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 builtins/svml.m4 diff --git a/builtins/svml.m4 b/builtins/svml.m4 new file mode 100644 index 00000000..cc3cd979 --- /dev/null +++ b/builtins/svml.m4 @@ -0,0 +1,176 @@ +;; svml + +;; stub +define(`svmlf_stubs',` + declare <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline + declare void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline + declare <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline +') + +define(`svmld_stubs',` + declare <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline + declare void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline + declare <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_atand(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_atan2d(<$1 x double>, 
<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_logd(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline +') + +;; single precision +define(`svmlf_declare',` + declare <$1 x float> @__svml_sinf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_asinf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_cosf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_sincosf$1(<$1 x float> *, <$1 x float>) nounwind readnone + declare <$1 x float> @__svml_tanf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_atanf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_atan2f$1(<$1 x float>, <$1 x float>) nounwind readnone + declare <$1 x float> @__svml_expf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_logf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_powf$1(<$1 x float>, <$1 x float>) nounwind readnone +'); + + + +define(`svmlf_define',` + define <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_sinf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + define <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_asinf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_cosf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline { + %s = call <$1 x float> @__svml_sincosf$1(<$1 x float> * %2, <$1 x float> %0) + store <$1 x float> %s, <$1 x float> * %1 + ret void + } + + define <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_tanf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_atanf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_atan2f$1(<$1 x float> %0, <$1 x float> %1) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_expf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_logf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_powf$1(<$1 x float> %0, <$1 x float> %1) + ret <$1 x float> %ret + } +') + +;; double precision +define(`svmld_declare',` + declare <$1 x double> @__svml_sin$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_asin$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_cos$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_sincos$1(<$1 x double> *, <$1 x double>) nounwind readnone + declare <$1 x double> @__svml_tan$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_atan$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_atan2$1(<$1 x 
double>, <$1 x double>) nounwind readnone
+  declare <$1 x double> @__svml_exp$1(<$1 x double>) nounwind readnone
+  declare <$1 x double> @__svml_log$1(<$1 x double>) nounwind readnone
+  declare <$1 x double> @__svml_pow$1(<$1 x double>, <$1 x double>) nounwind readnone
+')
+
+define(`svmld_define',`
+  define <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline {
+    %ret = call <$1 x double> @__svml_sin$1(<$1 x double> %0)
+    ret <$1 x double> %ret
+  }
+  define <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline {
+    %ret = call <$1 x double> @__svml_asin$1(<$1 x double> %0)
+    ret <$1 x double> %ret
+  }
+
+
+  define <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline {
+    %ret = call <$1 x double> @__svml_cos$1(<$1 x double> %0)
+    ret <$1 x double> %ret
+  }
+
+  define void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline {
+    %s = call <$1 x double> @__svml_sincos$1(<$1 x double> * %2, <$1 x double> %0)
+    store <$1 x double> %s, <$1 x double> * %1
+    ret void
+  }
+
+  define <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline {
+    %ret = call <$1 x double> @__svml_tan$1(<$1 x double> %0)
+    ret <$1 x double> %ret
+  }
+
+  define <$1 x double> @__svml_atand(<$1 x double>) nounwind readnone alwaysinline {
+    %ret = call <$1 x double> @__svml_atan$1(<$1 x double> %0)
+    ret <$1 x double> %ret
+  }
+
+  define <$1 x double> @__svml_atan2d(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline {
+    %ret = call <$1 x double> @__svml_atan2$1(<$1 x double> %0, <$1 x double> %1)
+    ret <$1 x double> %ret
+  }
+
+  define <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline {
+    %ret = call <$1 x double> @__svml_exp$1(<$1 x double> %0)
+    ret <$1 x double> %ret
+  }
+
+  define <$1 x double> @__svml_logd(<$1 x double>) nounwind readnone alwaysinline {
+    %ret = call <$1 x double> @__svml_log$1(<$1 x double> %0)
+    ret <$1 x double> %ret
+  }
+
+  define <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline {
+    %ret = call <$1 x double> @__svml_pow$1(<$1 x double> %0, <$1 x double> %1)
+    ret <$1 x double> %ret
+  }
+')
+
+;; need to implement svmld for 2x vectorWidth ...
+
+define(`svmld2_define',`
+  define <$1 x double> @__svml_sinxx(<$1 x double>) nounwind readnone alwaysinline {
+    %v0 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %v1 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+    %ret0 = call <$2 x double> @__svml_sin$2(<$2 x double> %v0)
+    %ret1 = call <$2 x double> @__svml_sin$2(<$2 x double> %v1)
+    %ret = shufflevector <$2 x double> %ret0, <$2 x double> %ret1, <$1 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    ret <$1 x double> %ret
+  }
+')

From 9cf8e8cbf3945df122bf0652326be1404634c0cb Mon Sep 17 00:00:00 2001
From: egaburov
Date: Wed, 11 Sep 2013 15:23:45 +0200
Subject: [PATCH 004/159] builtins fix for double precision svml and
 __stdlib_asin

---
 builtins.cpp | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/builtins.cpp b/builtins.cpp
index 139b8f04..816d4d78 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -576,22 +576,23 @@ lSetInternalFunctions(llvm::Module *module) {
         "__stdlib_pow",
         "__stdlib_powf",
         "__stdlib_sin",
+        "__stdlib_asin",
         "__stdlib_sincos",
         "__stdlib_sincosf",
         "__stdlib_sinf",
         "__stdlib_tan",
         "__stdlib_tanf",
-        "__svml_sin",
-        "__svml_asin",
-        "__svml_cos",
-        "__svml_acos",
-        "__svml_sincos",
-        "__svml_tan",
-        "__svml_atan",
-        "__svml_atan2",
-        "__svml_exp",
-        "__svml_log",
- "__svml_pow", + "__svml_sind", + "__svml_asind", + "__svml_cosd", + "__svml_acosd", + "__svml_sincosd", + "__svml_tand", + "__svml_atand", + "__svml_atan2d", + "__svml_expd", + "__svml_logd", + "__svml_powd", "__svml_sinf", "__svml_asinf", "__svml_cosf", From 19379db3b60a60f2f1862a54709115bcf11c7545 Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 16:48:56 +0200 Subject: [PATCH 005/159] svml cleanup --- builtins/svml.m4 | 209 +++++++++--------------------- builtins/target-avx-h.ll | 11 +- builtins/target-avx-x2.ll | 9 +- builtins/target-avx.ll | 11 +- builtins/target-generic-common.ll | 4 +- builtins/target-sse2-x2.ll | 8 +- builtins/target-sse2.ll | 12 +- builtins/target-sse4-16.ll | 4 +- builtins/target-sse4-8.ll | 4 +- builtins/target-sse4-x2.ll | 9 +- builtins/target-sse4.ll | 11 +- 11 files changed, 116 insertions(+), 176 deletions(-) diff --git a/builtins/svml.m4 b/builtins/svml.m4 index cc3cd979..9608dea6 100644 --- a/builtins/svml.m4 +++ b/builtins/svml.m4 @@ -1,176 +1,93 @@ ;; svml -;; stub -define(`svmlf_stubs',` - declare <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline - declare void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline - declare <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline +;; stubs +define(`svml_stubs',` + declare <$2 x $1> @__svml_sin$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_asin$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_cos$3(<$2 x $1>) nounwind readnone alwaysinline + declare void @__svml_sincos$3(<$2 x $1>, <$2 x $1> *, <$2 x $1> *) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_tan$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_atan$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_atan2$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_exp$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_log$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_pow$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline ') -define(`svmld_stubs',` - declare <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline - declare void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline - declare <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_atand(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_atan2d(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_logd(<$1 x 
double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline -') - -;; single precision -define(`svmlf_declare',` - declare <$1 x float> @__svml_sinf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_asinf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_cosf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_sincosf$1(<$1 x float> *, <$1 x float>) nounwind readnone - declare <$1 x float> @__svml_tanf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_atanf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_atan2f$1(<$1 x float>, <$1 x float>) nounwind readnone - declare <$1 x float> @__svml_expf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_logf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_powf$1(<$1 x float>, <$1 x float>) nounwind readnone +;; decalre __svml calls +define(`svml_declare',` + declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone '); - - -define(`svmlf_define',` - define <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_sinf$1(<$1 x float> %0) - ret <$1 x float> %ret +;; define native __svml calls +define(`svml_define',` + define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_asinf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_cosf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline { - %s = call <$1 x float> @__svml_sincosf$1(<$1 x float> * %2, <$1 x float> %0) - store <$1 x float> %s, <$1 x float> * %1 + define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline { + %s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0) + store <$3 x $1> %s, <$3 x $1> * %1 ret void } - define <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_tanf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline 
{ - %ret = call <$1 x float> @__svml_atanf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_atan2f$1(<$1 x float> %0, <$1 x float> %1) - ret <$1 x float> %ret + define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_expf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_logf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_powf$1(<$1 x float> %0, <$1 x float> %1) - ret <$1 x float> %ret + define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret } ') -;; double precision -define(`svmld_declare',` - declare <$1 x double> @__svml_sin$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_asin$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_cos$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_sincos$1(<$1 x double> *, <$1 x double>) nounwind readnone - declare <$1 x double> @__svml_tan$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_atan$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_atan2$1(<$1 x double>, <$1 x double>) nounwind readnone - declare <$1 x double> @__svml_exp$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_log$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_pow$1(<$1 x double>, <$1 x double>) nounwind readnone + +;; define x2 __svml calls +define(`svml_define_x2',` + svml_stubs($1,$3,$4) ') -define(`svmld_define',` - define <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_sin$1(<$1 x double> %0) - ret <$1 x double> %ret - } - define <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_asin$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - - define <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_cos$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline { - %s = call <$1 x double> @__svml_sincos$1(<$1 x double> * %2, <$1 x double> %0) - store <$1 x double> %s, <$1 x double> * %1 - ret void - } - - define <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_tan$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_atand(<$1 x 
double>) nounwind readnone alwaysinline {
-    %ret = call <$1 x double> @__svml_atan$1(<$1 x double> %0)
-    ret <$1 x double> %ret
-  }
-
-  define <$1 x double> @__svml_atan2d(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline {
-    %ret = call <$1 x double> @__svml_atan2$1(<$1 x double> %0, <$1 x double> %1)
-    ret <$1 x double> %ret
-  }
-
-  define <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline {
-    %ret = call <$1 x double> @__svml_exp$1(<$1 x double> %0)
-    ret <$1 x double> %ret
-  }
-
-  define <$1 x double> @__svml_logd(<$1 x double>) nounwind readnone alwaysinline {
-    %ret = call <$1 x double> @__svml_log$1(<$1 x double> %0)
-    ret <$1 x double> %ret
-  }
-
-  define <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline {
-    %ret = call <$1 x double> @__svml_pow$1(<$1 x double> %0, <$1 x double> %1)
-    ret <$1 x double> %ret
-  }
-')
-
-;; need to implement svmld for 2x vectorWidth ...
-
-define(`svmld2_define',`
-  define <$1 x double> @__svml_sinxx(<$1 x double>) nounwind readnone alwaysinline {
-    %v0 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-    %v1 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-    %ret0 = call <$2 x double> @__svml_sin$2(<$2 x double> %v0)
-    %ret1 = call <$2 x double> @__svml_sin$2(<$2 x double> %v1)
-    %ret = shufflevector <$2 x double> %ret0, <$2 x double> %ret1, <$1 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-    ret <$1 x double> %ret
-  }
+;; define x4 __svml calls
+define(`svml_define_x4',`
+  svml_stubs($1,$3,$4)
 ')

diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll
index a06e5ab3..283eaddd 100644
--- a/builtins/target-avx-h.ll
+++ b/builtins/target-avx-h.ll
@@ -155,10 +155,13 @@ define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
 ;; svml

 include(`svml.m4')
-svmlf_declare(4)
-svmlf_define(4)
-svmld_declare(4)
-svmld_define(4)
+;; single precision
+svml_declare(float,f4,4)
+svml_define(float,f4,4,f)
+
+;; double precision
+svml_declare(double,4,4)
+svml_define(double,4,4,d)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max

diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll
index d646720e..f3f1590a 100644
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -138,8 +138,13 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always
 ;; svml

 include(`svml.m4')
-svmlf_stubs(16)
-svmld_stubs(16)
+;; single precision
+svml_declare(float,f8,8)
+svml_define_x2(float,f8,8,f,16)
+
+;; double precision
+svml_declare(double,4,4)
+svml_define_x2(double,4,4,d,16)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max

diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll
index 1d33e3f9..7e7ab330 100644
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -138,10 +138,13 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;; svml

 include(`svml.m4')
-svmlf_declare(8)
-svmlf_define(8)
-svmld_declare(4)
-svmld_stubs(8)
+;; single precision
+svml_declare(float,f8,8)
+svml_define(float,f8,8,f)
+
+;; double precision
+svml_declare(double,4,4)
+svml_define_x2(double,4,4,d,8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max

diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index bc7db9ec..30a8b030 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -209,8 +209,8 @@ declare i64
@__count_leading_zeros_i64(i64) nounwind readnone ;; svml include(`svml.m4') -svmlf_stubs(WIDTH) -svmld_stubs(WIDTH) +svml_stubs(float, WIDTH, f) +svml_stubs(double, WIDTH, d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 5688ebba..9fa607a4 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -106,10 +106,12 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmld_declare(2) -svmld_stubs(8) +;; single precision +svml_declare(float,f4,4) +;; double precision +svml_declare(double,2,2) +svml_define_x4(double,2,2,d,8) define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_sinf4, %0) diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 236cda33..c858ccb6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -497,10 +497,14 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmld_declare(2) -svmlf_define(4) -svmld_stubs(4) +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,2,2) +svml_define_x2(double,2,2,d,4) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 3fbbe534..3f8cd339 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -210,8 +210,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ; FIXME include(`svml.m4') -svmlf_stubs(8) -svmld_stubs(8) +svml_stubs(float,8,f) +svml_stubs(double,8,d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index e65077b7..f43cd940 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -223,8 +223,8 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ; FIXME include(`svml.m4') -svmlf_stubs(16) -svmld_stubs(16) +svml_stubs(float,16,f) +svml_stubs(double,16,d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 2a69b60a..c45966e3 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -106,9 +106,12 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmld_declare(2) -svmld_stubs(8) +;; single precision +svml_declare(float,f4,4) + +;; double precision +svml_declare(double,2,2) +svml_define_x4(double,2,2,d,8) define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 686b4f84..eb82ab9a 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -210,10 +210,13 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmlf_define(4) -svmld_declare(2) -svmld_stubs(8) +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,2,2) +svml_define_x2(double,2,2,d,4) 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions From efc20c211061585150abb02b4720316f0e45dad5 Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 17:07:54 +0200 Subject: [PATCH 006/159] added svml support to all sse/avx modes --- builtins/svml.m4 | 44 ++++++++++++++++++--- builtins/target-avx-x2.ll | 4 +- builtins/target-avx.ll | 2 +- builtins/target-sse2-x2.ll | 79 +------------------------------------ builtins/target-sse2.ll | 2 +- builtins/target-sse4-x2.ll | 80 +------------------------------------- builtins/target-sse4.ll | 2 +- 7 files changed, 47 insertions(+), 166 deletions(-) diff --git a/builtins/svml.m4 b/builtins/svml.m4 index 9608dea6..71a6a709 100644 --- a/builtins/svml.m4 +++ b/builtins/svml.m4 @@ -83,11 +83,43 @@ define(`svml_define',` ;; define x2 __svml calls -define(`svml_define_x2',` - svml_stubs($1,$3,$4) +define(`svml_define_x',` + define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_sin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_asin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_cos$2, %0) + ret <$5 x $1> %ret + } + declare void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline + define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_tan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_atan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_exp$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_log$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_pow$2, %0, %1) + ret <$5 x $1> %ret + } ') -;; define x4 __svml calls -define(`svml_define_x4',` - svml_stubs($1,$3,$4) -') diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index f3f1590a..f8fd5cd5 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -140,11 +140,11 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always include(`svml.m4') ;; single precision svml_declare(float,f8,8) -svml_define_x2(float,f8,8,f,16) +svml_define_x(float,f8,8,f,16) ;; double precision svml_declare(double,4,4) -svml_define_x2(double,4,4,d,16) +svml_define_x(double,4,4,d,16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 7e7ab330..196e5ea4 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -144,7 +144,7 @@ svml_define(float,f8,8,f) ;; double precision svml_declare(double,4,4) -svml_define_x2(double,4,4,d,8) +svml_define_x(double,4,4,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 9fa607a4..77bf1a9d 100644 --- 
a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -108,86 +108,11 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 include(`svml.m4')
 ;; single precision
 svml_declare(float,f4,4)
+svml_define_x(float,f4,4,f,8)

 ;; double precision
 svml_declare(double,2,2)
-svml_define_x4(double,2,2,d,8)
-
-define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_sinf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_asinf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_cosf4, %0)
-  ret <8 x float> %ret
-}
-
-define void @__svml_sincosf(<8 x float>, <8 x float> *,
-                            <8 x float> *) nounwind readnone alwaysinline {
-  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
-  %a = shufflevector <8 x float> %0, <8 x float> undef,
-         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %b = shufflevector <8 x float> %0, <8 x float> undef,
-         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-
-  %cospa = alloca <4 x float>
-  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
-
-  %cospb = alloca <4 x float>
-  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
-
-  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
-         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x float> %sin, <8 x float> * %1
-
-  %cosa = load <4 x float> * %cospa
-  %cosb = load <4 x float> * %cospb
-  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
-         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x float> %cos, <8 x float> * %2
-
-  ret void
-}
-
-define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_tanf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_atanf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_atan2f(<8 x float>,
-                                  <8 x float>) nounwind readnone alwaysinline {
-  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_expf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_logf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_powf(<8 x float>,
-                                <8 x float>) nounwind readnone alwaysinline {
-  binary4to8(ret, float, @__svml_powf4, %0, %1)
-  ret <8 x float> %ret
-}
+svml_define_x(double,2,2,d,8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index c858ccb6..e42d4990 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -503,7 +503,7 @@ svml_define(float,f4,4,f)

 ;; double precision
 svml_declare(double,2,2)
-svml_define_x2(double,2,2,d,4)
+svml_define_x(double,2,2,d,4)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index c45966e3..842db53f 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -108,87 +108,11 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 include(`svml.m4')
 ;; single precision
 svml_declare(float,f4,4)
+svml_define_x(float,f4,4,f,8)

 ;; double precision
 svml_declare(double,2,2)
-svml_define_x4(double,2,2,d,8)
-
-
-define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_sinf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_asinf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_cosf4, %0)
-  ret <8 x float> %ret
-}
-
-define void @__svml_sincosf(<8 x float>, <8 x float> *,
-                            <8 x float> *) nounwind readnone alwaysinline {
-  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
-  %a = shufflevector <8 x float> %0, <8 x float> undef,
-         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %b = shufflevector <8 x float> %0, <8 x float> undef,
-         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-
-  %cospa = alloca <4 x float>
-  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
-
-  %cospb = alloca <4 x float>
-  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
-
-  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
-         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x float> %sin, <8 x float> * %1
-
-  %cosa = load <4 x float> * %cospa
-  %cosb = load <4 x float> * %cospb
-  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
-         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x float> %cos, <8 x float> * %2
-
-  ret void
-}
-
-define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_tanf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_atanf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_atan2f(<8 x float>,
-                                  <8 x float>) nounwind readnone alwaysinline {
-  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_expf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_logf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_powf(<8 x float>,
-                                <8 x float>) nounwind readnone alwaysinline {
-  binary4to8(ret, float, @__svml_powf4, %0, %1)
-  ret <8 x float> %ret
-}
+svml_define_x(double,2,2,d,8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index eb82ab9a..88be6c59 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -216,7 +216,7 @@ svml_define(float,f4,4,f)

 ;; double precision
 svml_declare(double,2,2)
-svml_define_x2(double,2,2,d,4)
+svml_define_x(double,2,2,d,4)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

From 7364e06387e7cc02f1a144097754e03181602208 Mon Sep 17 00:00:00 2001
From: egaburov
Date: Thu, 12 Sep 2013 12:02:42 +0200
Subject: [PATCH 007/159] added mask64

---
 Makefile                                      |  17 ++-
 builtins.cpp                                  |  10 +-
 .../{target-avxh.ll => target-avx-i64x4.ll}   |   2 +-
 ...arget-avx-h.ll => target-avx-i64x4base.ll} | 137 +++++++-----------
 builtins/util.m4                              |  76 +++++++++-
 ispc.cpp                                      |   5 +-
 llvmutil.cpp                                  |  22 ++-
 parse.yy                                      |   3 +
 stdlib.ispc                                   |   3 +
 9 files changed, 175 insertions(+), 100 deletions(-)
 rename builtins/{target-avxh.ll => target-avx-i64x4.ll} (98%)
 rename builtins/{target-avx-h.ll => target-avx-i64x4base.ll} (78%)

diff --git a/Makefile b/Makefile
index 43f41e09..92debe4f 100644
--- a/Makefile
+++ b/Makefile
@@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp
func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avxh avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) @@ -160,7 +160,7 @@ BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -268,20 +268,25 @@ objs/builtins-c-64.cpp: builtins/builtins.c objs/stdlib_mask1_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask1 - @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask1 > $@ objs/stdlib_mask8_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask8 - @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask8 > $@ objs/stdlib_mask16_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask16 - @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask16 > $@ objs/stdlib_mask32_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask32 - @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask32 > $@ + +objs/stdlib_mask64_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask64 + @$(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask64 > $@ diff --git a/builtins.cpp b/builtins.cpp index 816d4d78..f8d4136e 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -302,6 +302,7 @@ lCheckModuleIntrinsics(llvm::Module *module) { // check the llvm.x86.* intrinsics for now... if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) { llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID(); + if (id == 0) fprintf(stderr, "FATAL: intrinsic is not found: %s \n", funcName.c_str()); Assert(id != 0); llvm::Type *intrinsicType = llvm::Intrinsic::getType(*g->ctx, id); @@ -936,10 +937,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod switch (g->target->getVectorWidth()) { case 4: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_avxh_32bit); + EXPORT_MODULE(builtins_bitcode_avx_i64x4_32bit); } else { - EXPORT_MODULE(builtins_bitcode_avxh_64bit); + EXPORT_MODULE(builtins_bitcode_avx_i64x4_64bit); } break; case 8: @@ -1105,7 +1106,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // serialized version of the stdlib.ispc file to get its // definitions added. 
extern char stdlib_mask1_code[], stdlib_mask8_code[]; - extern char stdlib_mask16_code[], stdlib_mask32_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[]; if (g->target->getISA() == Target::GENERIC && g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib yy_scan_string(stdlib_mask32_code); @@ -1124,6 +1125,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod case 32: yy_scan_string(stdlib_mask32_code); break; + case 64: + yy_scan_string(stdlib_mask64_code); + break; default: FATAL("Unhandled mask bit size for stdlib.ispc"); } diff --git a/builtins/target-avxh.ll b/builtins/target-avx-i64x4.ll similarity index 98% rename from builtins/target-avxh.ll rename to builtins/target-avx-i64x4.ll index 98c9111d..d7dbb6bd 100644 --- a/builtins/target-avxh.ll +++ b/builtins/target-avx-i64x4.ll @@ -29,7 +29,7 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -include(`target-avx-h.ll') +include(`target-avx-i64x4base.ll') rdrand_decls() diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-i64x4base.ll similarity index 78% rename from builtins/target-avx-h.ll rename to builtins/target-avx-i64x4base.ll index 283eaddd..05bf178d 100644 --- a/builtins/target-avx-h.ll +++ b/builtins/target-avx-i64x4base.ll @@ -33,7 +33,7 @@ ;; Basic 4-wide definitions define(`WIDTH',`4') -define(`MASK',`i32') +define(`MASK',`i64') include(`util.m4') stdlib_core() @@ -185,32 +185,32 @@ define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind reado ; horizontal ops ;; sse intrinsic -declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone +declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone -define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %v64 = zext i32 %v to i64 ret i64 %v64 } -define i1 @__any(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i1 @__any(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %cmp = icmp ne i32 %v, 0 ret i1 %cmp } -define i1 @__all(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i1 @__all(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %cmp = icmp eq i32 %v, 15 ret i1 %cmp } -define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i1 @__none(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %cmp = icmp eq i32 %v, 0 ret i1 %cmp } @@ 
-392,7 +392,8 @@ masked_load(i16, 2)
 declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)

-define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
+  %mask = trunc <4 x i64> %mask64 to <4 x i32>
   %floatmask = bitcast <4 x i32> %mask to <4 x float>
   %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
   %retval = bitcast <4 x float> %floatval to <4 x i32>
@@ -400,18 +401,11 @@
 }

-define <4 x i64> @__masked_load_i64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  ; double up masks, bitcast to doubles
-  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
-     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-
-  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
-
-  %vald = shufflevector <4 x double> %val0d, <4 x double> undef,
-     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %val = bitcast <4 x double> %vald to <4 x i64>
-  ret <4 x i64> %val
+define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
+  %doublemask = bitcast <4 x i64> %mask to <4 x double>
+  %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask)
+  %retval = bitcast <4 x double> %doubleval to <4 x i64>
+  ret <4 x i64> %retval
 }

 masked_load_float_double()
@@ -428,83 +422,62 @@
 declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
 declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)

 define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
-                                <4 x i32>) nounwind alwaysinline {
-  %ptr = bitcast <4 x i32> * %0 to i8 *
-  %val = bitcast <4 x i32> %1 to <4 x float>
-  %mask = bitcast <4 x i32> %2 to <4 x float>
+                                <4 x i64>) nounwind alwaysinline {
+  %mask32 = trunc <4 x i64> %2 to <4 x i32>
+
+  %ptr  = bitcast <4 x i32> * %0 to i8 *
+  %val  = bitcast <4 x i32> %1 to <4 x float>
+  %mask = bitcast <4 x i32> %mask32 to <4 x float>
   call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
   ret void
 }

 define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
-                                <4 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast <4 x i64> * %0 to i8 *
-  %val = bitcast <4 x i64> %1 to <4 x double>
-
-  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
-     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-
-  %val0 = shufflevector <4 x double> %val, <4 x double> undef,
-     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+                                <4 x i64>) nounwind alwaysinline {
+  %ptr  = bitcast <4 x i64> * %0 to i8 *
+  %val  = bitcast <4 x i64> %1 to <4 x double>
+  %mask = bitcast <4 x i64> %2 to <4 x double>
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val)
   ret void
 }

-masked_store_blend_8_16_by_4()
+masked_store_blend_8_16_by_4_mask64()

 ;; sse intrinsic
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                              <4 x float>) nounwind readnone
-
 define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
-                                      <4 x i32> %mask) nounwind alwaysinline {
+                                      <4 x i64>) nounwind alwaysinline {
+  %mask = trunc <4 x i64> %2 to <4 x i32>
   %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
-  %oldValue = load <4 x i32>* %0, align 4
-  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
-  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
-  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
-                                                     <4 x float> %newAsFloat,
-                                                     <4 x float> %mask_as_float)
+  %oldValue = load <4 x i32>* %0, align 4
+  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
+  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
+  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
+                                                     <4 x float> %newAsFloat,
+                                                     <4 x float> %mask_as_float)
   %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
   store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
   ret void
 }

 ;; avx intrinsic
-declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
-                                                <8 x float>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone

-define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
-                                      <4 x i32> %i32mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr, align 8
-  %mask = bitcast <4 x i32> %i32mask to <4 x float>
-
-  ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
-  ; are actually bitcast <4 x i64> values
-  ;
-  ; set up the first four 64-bit values
-  %old01 = bitcast <4 x i64> %oldValue to <4 x i64>
-  %old01f = bitcast <4 x i64> %old01 to <8 x float>
-  %new01 = bitcast <4 x i64> %new to <4 x i64>
-  %new01f = bitcast <4 x i64> %new01 to <8 x float>
-  ; compute mask--note that the indices are all doubled-up
-  %mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
-     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-  ; and blend them
-  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
-                                                            <8 x float> %new01f,
-                                                            <8 x float> %mask01)
-  %result01 = bitcast <8 x float> %result01f to <4 x i64>
-
-
-  %final = bitcast <4 x i64> %result01 to <4 x i64>
-  store <4 x i64> %final, <4 x i64> * %ptr, align 8
+define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>,
+                                      <4 x i64>) nounwind alwaysinline {
+  %mask_as_double = bitcast <4 x i64> %2 to <4 x double>
+  %oldValue = load <4 x i64>* %0, align 4
+  %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double>
+  %newAsDouble = bitcast <4 x i64> %1 to <4 x double>
+  %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble,
+                                                         <4 x double> %newAsDouble,
+                                                         <4 x double> %mask_as_double)
+  %blendAsInt = bitcast <4 x double> %blend to <4 x i64>
+  store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4
   ret void
 }

diff --git a/builtins/util.m4 b/builtins/util.m4
index 6c90c821..68fa818b 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -947,6 +947,22 @@ define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
   %r = sext <$1 x i32> %0 to <$1 x i64>
   ret <$1 x i64> %r
 }
+
+define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) {
+  %r = trunc <$1 x i64> %0 to <$1 x i8>
+  ret <$1 x i8> %r
+}
+define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) {
+  %r = trunc <$1 x i64> %0 to <$1 x i16>
+  ret <$1 x i16> %r
+}
+define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) {
+  %r = trunc <$1 x i64> %0 to <$1 x i32>
+  ret <$1 x i32> %r
+}
+define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
+  ret <$1 x i64> %0
+}
 ')

 mask_converts(WIDTH)
@@ -2689,9 +2705,13 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
 }

 define <WIDTH x i32> @__sext_varying_bool(<WIDTH x MASK>) nounwind readnone alwaysinline {
-  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
-         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
-          ret <WIDTH x i32> %se')
+;;  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
+;;         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
+;;          ret <WIDTH x i32> %se')
+  ifelse(MASK,i32, `%se = bitcast <WIDTH x MASK> %0 to <WIDTH x i32>',
+         MASK,i64, `%se = trunc <WIDTH x i64> %0 to <WIDTH x i32>',
+         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>')
+  ret <WIDTH x i32> %se
 }
@@ -3508,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
 }
 ')

+define(`masked_store_blend_8_16_by_4_mask64', `
+define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
+                                     <4 x i64>) nounwind alwaysinline {
+  %old = load <4 x i8> * %0, align 1
+  ifelse(LLVM_VERSION,LLVM_3_0,`
+    %old32 = bitcast <4 x i8> %old to i32
+    %new32 = bitcast <4 x i8> %1 to i32
+
+    %mask8 = trunc <4 x i64> %2 to <4 x i8>
+    %mask32 = bitcast <4 x i8> %mask8 to i32
+    %notmask32 = xor i32 %mask32, -1
+
+    %newmasked = and i32 %new32, %mask32
+    %oldmasked = and i32 %old32, %notmask32
+    %result = or i32 %newmasked, %oldmasked
+
+    %resultvec = bitcast i32 %result to <4 x i8>
+  ',`
+    %m = trunc <4 x i64> %2 to <4 x i1>
+    %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
+  ')
+  store <4 x i8> %resultvec, <4 x i8> * %0, align 1
+  ret void
+}
+
+define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
+                                      <4 x i64>) nounwind alwaysinline {
+  %old = load <4 x i16> * %0, align 2
+  ifelse(LLVM_VERSION,LLVM_3_0,`
+    %old64 = bitcast <4 x i16> %old to i64
+    %new64 = bitcast <4 x i16> %1 to i64
+
+    %mask16 = trunc <4 x i64> %2 to <4 x i16>
+    %mask64 = bitcast <4 x i16> %mask16 to i64
+    %notmask64 = xor i64 %mask64, -1
+
+    %newmasked = and i64 %new64, %mask64
+    %oldmasked = and i64 %old64, %notmask64
+    %result = or i64 %newmasked, %oldmasked
+
+    %resultvec = bitcast i64 %result to <4 x i16>
+  ',`
+    %m = trunc <4 x i64> %2 to <4 x i1>
+    %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
+  ')
+  store <4 x i16> %resultvec, <4 x i16> * %0, align 2
+  ret void
+}
+')
+
 define(`masked_store_blend_8_16_by_8', `
 define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
                                      <8 x i32>) nounwind alwaysinline {

diff --git a/ispc.cpp b/ispc.cpp
index 02c23568..046c64c4 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -446,14 +446,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
     }
-    else if (!strcasecmp(isa, "avxh") ) {
-        fprintf(stderr, " ISA is avxh \n");
+    else if (!strcasecmp(isa, "avx-i64x4") ) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 4;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov";
         this->m_maskingIsFree = false;
-        this->m_maskBitCount = 32;
+        this->m_maskBitCount = 64;
     }
     else if (!strcasecmp(isa, "avx-x2") ||
              !strcasecmp(isa, "avx1-x2") ||

diff --git a/llvmutil.cpp b/llvmutil.cpp
index 180c8676..64691498 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -132,6 +132,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
         LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
             llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth());
         break;
+    case 64:
+        LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
+            llvm::VectorType::get(llvm::Type::getInt64Ty(*ctx), target.getVectorWidth());
+        break;
     default:
         FATAL("Unhandled mask width for initializing MaskType");
     }
@@ -183,6 +187,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
         onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1,
                                         true /*signed*/); // 0xffffffff
         break;
+    case 64:
+        onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1,
+                                        true /*signed*/); // 0xffffffffffffffff
+        break;
     default:
         FATAL("Unhandled mask width for onMask");
     }
@@ -210,6 +218,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
         offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0,
                                          true /*signed*/);
         break;
+    case 64:
+        offMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), 0,
+                                         true /*signed*/);
+        break;
     default:
         FATAL("Unhandled mask width for offMask");
     }
@@ -480,7 +492,10 @@ LLVMUInt64Vector(const uint64_t *ivec) {
 llvm::Constant *
 LLVMBoolVector(bool b) {
     llvm::Constant *v;
-    if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+    if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType)
+        v = llvm::ConstantInt::get(LLVMTypes::Int64Type, b ? 0xffffffffffffffffull : 0,
+                                   false /*unsigned*/);
+    else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
         v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0,
                                    false /*unsigned*/);
     else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
@@ -506,7 +521,10 @@ LLVMBoolVector(const bool *bvec) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target->getVectorWidth(); ++i) {
         llvm::Constant *v;
-        if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+        if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType)
+            v = llvm::ConstantInt::get(LLVMTypes::Int64Type, bvec[i] ? 0xffffffffffffffffull : 0,
+                                       false /*unsigned*/);
+        else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
             v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0,
                                        false /*unsigned*/);
         else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)

diff --git a/parse.yy b/parse.yy
index 5fc01cb0..9a2b4fc3 100644
--- a/parse.yy
+++ b/parse.yy
@@ -2183,6 +2183,9 @@ static void lAddMaskToSymbolTable(SourcePos pos) {
     case 32:
         t = AtomicType::VaryingUInt32;
         break;
+    case 64:
+        t = AtomicType::VaryingUInt64;
+        break;
     default:
         FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable");
     }

diff --git a/stdlib.ispc b/stdlib.ispc
index db9d7f36..6d7ee051 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -50,6 +50,9 @@
 #elif (ISPC_MASK_BITS == 32)
   #define IntMaskType int32
   #define UIntMaskType unsigned int32
+#elif (ISPC_MASK_BITS == 64)
  +  #define IntMaskType int64
+  #define UIntMaskType unsigned int64
 #else
   #error Unknown value of ISPC_MASK_BITS
 #endif

From 059d80cc11d0cf50d337fceb1ae04d0c3c365152 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Thu, 12 Sep 2013 17:18:12 +0200
Subject: [PATCH 008/159] included suggested changes; ./tests/launch-*.ispc
 still fails with what looks like a mask64-related problem, cause not yet
 identified; help welcome

---
 .gitignore                        |   3 -
 builtins/svml.m4                  | 124 ++++++++++++++++++++++++++----
 builtins/target-avx-i64x4.ll      |   2 +-
 builtins/target-avx-i64x4base.ll  |   2 +-
 builtins/target-generic-common.ll |   4 +-
 builtins/target-neon-common.ll    |   4 +-
 builtins/target-sse4-16.ll        |   4 +-
 builtins/target-sse4-8.ll         |   4 +-
 llvmutil.cpp                      |   2 +-
 run_tests.py                      |   2 +-
 10 files changed, 120 insertions(+), 31 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3bec2ace..88fb0197 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,8 +12,5 @@ examples/*/*.png
 examples/*/*.ppm
 examples/*/objs/*
 *.swp
-.*
-!.gitignore
-

diff --git a/builtins/svml.m4 b/builtins/svml.m4
index 71a6a709..0a587577 100644
--- a/builtins/svml.m4
+++ b/builtins/svml.m4
@@ -1,20 +1,61 @@
-;; svml
+;; copyright stub :)
+;; Copyright (c) 2013, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;; * Redistributions of source code must retain the above copyright
+;; notice, this list of conditions and the following disclaimer.
+;;
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;;
+;; * Neither the name of Intel Corporation nor the names of its
+;; contributors may be used to endorse or promote products derived from
+;; this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-;; stubs
+
+;; svml macro
+
+;; svml_stubs : stubs for svml calls
+;; $1 - type ("float" or "double")
+;; $2 - svml internal function suffix ("f" for float, "d" for double)
+;; $3 - vector width
 define(`svml_stubs',`
- declare <$2 x $1> @__svml_sin$3(<$2 x $1>) nounwind readnone alwaysinline
- declare <$2 x $1> @__svml_asin$3(<$2 x $1>) nounwind readnone alwaysinline
- declare <$2 x $1> @__svml_cos$3(<$2 x $1>) nounwind readnone alwaysinline
- declare void @__svml_sincos$3(<$2 x $1>, <$2 x $1> *, <$2 x $1> *) nounwind readnone alwaysinline
- declare <$2 x $1> @__svml_tan$3(<$2 x $1>) nounwind readnone alwaysinline
- declare <$2 x $1> @__svml_atan$3(<$2 x $1>) nounwind readnone alwaysinline
- declare <$2 x $1> @__svml_atan2$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline
- declare <$2 x $1> @__svml_exp$3(<$2 x $1>) nounwind readnone alwaysinline
- declare <$2 x $1> @__svml_log$3(<$2 x $1>) nounwind readnone alwaysinline
- declare <$2 x $1> @__svml_pow$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline
+ declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
+ declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline
+ declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline
+ declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline
+ declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline
+ declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline
+ declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
+ declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline
+ declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline
+ declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
 ')
-;; decalre __svml calls
+;; svml_declare : declaration of __svml_* intrinsics
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix
+;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
+;; double: "2"(sse) "4"(avx) "8"(avx512)
+;; $3 - vector width
 define(`svml_declare',`
 declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
 declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
@@ -28,7 +69,13 @@ define(`svml_declare',`
 declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
 ');
-;; define native __svml calls
+;; definition of __svml_* internal functions
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix
+;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
+;; double: "2"(sse) "4"(avx) "8"(avx512)
+;; $3 - vector width
+;; $4 - svml internal function suffix ("f" for float, "d" for double)
 define(`svml_define',`
 define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
 %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
@@ -82,7 +129,45 @@ define(`svml_define',`
 ')
-;; define x2 __svml calls
+;; svml_define_x : definition of __svml_* internal functions operating on an extended width
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix
+;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
+;; double: "2"(sse) "4"(avx) "8"(avx512)
+;; $3 - vector width
+;; $4 - svml internal function suffix ("f" for float, "d" for double)
+;; $5 - extended width, must be at least twice the native vector width
+;; contingent on the existence of the unary$3to$5 and binary$3to$5 macros
+
+;; *todo*: in sincos call use __svml_sincos[f][2,4,8,16] call, e.g.
+;;define void @__svml_sincosf(<8 x float>, <8 x float> *,
+;; <8 x float> *) nounwind readnone alwaysinline {
+;; ; call svml_sincosf4 two times with the two 4-wide sub-vectors
+;; %a = shufflevector <8 x float> %0, <8 x float> undef,
+;; <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;; %b = shufflevector <8 x float> %0, <8 x float> undef,
+;; <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+;;
+;; %cospa = alloca <4 x float>
+;; %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
+;;
+;; %cospb = alloca <4 x float>
+;; %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
+;;
+;; %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
+;; <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;; store <8 x float> %sin, <8 x float> * %1
+;;
+;; %cosa = load <4 x float> * %cospa
+;; %cosb = load <4 x float> * %cospb
+;; %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
+;; <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;; store <8 x float> %cos, <8 x float> * %2
+;;
+;; ret void
+;;}
 define(`svml_define_x',`
 define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
 unary$3to$5(ret, $1, @__svml_sin$2, %0)
 ret <$5 x $1> %ret
 }
@@ -96,7 +181,14 @@ define(`svml_define_x',`
 unary$3to$5(ret, $1, @__svml_cos$2, %0)
 ret <$5 x $1> %ret
 }
- declare void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
+ define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
+ {
+ %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
+ %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
+ store <$5 x $1> %s, <$5 x $1> * %1
+ store <$5 x $1> %c, <$5 x $1> * %2
+ ret void
+ }
 define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
 unary$3to$5(ret, $1, @__svml_tan$2, %0)
 ret <$5 x $1> %ret
 }
diff --git a/builtins/target-avx-i64x4.ll b/builtins/target-avx-i64x4.ll
index d7dbb6bd..65490ea5 100644
--- a/builtins/target-avx-i64x4.ll
+++ b/builtins/target-avx-i64x4.ll
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
diff --git a/builtins/target-avx-i64x4base.ll b/builtins/target-avx-i64x4base.ll
index 05bf178d..e1832030 100644
--- a/builtins/target-avx-i64x4base.ll
+++ b/builtins/target-avx-i64x4base.ll
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2012, Intel Corporation
+;; Copyright (c) 2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 30a8b030..2a5d1b32 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -209,8 +209,8 @@ declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
 ;; svml
 include(`svml.m4')
-svml_stubs(float, WIDTH, f)
-svml_stubs(double, WIDTH, d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll
index 92fc5ce3..1c0b421f 100644
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -318,8 +318,8 @@ define void @__masked_store_blend_i64(* nocapture %ptr,
 include(`svml.m4')
-svmlf_stubs(WIDTH)
-svmld_stubs(WIDTH)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
index 3f8cd339..72b81ff0 100644
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -210,8 +210,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
 ; FIXME
 include(`svml.m4')
-svml_stubs(float,8,f)
-svml_stubs(double,8,d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll
index f43cd940..69b355e3 100644
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -223,8 +223,8 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin
 ; FIXME
 include(`svml.m4')
-svml_stubs(float,16,f)
-svml_stubs(double,16,d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 64691498..275cf794 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -189,7 +189,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
 break;
 case 64:
 onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1,
- true /*signed*/); // 0xffffffff
+ true /*signed*/); // 0xffffffffffffffffull
 break;
 default:
 FATAL("Unhandled mask width for onMask");
 }
diff --git a/run_tests.py b/run_tests.py
index 9729930f..180205a0 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -75,7 +75,7 @@ if not os.path.exists(ispc_exe):
 sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe)
 sys.exit()
-ispc_exe += " " + options.ispc_flags
+ispc_exe += " -g " + options.ispc_flags
 if __name__ == '__main__':
 sys.stdout.write("ispc compiler: %s\n" % ispc_exe)
From 40af8d6ed564cc5970786459587ecdc487a1fc44 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Thu, 12 Sep 2013 20:25:44 +0200
Subject: [PATCH 009/159] fixed segfault in tests/launch-*.ispc. nativeVectorWidth in avx-i64x4 was set to 4.
Fixed --- ispc.cpp | 2 +- run_tests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index 046c64c4..1a99154b 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -448,7 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } else if (!strcasecmp(isa, "avx-i64x4") ) { this->m_isa = Target::AVX; - this->m_nativeVectorWidth = 4; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; diff --git a/run_tests.py b/run_tests.py index 180205a0..9729930f 100755 --- a/run_tests.py +++ b/run_tests.py @@ -75,7 +75,7 @@ if not os.path.exists(ispc_exe): sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe) sys.exit() -ispc_exe += " -g " + options.ispc_flags +ispc_exe += " " + options.ispc_flags if __name__ == '__main__': sys.stdout.write("ispc compiler: %s\n" % ispc_exe) From 715b82826634644eec8f95f40e53d16b8a587ca3 Mon Sep 17 00:00:00 2001 From: egaburov Date: Fri, 13 Sep 2013 09:25:52 +0200 Subject: [PATCH 010/159] fixed float constants to be read as doubles --- lex.ll | 4 ++-- parse.yy | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lex.ll b/lex.ll index 8baa627a..129f0cd5 100644 --- a/lex.ll +++ b/lex.ll @@ -440,13 +440,13 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA {FLOAT_NUMBER} { RT; - yylval.floatVal = (float)atof(yytext); + yylval.floatVal = atof(yytext); return TOKEN_FLOAT_CONSTANT; } {HEX_FLOAT_NUMBER} { RT; - yylval.floatVal = (float)lParseHexFloat(yytext); + yylval.floatVal = lParseHexFloat(yytext); return TOKEN_FLOAT_CONSTANT; } diff --git a/parse.yy b/parse.yy index 9a2b4fc3..b55d49e0 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,7 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; + double floatVal; std::string *stringVal; const char *constCharPtr; @@ -326,8 +326,8 @@ primary_expression (uint64_t)yylval.intVal, @1); } | TOKEN_FLOAT_CONSTANT { - $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - (float)yylval.floatVal, @1); + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.floatVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); From a97eb7b7cb217fb8f583314612527171488b0f79 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 13 Sep 2013 09:32:59 +0200 Subject: [PATCH 011/159] added clamp in double precision --- stdlib.ispc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/stdlib.ispc b/stdlib.ispc index 6d7ee051..0d5c4efd 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1559,6 +1559,18 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } +// double + +__declspec(safe,cost2) +static inline double clamp(double v, double low, double high) { + return min(max(v, low), high); +} + +__declspec(safe,cost2) +static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { + return min(max(v, low), high); +} + // int8 __declspec(safe,cost2) From a9913c83377614dde2ac782e298f437e45dcbd84 Mon Sep 17 00:00:00 2001 From: egaburov Date: Fri, 13 Sep 2013 10:26:15 +0200 Subject: [PATCH 012/159] changed lexer/parser to be able to read float constants, if they have "f"-suffix --- lex.ll | 23 ++++++++++++++++++++--- parse.yy | 11 ++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/lex.ll b/lex.ll index 
129f0cd5..7a3db71a 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, @@ -152,6 +152,7 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; @@ -266,6 +267,7 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; @@ -341,6 +343,8 @@ inline int ispcRand() { WHITESPACE [ \t\r]+ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. +DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*?)|(\.[0-9]+)))([eE][-+]?[0-9]+)??) +HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+?) FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) 
@@ -438,15 +442,28 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA } + +{DOUBLE_NUMBER} { + RT; + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + +{HEX_DOUBLE_NUMBER} { + RT; + yylval.doubleVal = lParseHexFloat(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + {FLOAT_NUMBER} { RT; - yylval.floatVal = atof(yytext); + yylval.floatVal = (float)atof(yytext); return TOKEN_FLOAT_CONSTANT; } {HEX_FLOAT_NUMBER} { RT; - yylval.floatVal = lParseHexFloat(yytext); + yylval.floatVal = (float)lParseHexFloat(yytext); return TOKEN_FLOAT_CONSTANT; } diff --git a/parse.yy b/parse.yy index b55d49e0..933a3455 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,8 @@ struct ForeachDimension { %union { uint64_t intVal; - double floatVal; + float floatVal; + double doubleVal; std::string *stringVal; const char *constCharPtr; @@ -185,7 +186,7 @@ struct ForeachDimension { %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -326,9 +327,13 @@ primary_expression (uint64_t)yylval.intVal, @1); } | TOKEN_FLOAT_CONSTANT { - $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), yylval.floatVal, @1); } + | TOKEN_DOUBLE_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.doubleVal, @1); + } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); } From ed825b377396b639dc0d86fe44bc0b36e29189f3 Mon Sep 17 00:00:00 2001 From: Tomasz Koziara Date: Fri, 13 Sep 2013 13:14:31 +0100 Subject: [PATCH 013/159] Uniform memory allocation fixed. --- examples/sort/sort.ispc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/sort/sort.ispc b/examples/sort/sort.ispc index 65df4736..5fc89d91 100644 --- a/examples/sort/sort.ispc +++ b/examples/sort/sort.ispc @@ -172,7 +172,7 @@ task void bumpup (uniform int h[], uniform int g[]) static void prefix_sum (uniform int num, uniform int h[]) { - uniform int * uniform g = uniform new int [num+1]; + uniform int * uniform g = uniform new uniform int [num+1]; uniform int i; launch[num] addup (h, g+1); @@ -191,9 +191,9 @@ export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int o uniform int num = ntasks < 1 ? 
num_cores () : ntasks; uniform int span = n / num; uniform int hsize = 256*programCount*num; - uniform int * uniform hist = uniform new int [hsize]; - uniform int64 * uniform pair = uniform new int64 [n]; - uniform int64 * uniform temp = uniform new int64 [n]; + uniform int * uniform hist = uniform new uniform int [hsize]; + uniform int64 * uniform pair = uniform new uniform int64 [n]; + uniform int64 * uniform temp = uniform new uniform int64 [n]; uniform int pass, i; #if DEBUG From 9861375f0c1235ea25f68211f3a82f6dcd91874c Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 13 Sep 2013 15:07:14 +0200 Subject: [PATCH 014/159] renamed avx-i64x4 -> avx1-i64x4 --- Makefile | 2 +- builtins.cpp | 4 ++-- builtins/{target-avx-i64x4.ll => target-avx1-i64x4.ll} | 2 +- .../{target-avx-i64x4base.ll => target-avx1-i64x4base.ll} | 0 ispc.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename builtins/{target-avx-i64x4.ll => target-avx1-i64x4.ll} (98%) rename builtins/{target-avx-i64x4base.ll => target-avx1-i64x4base.ll} (100%) diff --git a/Makefile b/Makefile index 92debe4f..097da238 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avx-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) diff --git a/builtins.cpp b/builtins.cpp index f8d4136e..43f68833 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -937,10 +937,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod switch (g->target->getVectorWidth()) { case 4: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_avx_i64x4_32bit); + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); } else { - EXPORT_MODULE(builtins_bitcode_avx_i64x4_64bit); + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); } break; case 8: diff --git a/builtins/target-avx-i64x4.ll b/builtins/target-avx1-i64x4.ll similarity index 98% rename from builtins/target-avx-i64x4.ll rename to builtins/target-avx1-i64x4.ll index 65490ea5..d183f1ce 100644 --- a/builtins/target-avx-i64x4.ll +++ b/builtins/target-avx1-i64x4.ll @@ -29,7 +29,7 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-include(`target-avx-i64x4base.ll')
+include(`target-avx1-i64x4base.ll')
 rdrand_decls()
diff --git a/builtins/target-avx-i64x4base.ll b/builtins/target-avx1-i64x4base.ll
similarity index 100%
rename from builtins/target-avx-i64x4base.ll
rename to builtins/target-avx1-i64x4base.ll
diff --git a/ispc.cpp b/ispc.cpp
index 1a99154b..26ca0b39 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -446,7 +446,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
 this->m_maskingIsFree = false;
 this->m_maskBitCount = 32;
 }
- else if (!strcasecmp(isa, "avx-i64x4") ) {
+ else if (!strcasecmp(isa, "avx1-i64x4") ) {
 this->m_isa = Target::AVX;
 this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
 this->m_vectorWidth = 4;
From 36886971e337c555b1b339b862653111f9cf9506 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Fri, 13 Sep 2013 16:02:53 +0200
Subject: [PATCH 015/159] revert lex.ll, parse.yy, stdlib.ispc to the state
 where all constants are floats
---
 lex.ll | 19 +------------------
 parse.yy | 11 +++--------
 stdlib.ispc | 12 ------------
 3 files changed, 4 insertions(+), 38 deletions(-)
diff --git a/lex.ll b/lex.ll
index 7a3db71a..8baa627a 100644
--- a/lex.ll
+++ b/lex.ll
@@ -76,7 +76,7 @@ static int allTokens[] = {
 TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED,
 TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE,
 TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT,
- TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT,
+ TOKEN_FLOAT_CONSTANT,
 TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT,
 TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT,
 TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT,
@@ -152,7 +152,6 @@ void ParserInit() {
 tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\"";
 tokenToName[TOKEN_DOTDOTDOT] = "...";
 tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT";
- tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT";
 tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT";
 tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT";
 tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT";
@@ -266,7 +266,6 @@ void ParserInit() {
 tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\"";
 tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'";
 tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant";
- tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant";
 tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant";
 tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant";
 tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant";
@@ -343,8 +341,6 @@ inline int ispcRand() {
 WHITESPACE [ \t\r]+
 INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*
 INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\.
-DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*?)|(\.[0-9]+)))([eE][-+]?[0-9]+)??)
-HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+?)
 FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
 HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)
@@ -442,19 +438,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA
 }
-
-{DOUBLE_NUMBER} {
- RT;
- yylval.doubleVal = atof(yytext);
- return TOKEN_DOUBLE_CONSTANT;
-}
-
-{HEX_DOUBLE_NUMBER} {
- RT;
- yylval.doubleVal = lParseHexFloat(yytext);
- return TOKEN_DOUBLE_CONSTANT;
-}
-
 {FLOAT_NUMBER} {
 RT;
 yylval.floatVal = (float)atof(yytext);
diff --git a/parse.yy b/parse.yy
index 933a3455..9a2b4fc3 100644
--- a/parse.yy
+++ b/parse.yy
@@ -149,8 +149,7 @@ struct ForeachDimension {
 %union {
 uint64_t intVal;
- float floatVal;
- double doubleVal;
+ float floatVal;
 std::string *stringVal;
 const char *constCharPtr;
@@ -186,7 +185,7 @@ struct ForeachDimension {
 %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT
 %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT
 %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT
-%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL
+%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL
 %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL
 %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP
 %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP
@@ -328,11 +327,7 @@ primary_expression
 }
 | TOKEN_FLOAT_CONSTANT {
 $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(),
- yylval.floatVal, @1);
- }
- | TOKEN_DOUBLE_CONSTANT {
- $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(),
- yylval.doubleVal, @1);
+ (float)yylval.floatVal, @1);
 }
 | TOKEN_TRUE {
 $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1);
 }
diff --git a/stdlib.ispc b/stdlib.ispc
index 0d5c4efd..6d7ee051 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -1559,18 +1559,6 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl
 return min(max(v, low), high);
 }
-// double
-
-__declspec(safe,cost2)
-static inline double clamp(double v, double low, double high) {
- return min(max(v, low), high);
-}
-
-__declspec(safe,cost2)
-static inline uniform double clamp(uniform double v, uniform double low, uniform double high) {
- return min(max(v, low), high);
-}
-
 // int8
 __declspec(safe,cost2)
From ce99b17616be754b3e30464f36fdc48bcceb22dd Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Sat, 14 Sep 2013 02:00:23 +0400
Subject: [PATCH 016/159] Fix for Windows builds to include new target: avx-i64x4
---
 ispc.cpp | 3 +-
 ispc.vcxproj | 108 +++++++++++++++++++++++++++++++--------------------
 2 files changed, 67 insertions(+), 44 deletions(-)
diff --git a/ispc.cpp b/ispc.cpp
index 26ca0b39..82f0518b 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -446,7 +446,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
 this->m_maskingIsFree = false;
 this->m_maskBitCount = 32;
 }
- else if (!strcasecmp(isa, "avx1-i64x4") ) {
+ else if (!strcasecmp(isa, "avx-i64x4") ||
+ !strcasecmp(isa, "avx1-i64x4")) {
 this->m_isa = Target::AVX;
 this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
 this->m_vectorWidth = 4;
diff --git a/ispc.vcxproj b/ispc.vcxproj
index b4a8b764..58fa5b08 100755
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -22,6 +22,8 @@
+
+
@@ -61,6 +63,7 @@
+
 4146;4800;4996;4355;4624;4005;4003;4018
@@ -103,13 +106,14 @@
 Document
- %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp;
-%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926535 | python
stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask64 > $(Configuration)/gen-stdlib-mask64.cpp; - $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp - Building gen-stdlib-{mask1,8,16,32}.cpp + $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp;$(Configuration)/gen-stdlib-mask64.cpp + Building gen-stdlib-{mask1,8,16,32,64}.cpp @@ -117,7 +121,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins/dispatch.ll | python bitcode2cpp.py dispatch.ll > $(Configuration)/gen-bitcode-dispatch.cpp $(Configuration)/gen-bitcode-dispatch.cpp - builtins\util.m4 + builtins\util.m4;builtins\svml.m4 Building gen-bitcode-dispatch.cpp @@ -126,7 +130,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp $(Configuration)/gen-bitcode-sse4-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-32bit.cpp @@ -135,7 +139,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp $(Configuration)/gen-bitcode-sse4-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-64bit.cpp @@ -144,7 +148,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp $(Configuration)/gen-bitcode-sse4-8-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-8-32bit.cpp @@ -153,7 +157,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > 
$(Configuration)/gen-bitcode-sse4-8-64bit.cpp $(Configuration)/gen-bitcode-sse4-8-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-8-64bit.cpp @@ -162,7 +166,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp $(Configuration)/gen-bitcode-sse4-16-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-16-32bit.cpp @@ -171,7 +175,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp $(Configuration)/gen-bitcode-sse4-16-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-16-64bit.cpp @@ -180,7 +184,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-32bit.cpp @@ -189,7 +193,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-64bit.cpp @@ -198,7 +202,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp $(Configuration)/gen-bitcode-sse2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-32bit.cpp @@ -207,7 +211,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp $(Configuration)/gen-bitcode-sse2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-64bit.cpp @@ -216,7 +220,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-32bit.cpp @@ -225,7 +229,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp 
$(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-64bit.cpp @@ -234,7 +238,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-32bit.cpp @@ -243,7 +247,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-64bit.cpp @@ -252,7 +256,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-32bit.cpp @@ -261,16 +265,34 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-64bit.cpp + + Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp $(Configuration)/gen-bitcode-avx11-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-32bit.cpp @@ -279,7 +301,7 @@ Document m4 -Ibuiltins/ 
-DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp $(Configuration)/gen-bitcode-avx11-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-64bit.cpp @@ -288,7 +310,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-32bit.cpp @@ -297,7 +319,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-64bit.cpp @@ -306,7 +328,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp $(Configuration)/gen-bitcode-avx2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-32bit.cpp @@ -315,7 +337,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp $(Configuration)/gen-bitcode-avx2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-64bit.cpp @@ -324,7 +346,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-32bit.cpp @@ -333,7 +355,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-64bit.cpp @@ -342,7 +364,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > 
$(Configuration)/gen-bitcode-generic-1-32bit.cpp $(Configuration)/gen-bitcode-generic-1-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-32bit.cpp @@ -351,7 +373,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp $(Configuration)/gen-bitcode-generic-1-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-64bit.cpp @@ -360,7 +382,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp $(Configuration)/gen-bitcode-generic-4-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-32bit.cpp @@ -369,7 +391,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp $(Configuration)/gen-bitcode-generic-4-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-64bit.cpp @@ -378,7 +400,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp $(Configuration)/gen-bitcode-generic-8-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-32bit.cpp @@ -387,7 +409,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp $(Configuration)/gen-bitcode-generic-8-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-64bit.cpp @@ -396,7 +418,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp $(Configuration)/gen-bitcode-generic-16-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-32bit.cpp @@ -405,7 +427,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp $(Configuration)/gen-bitcode-generic-16-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-64bit.cpp @@ -414,7 +436,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% 
-DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp $(Configuration)/gen-bitcode-generic-32-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-32bit.cpp @@ -423,7 +445,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp $(Configuration)/gen-bitcode-generic-32-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-64bit.cpp @@ -432,7 +454,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp $(Configuration)/gen-bitcode-generic-64-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-32bit.cpp @@ -441,7 +463,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp $(Configuration)/gen-bitcode-generic-64-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-64bit.cpp From 97068765e884599afc4cc4b7187a4de4dd509b46 Mon Sep 17 00:00:00 2001 From: Tomasz Koziara Date: Sat, 14 Sep 2013 18:09:04 +0100 Subject: [PATCH 017/159] Copyright reversed. --- examples/sort/sort.cpp | 4 ++-- examples/sort/sort.ispc | 4 ++-- examples/sort/sort_serial.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 1d05b247..4f402c75 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/examples/sort/sort.ispc b/examples/sort/sort.ispc index 5fc89d91..25ea90f4 100644 --- a/examples/sort/sort.ispc +++ b/examples/sort/sort.ispc @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/examples/sort/sort_serial.cpp b/examples/sort/sort_serial.cpp index ba955c77..38bbdda6 100644 --- a/examples/sort/sort_serial.cpp +++ b/examples/sort/sort_serial.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. From e2a91e6de5fdcd370b903b2670e76be14c60dc09 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 16 Sep 2013 15:54:32 +0200 Subject: [PATCH 018/159] added support for "d"-suffix --- lex.ll | 20 +++++++++++++++++++- parse.yy | 11 ++++++++--- stdlib.ispc | 12 ++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/lex.ll b/lex.ll index 8baa627a..c2990ccc 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, @@ -152,6 +152,7 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; @@ -266,6 +267,7 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; @@ -343,6 +345,8 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) +DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[dD]?)|(\.[0-9]+)))([dD][-+]?[0-9]+)?[dD]?) +HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[dD]?) 
IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -438,6 +442,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA } + {FLOAT_NUMBER} { RT; yylval.floatVal = (float)atof(yytext); @@ -450,6 +455,19 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } +{DOUBLE_NUMBER} { + RT; + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + +{HEX_DOUBLE_NUMBER} { + RT; + yylval.doubleVal = lParseHexFloat(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + + "++" { RT; return TOKEN_INC_OP; } "--" { RT; return TOKEN_DEC_OP; } "<<" { RT; return TOKEN_LEFT_OP; } diff --git a/parse.yy b/parse.yy index 9a2b4fc3..933a3455 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,8 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; + float floatVal; + double doubleVal; std::string *stringVal; const char *constCharPtr; @@ -185,7 +186,7 @@ struct ForeachDimension { %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -327,7 +328,11 @@ primary_expression } | TOKEN_FLOAT_CONSTANT { $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - (float)yylval.floatVal, @1); + yylval.floatVal, @1); + } + | TOKEN_DOUBLE_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.doubleVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); diff --git a/stdlib.ispc b/stdlib.ispc index 6d7ee051..0d5c4efd 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1559,6 +1559,18 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } +// double + +__declspec(safe,cost2) +static inline double clamp(double v, double low, double high) { + return min(max(v, low), high); +} + +__declspec(safe,cost2) +static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { + return min(max(v, low), high); +} + // int8 __declspec(safe,cost2) From 233249048127b94cdb073e694f18987b643741d2 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 16 Sep 2013 16:31:41 +0200 Subject: [PATCH 019/159] added fortran_double_constant --- lex.ll | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lex.ll b/lex.ll index c2990ccc..3d88a23a 100644 --- a/lex.ll +++ b/lex.ll @@ -345,8 +345,7 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) -DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[dD]?)|(\.[0-9]+)))([dD][-+]?[0-9]+)?[dD]?) -HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[dD]?) 
+FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9])|([0-9]+[dD][-+]?[0-9])) IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -455,18 +454,19 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } -{DOUBLE_NUMBER} { +{FORTRAN_DOUBLE_NUMBER} { RT; + { + int i = 0; + while (yytext[i] != 'd') i++; + if ((yytext[i+1] >= '0' && yytext[i+1] <= '9') + || yytext[i+1] == '+' || yytext[i+1] == '-') + yytext[i] = 'E'; + } yylval.doubleVal = atof(yytext); return TOKEN_DOUBLE_CONSTANT; } -{HEX_DOUBLE_NUMBER} { - RT; - yylval.doubleVal = lParseHexFloat(yytext); - return TOKEN_DOUBLE_CONSTANT; -} - "++" { RT; return TOKEN_INC_OP; } "--" { RT; return TOKEN_DEC_OP; } From 6fd21d988d999b62aa0e2832cd93ccdb4ca78f77 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 16 Sep 2013 17:15:02 +0200 Subject: [PATCH 020/159] fixed lexer to properly read fortran-notation double constants --- lex.ll | 26 +++++++++++++------------- stdlib.ispc | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lex.ll b/lex.ll index 3d88a23a..ca318dbb 100644 --- a/lex.ll +++ b/lex.ll @@ -345,7 +345,9 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) -FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9])|([0-9]+[dD][-+]?[0-9])) +FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)) + + IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -440,6 +442,16 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return lParseInteger(true); } +{FORTRAN_DOUBLE_NUMBER} { + RT; + { + int i = 0; + while (yytext[i] != 'd') i++; + yytext[i] = 'E'; + } + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} {FLOAT_NUMBER} { @@ -454,18 +466,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } -{FORTRAN_DOUBLE_NUMBER} { - RT; - { - int i = 0; - while (yytext[i] != 'd') i++; - if ((yytext[i+1] >= '0' && yytext[i+1] <= '9') - || yytext[i+1] == '+' || yytext[i+1] == '-') - yytext[i] = 'E'; - } - yylval.doubleVal = atof(yytext); - return TOKEN_DOUBLE_CONSTANT; -} "++" { RT; return TOKEN_INC_OP; } diff --git a/stdlib.ispc b/stdlib.ispc index 0d5c4efd..9b02d0ba 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -2564,7 +2564,7 @@ static inline float acos(float v) { __declspec(safe) static inline double acos(const double v) { - return 1.57079637050628662109375 - asin(v); + return 1.57079637050628662109375d0 - asin(v); } @@ -2575,7 +2575,7 @@ static inline uniform float acos(uniform float v) { __declspec(safe) static inline uniform double acos(const uniform double v) { - return 1.57079637050628662109375 - asin(v); + return 1.57079637050628662109375d0 - asin(v); } From eef4e11768222914ffb93ccc1ab698e1cfbd7922 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 16 Sep 2013 17:25:13 +0200 Subject: [PATCH 021/159] now it is also case nonsensitive --- lex.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lex.ll b/lex.ll index ca318dbb..f1dcaa6f 100644 --- a/lex.ll +++ b/lex.ll @@ -446,7 +446,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return 
TOKEN_STRING_LITERA
     RT;
     {
       int i = 0;
-      while (yytext[i] != 'd') i++;
+      while (yytext[i] != 'd' && yytext[i] != 'D') i++;
       yytext[i] = 'E';
     }
     yylval.doubleVal = atof(yytext);

From 6e0b9ddc74a4480e97d9b19c66e4ad8de5d5198a Mon Sep 17 00:00:00 2001
From: Tomasz Koziara
Date: Mon, 16 Sep 2013 18:02:07 +0100
Subject: [PATCH 022/159] Sort description.

---
 examples/README.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/examples/README.txt b/examples/README.txt
index 5b47df44..b67529c1 100644
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -146,6 +146,11 @@ This is a simple "hello world" type program that shows a ~10 line application
 program calling out to a ~5 line ispc program to do a simple
 computation.
 
+Sort
+====
+This is a bucket sort of 32-bit unsigned integers.
+By default, 1000000 random elements get sorted.
+Run ./sort N to sort N elements instead.
 
 Volume
 ======

From fa78d548ccc17c4a844762bd5660e49d941f9383 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Tue, 17 Sep 2013 23:36:16 +0400
Subject: [PATCH 023/159] Test, documentation and vim support for double
 precision constants

---
 contrib/ispc.vim         |  5 +++++
 docs/ispc.rst            | 11 ++++++++++-
 tests/double-consts.ispc | 23 +++++++++++++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 tests/double-consts.ispc

diff --git a/contrib/ispc.vim b/contrib/ispc.vim
index cc8493f0..4d870dcd 100644
--- a/contrib/ispc.vim
+++ b/contrib/ispc.vim
@@ -19,6 +19,11 @@ syn keyword ispcRepeat cdo cfor cwhile
 syn keyword ispcBuiltin programCount programIndex
 syn keyword ispcType export uniform varying int8 int16 int32 int64
 
+"double precision floating point number, with dot, optional exponent
+syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>"
+"double precision floating point number, without dot, with exponent
+syn match cFloat display contained "\d\+d[-+]\=\d\+\>"
+
 " Default highlighting
 command -nargs=+ HiLink hi def link
 HiLink ispcStatement Statement
diff --git a/docs/ispc.rst b/docs/ispc.rst
index ff07f6d8..224faaa9 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -270,6 +270,14 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``,
 and ``in``.  Any program that happens to have a variable or function with
 one of these names must be modified to rename that symbol.
 
+Updating ISPC Programs For Changes In ISPC 1.4.5
+------------------------------------------------
+
+This release adds support for double precision floating point constants.
+A double precision floating point constant is a floating point number
+with a ``d`` suffix and an optional exponent part.  Here are some
+examples: 3.14d, 31.4d-1, 1.d, 1.0d, 1d-2.  Note that a floating point
+number without a suffix is still treated as a single precision constant.
 
 Getting Started with ISPC
 =========================
@@ -1349,7 +1357,8 @@ but are likely to be supported in future releases:
 * Bitfield members of ``struct`` types
 * Variable numbers of arguments to functions
 * Literal floating-point constants (even without a ``f`` suffix) are
-  currently treated as being ``float`` type, not ``double``
+  currently treated as being ``float`` type, not ``double``.  To get a
+  double precision floating point constant, use the ``d`` suffix.
 * The ``volatile`` qualifier
 * The ``register`` storage class for variables.  (Will be ignored).
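
The lexer change in patches 019-021 comes down to an in-place rewrite: find
the first 'd'/'D' in the matched text, overwrite it with 'E', and let atof()
parse the result.  A minimal standalone C++ sketch of that normalization
(parseFortranDouble is an illustrative name, not code from any patch here):

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Rewrite the first 'd'/'D' exponent marker to 'E' so the C library
    // parser accepts the Fortran-style constant, mirroring the lex.ll
    // action above.
    static double parseFortranDouble(const char *text) {
        char buf[64];
        std::strncpy(buf, text, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';
        for (char *p = buf; *p; ++p)
            if (*p == 'd' || *p == 'D') { *p = 'E'; break; }
        return std::atof(buf);
    }

    int main() {
        // 3.14d0 -> 3.14, 31.4d-1 -> 3.14, 1d40 -> 1e40
        std::printf("%g %g %g\n", parseFortranDouble("3.14d0"),
                    parseFortranDouble("31.4d-1"), parseFortranDouble("1d40"));
        return 0;
    }

Rewriting the matched buffer in place keeps the flex action to a few lines,
which is exactly the trade-off the patches above make.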
diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc new file mode 100644 index 00000000..3259156a --- /dev/null +++ b/tests/double-consts.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double a = aFOO[programIndex]; + // Test parsing of double constants. + double d1 = 1.0d40; + double d2 = 1.d40; + double d3 = 1d40; + double d4 = 10000000000000000000000000000000000000000.d; + double d5 = 10000000000000000000000000000000000000000.0d; + + // All the constants should be equal and if it's evaluated as "float", + // then sqrt will evaluate to +inf. + if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && + ((float)sqrt(d1)) < 2e20) { + RET[programIndex] = a; + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} From 922edb11281ae432bc1647445dfa556de8fd663f Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 18:14:07 +0300 Subject: [PATCH 024/159] completed knc-i1x16.h and added knc-i1x8.h with knc-i1x8unsafe_fast.h that doesnt pass several tests.. --- examples/intrinsics/knc-i1x16.h | 3092 +++++++++++++++++++++ examples/intrinsics/knc-i1x8.h | 2862 +++++++++++++++++++ examples/intrinsics/knc-i1x8unsafe_fast.h | 2 + run_tests.py | 7 +- 4 files changed, 5961 insertions(+), 2 deletions(-) create mode 100644 examples/intrinsics/knc-i1x16.h create mode 100644 examples/intrinsics/knc-i1x8.h create mode 100644 examples/intrinsics/knc-i1x8unsafe_fast.h diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h new file mode 100644 index 00000000..8b1a2bb9 --- /dev/null +++ b/examples/intrinsics/knc-i1x16.h @@ -0,0 +1,3092 @@ +/** + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#include <stdint.h>
+#include <math.h>
+#include <assert.h>
+#include <algorithm>
+#include <immintrin.h>
+#include <zmmintrin.h>
+
+#ifdef _MSC_VER
+#define FORCEINLINE __forceinline
+#define PRE_ALIGN(x)  /*__declspec(align(x))*/
+#define POST_ALIGN(x)
+#define roundf(x) (floorf(x + .5f))
+#define round(x) (floor(x + .5))
+#else
+#define FORCEINLINE __attribute__((always_inline))
+#define PRE_ALIGN(x)
+#define POST_ALIGN(x)  __attribute__ ((aligned(x)))
+#endif
+
+#if 0
+#define KNC 1
+extern "C"
+{
+  int printf(const unsigned char *, ...);
+  int puts(unsigned char *);
+  unsigned int putchar(unsigned int);
+  int fflush(void *);
+  uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t);
+  uint8_t *memset(uint8_t *, uint8_t, uint64_t);
+  void memset_pattern16(void *, const void *, uint64_t);
+}
+#endif
+
+typedef float __vec1_f;
+typedef double __vec1_d;
+typedef int8_t __vec1_i8;
+typedef int16_t __vec1_i16;
+typedef int32_t __vec1_i32;
+typedef int64_t __vec1_i64;
+
+struct __vec16_i1 {
+    __vec16_i1() { }
+    __vec16_i1(const __mmask16 &vv) : v(vv) { }
+    __vec16_i1(bool v0, bool v1, bool v2, bool v3,
+               bool v4, bool v5, bool v6, bool v7,
+               bool v8, bool v9, bool v10, bool v11,
+               bool v12, bool v13, bool v14, bool v15) {
+        v = ((v0 & 1) |
+             ((v1 & 1) << 1) |
+             ((v2 & 1) << 2) |
+             ((v3 & 1) << 3) |
+             ((v4 & 1) << 4) |
+             ((v5 & 1) << 5) |
+             ((v6 & 1) << 6) |
+             ((v7 & 1) << 7) |
+             ((v8 & 1) << 8) |
+             ((v9 & 1) << 9) |
+             ((v10 & 1) << 10) |
+             ((v11 & 1) << 11) |
+             ((v12 & 1) << 12) |
+             ((v13 & 1) << 13) |
+             ((v14 & 1) << 14) |
+             ((v15 & 1) << 15));
+    }
+
+    __mmask16 v;
+    FORCEINLINE operator __mmask16() const { return v; }
+};
+
+
+template <typename T>
+struct vec16 {
+    vec16() { }
+    vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+          T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) {
+        data[0] = v0;   data[1] = v1;   data[2] = v2;   data[3] = v3;
+        data[4] = v4;   data[5] = v5;   data[6] = v6;   data[7] = v7;
+        data[8] = v8;   data[9] = v9;   data[10] = v10; data[11] = v11;
+        data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15;
+    }
+    T data[16];
+    FORCEINLINE const T& operator[](const int i) const { return data[i]; }
+    FORCEINLINE T& operator[](const int i) { return data[i]; }
+};
+
+#if 0 /* evghenii:i32 */
+struct PRE_ALIGN(64) __vec16_i32 : public vec16<int32_t> {
+    __vec16_i32() { }
+    __vec16_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3,
+                int32_t v4, int32_t v5, int32_t v6, int32_t v7,
+                int32_t v8, int32_t v9, int32_t v10, int32_t v11,
+                int32_t v12, int32_t v13, int32_t v14, int32_t v15)
+        : vec16<int32_t>(v0, v1, v2, v3, v4, v5, v6, v7,
+                         v8, v9, v10, v11, v12, v13, v14, v15) { }
+} POST_ALIGN(64);
+#else /* evghenii:i32 */
+struct PRE_ALIGN(64) __vec16_i32
+{
+    __m512i v;
+    FORCEINLINE operator __m512i() const { return v; }
+    FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {}
+    FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {}
+    FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {}
+    FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {}
+    FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; }
+    FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03,
+                            int32_t v04, int32_t v05, int32_t v06, int32_t v07,
+                            int32_t v08, int32_t v09, int32_t v10, int32_t v11,
+                            int32_t v12, int32_t v13, int32_t v14, int32_t v15) :
+        v ( _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {}
+    FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; }
+    FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; }
+}
POST_ALIGN(64); +#endif /* evghenii:i32 */ + +#if 0 /* evghenii::f */ +PRE_ALIGN(64) struct __vec16_f : public vec16 { + __vec16_f() { } + __vec16_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7, + float v8, float v9, float v10, float v11, + float v12, float v13, float v14, float v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(64); +#else /* evghenii::f */ +PRE_ALIGN(64) struct __vec16_f +{ + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) : + v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(64); +#endif /* evghenii::f */ + +#if 0 /* evghenii::d */ +PRE_ALIGN(128) struct __vec16_d : public vec16 { + __vec16_d() { } + __vec16_d(double v0, double v1, double v2, double v3, + double v4, double v5, double v6, double v7, + double v8, double v9, double v10, double v11, + double v12, double v13, double v14, double v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(128); +#else /* evghenii::d */ +struct PRE_ALIGN(128) __vec16_d +{ + __m512d v1; + __m512d v2; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double v10, double v11, + double v12, double v13, double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } +} POST_ALIGN(128); +#endif /* evghenii::d */ + +#if 1 /* evghenii::i64 */ +PRE_ALIGN(128) struct __vec16_i64 : public vec16 { + __vec16_i64() { } + __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7, + int64_t v8, int64_t v9, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(128); +#else /* evghenii::i64 */ +struct PRE_ALIGN(64) __vec16_i64 { + FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} + FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return 
*this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + } + __m512i v_hi; + __m512i v_lo; +} POST_ALIGN(64); + +#endif /* evghenii::i64 */ + +PRE_ALIGN(16) struct __vec16_i8 : public vec16 { + __vec16_i8() { } + __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec16_i16 : public vec16 { + __vec16_i16() { } + __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec16_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... 
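+
+// For reference: each generator macro below expands to a scalar fallback
+// that loops over the 16 lanes.  The expansion shown here is illustrative
+// only; e.g. BINARY_OP(__vec16_i8, __add, +) produces, in effect:
+//
+//   static FORCEINLINE __vec16_i8 __add(__vec16_i8 a, __vec16_i8 b) {
+//       __vec16_i8 ret;
+//       for (int i = 0; i < 16; ++i)
+//           ret[i] = a[i] + b[i];
+//       return ret;
+//   }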
+
+#define UNARY_OP(TYPE, NAME, OP)                \
+static FORCEINLINE TYPE NAME(TYPE v) {          \
+    TYPE ret;                                   \
+    for (int i = 0; i < 16; ++i)                \
+        ret[i] = OP(v[i]);                      \
+    return ret;                                 \
+}
+
+#define BINARY_OP(TYPE, NAME, OP)               \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {  \
+    TYPE ret;                                   \
+    for (int i = 0; i < 16; ++i)                \
+        ret[i] = a[i] OP b[i];                  \
+    return ret;                                 \
+}
+
+#define BINARY_OP_CAST(TYPE, CAST, NAME, OP)    \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {  \
+    TYPE ret;                                   \
+    for (int i = 0; i < 16; ++i)                \
+        ret[i] = (CAST)(a[i]) OP (CAST)(b[i]);  \
+    return ret;                                 \
+}
+
+#define BINARY_OP_FUNC(TYPE, NAME, FUNC)        \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {  \
+    TYPE ret;                                   \
+    for (int i = 0; i < 16; ++i)                \
+        ret[i] = FUNC(a[i], b[i]);              \
+    return ret;                                 \
+}
+
+#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP)                              \
+static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) {           \
+    __vec16_i1 ret;                                                       \
+    ret.v = 0;                                                            \
+    for (int i = 0; i < 16; ++i)                                          \
+        ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i;                     \
+    return ret;                                                           \
+}                                                                         \
+static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b,  \
+                                                         __vec16_i1 mask) { \
+    __vec16_i1 ret;                                                       \
+    ret.v = 0;                                                            \
+    for (int i = 0; i < 16; ++i)                                          \
+        ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i;                     \
+    ret.v &= mask.v;                                                      \
+    return ret;                                                           \
+}
+
+#define INSERT_EXTRACT(VTYPE, STYPE)                                      \
+static FORCEINLINE STYPE __extract_element(VTYPE v, int index) {          \
+    return ((STYPE *)&v)[index];                                          \
+}                                                                         \
+static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
+    ((STYPE *)v)[index] = val;                                            \
+}
+
+#define LOAD_STORE(VTYPE, STYPE)                       \
+template <int ALIGN>                                   \
+static FORCEINLINE VTYPE __load(const VTYPE *p) {      \
+    STYPE *ptr = (STYPE *)p;                           \
+    VTYPE ret;                                         \
+    for (int i = 0; i < 16; ++i)                       \
+        ret[i] = ptr[i];                               \
+    return ret;                                        \
+}                                                      \
+template <int ALIGN>                                   \
+static FORCEINLINE void __store(VTYPE *p, VTYPE v) {   \
+    STYPE *ptr = (STYPE *)p;                           \
+    for (int i = 0; i < 16; ++i)                       \
+        ptr[i] = v[i];                                 \
+}
+
+#define LOADS(VTYPE, STYPE)                            \
+template <int ALIGN>                                   \
+static FORCEINLINE VTYPE __load(const VTYPE *p) {      \
+    STYPE *ptr = (STYPE *)p;                           \
+    VTYPE ret;                                         \
+    for (int i = 0; i < 16; ++i)                       \
+        ret[i] = ptr[i];                               \
+    return ret;                                        \
+}                                                      \
+
+#define STORES(VTYPE, STYPE)                           \
+template <int ALIGN>                                   \
+static FORCEINLINE void __store(VTYPE *p, VTYPE v) {   \
+    STYPE *ptr = (STYPE *)p;                           \
+    for (int i = 0; i < 16; ++i)                       \
+        ptr[i] = v[i];                                 \
+}
+
+#define REDUCE_ADD(TYPE, VTYPE, NAME)           \
+static FORCEINLINE TYPE NAME(VTYPE v) {         \
+    TYPE ret = v[0];                            \
+    for (int i = 1; i < 16; ++i)                \
+        ret = ret + v[i];                       \
+    return ret;                                 \
+}
+
+#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP)                    \
+static FORCEINLINE TYPE NAME(VTYPE v) {                         \
+    TYPE ret = v[0];                                            \
+    for (int i = 1; i < 16; ++i)                                \
+        ret = (ret OP (TYPE)v[i]) ? ret : (TYPE)v[i];           \
+    return ret;                                                 \
+}
+
+#define SELECT(TYPE)                                                 \
+static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) {  \
+    TYPE ret;                                                        \
+    for (int i = 0; i < 16; ++i)                                     \
+        ret[i] = (mask.v & (1<<i)) ? a[i] : b[i];                    \
+    return ret;                                                      \
+}                                                                    \
+static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) {        \
+    return cond ? a : b;                                             \
+}
+
+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP)                          \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {                    \
+    TYPE ret;                                                        \
+    for (int i = 0; i < 16; ++i)                                     \
+        ret[i] = (CAST)(a[i]) OP b;                                  \
+    return ret;                                                      \
+}
+
+#define SMEAR(VTYPE, NAME, STYPE)                        \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) {  \
+    VTYPE ret;                                           \
+    for (int i = 0; i < 16; ++i)                         \
+        ret[i] = v;                                      \
+    return ret;                                          \
+}
+
+#define SETZERO(VTYPE, NAME)                             \
+template <class RetVecType> VTYPE __setzero_##NAME();    \
+template <> FORCEINLINE VTYPE __setzero_##NAME() {       \
+    VTYPE ret;                                           \
+    for (int i = 0; i < 16; ++i)                         \
+        ret[i] = 0;                                      \
+    return ret;                                          \
+}
+
+#define UNDEF(VTYPE, NAME)                               \
+template <class RetVecType> VTYPE __undef_##NAME();      \
+template <> FORCEINLINE VTYPE __undef_##NAME() {         \
+    return VTYPE();                                      \
+}
+
+#define BROADCAST(VTYPE, NAME, STYPE)                    \
+static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
+    VTYPE ret;                                           \
+    for (int i = 0; i < 16; ++i)                         \
+        ret[i] = v[index & 0xf];                         \
+    return ret;                                          \
+}                                                        \
+
+#define ROTATE(VTYPE, NAME, STYPE)                       \
+static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \
+    VTYPE ret;                                           \
+    for (int i = 0; i < 16; ++i)                         \
+        ret[i] = v[(i+index) & 0xf];                     \
+    return ret;                                          \
+}                                                        \
+
+#define SHUFFLES(VTYPE, NAME, STYPE)                     \
+static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \
+    VTYPE ret;                                           \
+    for (int i = 0; i < 16; ++i)                         \
+        ret[i] = v[__extract_element(index, i) & 0xf];   \
+    return ret;                                          \
+}                                                        \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \
+    VTYPE ret;                                           \
+    for (int i = 0; i < 16; ++i) {                       \
+        int ii = __extract_element(index, i) & 0x1f;     \
+        ret[i] = (ii < 16) ? v0[ii] : v1[ii-16];         \
+    }                                                    \
+    return ret;                                          \
+}
+
+#define SHUFFLE2(VTYPE, NAME, STYPE)                     \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \
+    VTYPE ret;                                           \
+    for (int i = 0; i < 16; ++i) {                       \
+        int ii = __extract_element(index, i) & 0x1f;     \
+        ret[i] = (ii < 16) ?
v0[ii] : v1[ii-16]; \ + } \ + return ret; \ +} + +/////////////////////////////////////////////////////////////////////////// + +INSERT_EXTRACT(__vec1_i8, int8_t) +INSERT_EXTRACT(__vec1_i16, int16_t) +INSERT_EXTRACT(__vec1_i32, int32_t) +INSERT_EXTRACT(__vec1_i64, int64_t) +INSERT_EXTRACT(__vec1_f, float) +INSERT_EXTRACT(__vec1_d, double) + +/////////////////////////////////////////////////////////////////////////// +// mask ops + +static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) { + return (uint64_t)mask.v; +} + +static FORCEINLINE bool __any(__vec16_i1 mask) { + return (mask.v!=0); +} + +static FORCEINLINE bool __all(__vec16_i1 mask) { + return (mask.v==0xFFFF); +} + +static FORCEINLINE bool __none(__vec16_i1 mask) { + return (mask.v==0); +} + +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = (a.v & b.v) | (~a.v & ~b.v); + return r; +} + +static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v & b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v ^ b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v | b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) { + __vec16_i1 r; + r.v = ~v.v; + return r; +} + +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = ~a.v & b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v & ~b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, + __vec16_i1 b) { + __vec16_i1 r; + r.v = (a.v & mask.v) | (b.v & ~mask.v); + return r; +} + +static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { + return cond ? a : b; +} + +static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { + return (vec.v & (1 << index)) ? 
true : false; +} + +static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); +} + +template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) { + uint16_t *ptr = (uint16_t *)p; + __vec16_i1 r; + r.v = *ptr; + return r; +} + +template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) { + uint16_t *ptr = (uint16_t *)p; + *ptr = v.v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { + return i?0xFFFF:0x0; +} + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { + return 0; +} + +template __vec16_i1 __undef_i1(); +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { + return __vec16_i1(); +} + + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec16_i8, __add, +) +BINARY_OP(__vec16_i8, __sub, -) +BINARY_OP(__vec16_i8, __mul, *) + +BINARY_OP(__vec16_i8, __or, |) +BINARY_OP(__vec16_i8, __and, &) +BINARY_OP(__vec16_i8, __xor, ^) +BINARY_OP(__vec16_i8, __shl, <<) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec16_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<) + +CMP_OP(__vec16_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec16_i8) +INSERT_EXTRACT(__vec16_i8, int8_t) +SMEAR(__vec16_i8, i8, int8_t) +SETZERO(__vec16_i8, i8) +UNDEF(__vec16_i8, i8) +BROADCAST(__vec16_i8, i8, int8_t) +ROTATE(__vec16_i8, i8, int8_t) +SHUFFLES(__vec16_i8, i8, int8_t) +LOAD_STORE(__vec16_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec16_i16, __add, +) +BINARY_OP(__vec16_i16, __sub, -) +BINARY_OP(__vec16_i16, __mul, *) + +BINARY_OP(__vec16_i16, __or, |) +BINARY_OP(__vec16_i16, __and, &) +BINARY_OP(__vec16_i16, __xor, ^) +BINARY_OP(__vec16_i16, __shl, <<) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec16_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<) + +CMP_OP(__vec16_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i16, i16, 
int16_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec16_i16) +INSERT_EXTRACT(__vec16_i16, int16_t) +SMEAR(__vec16_i16, i16, int16_t) +SETZERO(__vec16_i16, i16) +UNDEF(__vec16_i16, i16) +BROADCAST(__vec16_i16, i16, int16_t) +ROTATE(__vec16_i16, i16, int16_t) +SHUFFLES(__vec16_i16, i16, int16_t) +LOAD_STORE(__vec16_i16, int16_t) + +#if 0 /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec16_i32, __add, +) +BINARY_OP(__vec16_i32, __sub, -) +BINARY_OP(__vec16_i32, __mul, *) + +BINARY_OP(__vec16_i32, __or, |) +BINARY_OP(__vec16_i32, __and, &) +BINARY_OP(__vec16_i32, __xor, ^) +BINARY_OP(__vec16_i32, __shl, <<) + +BINARY_OP_CAST(__vec16_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec16_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<) + +CMP_OP(__vec16_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >) + +SELECT(__vec16_i32) +INSERT_EXTRACT(__vec16_i32, int32_t) +SMEAR(__vec16_i32, i32, int32_t) +SETZERO(__vec16_i32, i32) +UNDEF(__vec16_i32, i32) +BROADCAST(__vec16_i32, i32, int32_t) +ROTATE(__vec16_i32, i32, int32_t) +SHUFFLES(__vec16_i32, i32, int32_t) +LOAD_STORE(__vec16_i32, int32_t) + +#else /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { + return _mm512_add_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { + return _mm512_sub_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) { + return _mm512_mullo_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { + return _mm512_div_epu32(a, b); +} + +static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { + return _mm512_div_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { + return _mm512_rem_epu32(a, b); +} + +static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { + return _mm512_rem_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) { + return _mm512_or_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) { + return _mm512_and_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __xor(__vec16_i32 a, __vec16_i32 b) { + return _mm512_xor_epi32(a, b); +} + 
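+
+// Note on the shift wrappers that follow: __lshr maps to _mm512_srlv_epi32
+// (logical shift, zeroes shifted in) while __ashr maps to _mm512_srav_epi32
+// (arithmetic shift, sign bit replicated).  For a lane holding -8
+// (0xFFFFFFF8) shifted right by 1, __ashr yields -4 (0xFFFFFFFC) while
+// __lshr yields 0x7FFFFFFC.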
+static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) { + return _mm512_sllv_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { + return _mm512_srlv_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { + return _mm512_srav_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) { + return _mm512_slli_epi32(a, n); +} + +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { + return _mm512_srli_epi32(a, n); +} + +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { + return _mm512_srai_epi32(a, n); +} + +static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { + return _mm512_cmpeq_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, + __vec16_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpneq_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmple_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmple_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpge_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpge_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmplt_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmplt_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpgt_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpgt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpgt_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, 
__vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpgt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, + __vec16_i32 a, __vec16_i32 b) { + return _mm512_mask_mov_epi32(b.v, mask, a.v); +} + +static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) { + return cond ? a : b; +} + +static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { //uint32_t index) { + return ((int32_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +template RetVecType __smear_i32(int32_t i); +template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { + return _mm512_set1_epi32(i); +} + +static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); +static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); +static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); +static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + +template RetVecType __setzero_i32(); +template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { + return _mm512_setzero_epi32(); +} + +template RetVecType __undef_i32(); +template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { + return __vec16_i32(); +} + +static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_epi32(val); +} + +#if 0 /* evghenii::doesn't work */ +static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7)); + return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); +} +#else +ROTATE(__vec16_i32, i32, int32_t) +#endif + +static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { + return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); +} +SHUFFLE2(__vec16_i32, i32, int32_t) /* evghenii::to implement */ + +template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_load_epi32(p); +#else + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; +#endif +} + + +template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_store_epi32(p, v); +#else + _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 +template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { + _mm512_store_epi32(p, v); +} +#endif +#endif /* evghenii::int32 */ + +/////////////////////////////////////////////////////////////////////////// +// int64 + +BINARY_OP(__vec16_i64, __add, +) +BINARY_OP(__vec16_i64, __sub, -) +BINARY_OP(__vec16_i64, __mul, *) + +BINARY_OP(__vec16_i64, __or, |) +BINARY_OP(__vec16_i64, __and, &) +BINARY_OP(__vec16_i64, __xor, ^) +BINARY_OP(__vec16_i64, __shl, <<) + +BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) + 
+BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) + +CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec16_i64) +INSERT_EXTRACT(__vec16_i64, int64_t) +SMEAR(__vec16_i64, i64, int64_t) +SETZERO(__vec16_i64, i64) +UNDEF(__vec16_i64, i64) +BROADCAST(__vec16_i64, i64, int64_t) +ROTATE(__vec16_i64, i64, int64_t) +SHUFFLES(__vec16_i64, i64, int64_t) +LOAD_STORE(__vec16_i64, int64_t) + + +#if 0 /* evghenii::float */ +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec16_f, __add, +) +BINARY_OP(__vec16_f, __sub, -) +BINARY_OP(__vec16_f, __mul, *) +BINARY_OP(__vec16_f, __div, /) + +CMP_OP(__vec16_f, float, float, __equal, ==) +CMP_OP(__vec16_f, float, float, __not_equal, !=) +CMP_OP(__vec16_f, float, float, __less_than, <) +CMP_OP(__vec16_f, float, float, __less_equal, <=) +CMP_OP(__vec16_f, float, float, __greater_than, >) +CMP_OP(__vec16_f, float, float, __greater_equal, >=) + +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec16_f) +INSERT_EXTRACT(__vec16_f, float) +SMEAR(__vec16_f, float, float) +SETZERO(__vec16_f, float) +UNDEF(__vec16_f, float) +BROADCAST(__vec16_f, float, float) +ROTATE(__vec16_f, float, float) +SHUFFLES(__vec16_f, float, float) +LOAD_STORE(__vec16_f, float) +#else /* evghenii::float */ + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { + return _mm512_add_ps(a, b); +} + +static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { + return _mm512_sub_ps(a, b); +} + +#if 1 /* evghenii::this two fails assert-3.ispc test */ +static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { + return _mm512_mul_ps(a, b); +} + +static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { + return _mm512_div_ps(a, b); +} +#else +BINARY_OP(__vec16_f, __mul, *) +BINARY_OP(__vec16_f, __div, /) +#endif + + +static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpeq_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpneq_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { + return _mm512_cmplt_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { + return _mm512_cmple_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { +// return _mm512_cmpnle_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { +// return _mm512_mask_cmpnle_ps_mask(m, a, b); + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { +// return _mm512_cmpnlt_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { +// return _mm512_mask_cmpnlt_ps_mask(m, a, b); + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpord_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpunord_ps_mask(a, b); +} + +static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { + return _mm512_mask_mov_ps(b, mask, a); +} + +static FORCEINLINE __vec16_f __select(bool cond, __vec16_f a, __vec16_f b) { + return cond ? 
a : b; +} + +static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { + return v[index]; + // return ((float *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { + (*v)[index] = val; +// ((float *)v)[index] = val; +} + +template RetVecType __smear_float(float f); +template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { + return _mm512_set_1to16_ps(f); +} + +template RetVecType __setzero_float(); +template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { + return _mm512_setzero_ps(); +} + +template RetVecType __undef_float(); +template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { + return __vec16_f(); +} + +static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) { + float val = __extract_element(v, index & 0xf); + return _mm512_set1_ps(val); +} + +#if 1 +static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +#endif +ROTATE(__vec16_f, float, float) +SHUFFLE2(__vec16_f, float, float) + +template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_load_ps(p); +#else + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; +#endif +} + +template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_store_ps(p, v); +#else + _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 +template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { + _mm512_store_ps(p, v); +} +template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { + return _mm512_load_ps(p); +} +#endif + +#endif /* evghenii::float */ + +static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} +static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); } + +static FORCEINLINE float __log_uniform_float(float v) { return logf(v);} +static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { return _mm512_log_ps(v); } + +static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);} +static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +static FORCEINLINE float __half_to_float_uniform(int16_t h) { + static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift + + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? + o += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) { // Zero/Denormal? 
+ o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } + + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +} + + +static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) { + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +#if 0 /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec16_d, __add, +) +BINARY_OP(__vec16_d, __sub, -) +BINARY_OP(__vec16_d, __mul, *) +BINARY_OP(__vec16_d, __div, /) + +CMP_OP(__vec16_d, double, double, __equal, ==) +CMP_OP(__vec16_d, double, double, __not_equal, !=) +CMP_OP(__vec16_d, double, double, __less_than, <) +CMP_OP(__vec16_d, double, double, __less_equal, <=) +CMP_OP(__vec16_d, double, double, __greater_than, >) +CMP_OP(__vec16_d, double, double, __greater_equal, >=) + +static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec16_d) +INSERT_EXTRACT(__vec16_d, double) +SMEAR(__vec16_d, double, double) +SETZERO(__vec16_d, double) +UNDEF(__vec16_d, double) +BROADCAST(__vec16_d, double, double) +ROTATE(__vec16_d, double, double) +SHUFFLES(__vec16_d, double, double) +LOAD_STORE(__vec16_d, double) +#else /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_add_pd(a.v1, b.v1); + ret.v2 = _mm512_add_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_sub_pd(a.v1, b.v1); + ret.v2 = _mm512_sub_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_mul_pd(a.v1, b.v1); + ret.v2 = _mm512_mul_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_div_pd(a.v1, b.v1); + ret.v2 = _mm512_div_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); + __vec16_i1 tmp_m = m; + ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, 
b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) { + __vec16_d ret; + __vec16_i1 tmp_m = mask; + ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); + return ret; +} + + +static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) { + return cond ? 
a : b; +} + +static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { + return ((double *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { + ((double *)v)[index] = val; +} + +template RetVecType __smear_double(double d); +template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { + __vec16_d ret; + ret.v1 = _mm512_set1_pd(d); + ret.v2 = _mm512_set1_pd(d); + return ret; +} + +template RetVecType __setzero_double(); +template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { + __vec16_d ret; + ret.v1 = _mm512_setzero_pd(); + ret.v2 = _mm512_setzero_pd(); + return ret; +} + +template RetVecType __undef_double(); +template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { + return __vec16_d(); +} + +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) { + __vec16_d ret; + double val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_set1_pd(val); + ret.v2 = _mm512_set1_pd(val); + return ret; +} + +ROTATE(__vec16_d, double, double) +SHUFFLES(__vec16_d, double, double) + +template static FORCEINLINE __vec16_d __load(const __vec16_d *p) { + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; +} + +template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +} + + +#if 0 +template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { + __vec16_d ret; + ret.v1 = _mm512_load_pd(p); + ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); + return ret; +} +template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { + return __load<64>(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); +} +template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { + __store<64>(p, v); +} +#endif +#endif /* evghenii::double */ + +/////////////////////////////////////////////////////////////////////////// +// casts + + +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (STO)((SFROM)(val[i])); \ + return ret; \ +} + +// sign extension conversions +CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) + +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + ret[i] = 0; \ + if (v.v & (1 << i)) \ + ret[i] = ~ret[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec16_i8) 
+CAST_SEXT_I1(__vec16_i16) +#if 0 +CAST_SEXT_I1(__vec16_i32) +#else +static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); +} +#endif +CAST_SEXT_I1(__vec16_i64) + +// zero extension +CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) + +#define CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (v.v & (1 << i)) ? 1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec16_i8) +CAST_ZEXT_I1(__vec16_i16) +#if 0 +CAST_ZEXT_I1(__vec16_i32) +#else +static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); +} +#endif +CAST_ZEXT_I1(__vec16_i64) + +// truncations +CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) + +// signed int to float/double +#if 0 +CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) +CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) +CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) +#if 0 +CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) { + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = 
_mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} +#endif +CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double +#if 0 +CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) +CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) +CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepu32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec16_f, float, __vec16_i64, uint64_t, __cast_uitofp) +#if 0 +CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) { + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} +#endif +CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) + +#if 0 +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = (v.v & (1 << i)) ? 1. 
: 0.;
+ return ret;
+}
+#else
+static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v)
+{
+ const __m512 ret = _mm512_setzero_ps();
+ const __m512 one = _mm512_set1_ps(1.0);
+ return _mm512_mask_mov_ps(ret, v, one);
+}
+#endif
+
+// float/double to signed int
+CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi)
+CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi)
+#if 0
+CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi)
+#else
+static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) {
+ return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE);
+}
+#endif
+CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi)
+CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi)
+CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi)
+#if 1
+CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi)
+#else
+#endif
+CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi)
+
+// float/double to unsigned int
+CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui)
+CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui)
+#if 0
+CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui)
+#else
+static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) {
+ return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE);
+}
+#endif
+CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui)
+CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui)
+CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui)
+#if 1
+CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui)
+#else
+#endif
+CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui)
+
+// float/double conversions
+#if 1
+CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc)
+CAST(__vec16_d, double, __vec16_f, float, __cast_fpext)
+#else
+static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) {
+ __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1));
+ __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2));
+
+ // merge v2's converted floats into the upper eight lanes of v1's result
+ return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA));
+}
+static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) {
+ __vec16_d ret;
+ // low half (v1) holds elements 0..7, high half (v2) holds elements 8..15
+ ret.v1 = _mm512_cvtpslo_pd(val.v);
+ __m512 other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC));
+ ret.v2 = _mm512_cvtpslo_pd(other8);
+ return ret;
+}
+#endif
+
+typedef union {
+ int32_t i32;
+ float f;
+ int64_t i64;
+ double d;
+} BitcastUnion;
+
+#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \
+static FORCEINLINE TO __cast_bits(TO, FROM val) { \
+ TO r; \
+ for (int i = 0; i < 16; ++i) { \
+ BitcastUnion u; \
+ u.FROM_ELT = val[i]; \
+ r[i] = u.TO_ELT; \
+ } \
+ return r; \
+}
+
+#if 0
+CAST_BITS(__vec16_f, f, __vec16_i32, i32)
+CAST_BITS(__vec16_i32, i32, __vec16_f, f)
+#else
+static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) {
+ return _mm512_castsi512_ps(val);
+}
+static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) {
+ return _mm512_castps_si512(val);
+}
+#endif
+
+#if 0
+CAST_BITS(__vec16_d, d, __vec16_i64, i64)
+CAST_BITS(__vec16_i64, i64, __vec16_d, d)
+#else
+static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) {
+ return *(__vec16_i64*)&val;
+}
+static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) {
+ return *(__vec16_d*)&val;
+}
+#endif
+
+#define CAST_BITS_SCALAR(TO, FROM) \
+static FORCEINLINE TO __cast_bits(TO, FROM v) { \
+ union { \
+ TO to; \
+ FROM from; \
+ } u; \
+ u.from = v; \
+ return u.to; \
+}
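+// (note: CAST_BITS_SCALAR reinterprets the bits through a union, which
+// avoids the strict-aliasing pitfalls of the pointer casts used for the
+// __vec16_i64/__vec16_d __cast_bits variants above.)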
+
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+
+static FORCEINLINE void __fastmath() {
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+ return roundf(v);
+}
+
+static FORCEINLINE float __floor_uniform_float(float v) {
+ return floorf(v);
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+ return ceilf(v);
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+ return round(v);
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+ return floor(v);
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+ return ceil(v);
+}
+
+#if 0
+UNARY_OP(__vec16_f, __round_varying_float, roundf)
+UNARY_OP(__vec16_f, __floor_varying_float, floorf)
+UNARY_OP(__vec16_f, __ceil_varying_float, ceilf)
+#else
+static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) {
+ return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);
+}
+
+static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) {
+ return _mm512_floor_ps(v);
+}
+
+static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) {
+ return _mm512_ceil_ps(v);
+}
+#endif
+
+#if 0
+UNARY_OP(__vec16_d, __round_varying_double, round)
+UNARY_OP(__vec16_d, __floor_varying_double, floor)
+UNARY_OP(__vec16_d, __ceil_varying_double, ceil)
+#else
+static FORCEINLINE __vec16_d __round_varying_double(__vec16_d v) {
+ __vec16_d ret;
+ ret.v1 = _mm512_svml_round_pd(v.v1);
+ ret.v2 = _mm512_svml_round_pd(v.v2);
+ return ret;
+}
+
+static FORCEINLINE __vec16_d __floor_varying_double(__vec16_d v) {
+ __vec16_d ret;
+ ret.v1 = _mm512_floor_pd(v.v1);
+ ret.v2 = _mm512_floor_pd(v.v2);
+ return ret;
+}
+
+static FORCEINLINE __vec16_d __ceil_varying_double(__vec16_d v) {
+ __vec16_d ret;
+ ret.v1 = _mm512_ceil_pd(v.v1);
+ ret.v2 = _mm512_ceil_pd(v.v2);
+ return ret;
+}
+#endif
+
+
+// min/max
+
+static FORCEINLINE float __min_uniform_float(float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float __max_uniform_float(float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32(int32_t a, int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE uint32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE uint32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64(int64_t a, int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE uint64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE uint64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ?
a : b; } + + +#if 0 +BINARY_OP_FUNC(__vec16_f, __max_varying_float, __max_uniform_float) +BINARY_OP_FUNC(__vec16_f, __min_varying_float, __min_uniform_float) +BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double) +BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double) +#else +static FORCEINLINE __vec16_f __max_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmax_ps(v1, v2);} +static FORCEINLINE __vec16_f __min_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2);} +static FORCEINLINE __vec16_d __max_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmax_pd(v1.v1, v2.v1),_mm512_gmax_pd(v1.v2,v2.v2));} +static FORCEINLINE __vec16_d __min_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmin_pd(v1.v1, v2.v1),_mm512_gmin_pd(v1.v2,v2.v2));} +#endif + +#if 0 +BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32) +BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32) +BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32) +BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32) +#else +static FORCEINLINE __vec16_i32 __max_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epi32(v1, v2);} +static FORCEINLINE __vec16_i32 __min_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epi32(v1, v2);} +static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epu32(v1, v2);} +static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epu32(v1, v2);} +#endif + +BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64) + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + return 1.f / sqrtf(v); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + return 1.f / v; +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + return sqrtf(v); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + return sqrt(v); +} + +#if 0 +UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) +UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) +#else +static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. 
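+ // (note: no Newton-Raphson refinement is applied to the rcp23
+ // approximation in this fast-math path; builds needing a full-precision
+ // reciprocal should take the SVML _mm512_recip_ps() path below.)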
+#else
+ return _mm512_recip_ps(v);
+#endif
+}
+
+static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
+#ifdef ISPC_FAST_MATH
+ return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy
+#else
+ return _mm512_invsqrt_ps(v);
+#endif
+}
+static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);}
+static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));}
+#endif
+
+///////////////////////////////////////////////////////////////////////////
+// svml
+///////////////////////////////////////////////////////////////////////////
+
+static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); }
+static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); }
+static FORCEINLINE __vec16_f __svml_cosf(__vec16_f v) { return _mm512_cos_ps(v); }
+static FORCEINLINE __vec16_f __svml_powf(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); }
+
+static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); }
+static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); }
+static FORCEINLINE __vec16_d __svml_cosd(__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); }
+static FORCEINLINE __vec16_d __svml_powd(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); }
+
+///////////////////////////////////////////////////////////////////////////
+// bit ops
+
+static FORCEINLINE int32_t __popcnt_int32(uint32_t v) {
+ int count = 0;
+ for (; v != 0; v >>= 1)
+ count += (v & 1);
+ return count;
+}
+
+static FORCEINLINE int32_t __popcnt_int64(uint64_t v) {
+ int count = 0;
+ for (; v != 0; v >>= 1)
+ count += (v & 1);
+ return count;
+}
+
+static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) {
+ if (v == 0)
+ return 32;
+
+ int count = 0;
+ while ((v & 1) == 0) {
+ ++count;
+ v >>= 1;
+ }
+ return count;
+}
+
+static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) {
+ if (v == 0)
+ return 64;
+
+ int count = 0;
+ while ((v & 1) == 0) {
+ ++count;
+ v >>= 1;
+ }
+ return count;
+}
+
+static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) {
+ if (v == 0)
+ return 32;
+
+ int count = 0;
+ while ((v & (1<<31)) == 0) {
+ ++count;
+ v <<= 1;
+ }
+ return count;
+}
+
+static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) {
+ if (v == 0)
+ return 64;
+
+ int count = 0;
+ while ((v & (1ull<<63)) == 0) {
+ ++count;
+ v <<= 1;
+ }
+ return count;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// reductions
+
+#if 0
+REDUCE_ADD(float, __vec16_f, __reduce_add_float)
+REDUCE_MINMAX(float, __vec16_f, __reduce_min_float, <)
+REDUCE_MINMAX(float, __vec16_f, __reduce_max_float, >)
+#else
+static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); }
+static FORCEINLINE float __reduce_min_float(__vec16_f v) { return _mm512_reduce_min_ps(v); }
+static FORCEINLINE float __reduce_max_float(__vec16_f v) { return _mm512_reduce_max_ps(v); }
+#endif
+
+#if 0
+REDUCE_ADD(double, __vec16_d, __reduce_add_double)
+REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <)
+REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >)
+#else
+static FORCEINLINE double __reduce_add_double(__vec16_d v) { return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); }
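+// (note: __vec16_d carries its sixteen doubles as two __m512d halves,
+// v1 and v2, so these double reductions combine two per-half SVML
+// reductions into the final scalar.)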
+static FORCEINLINE double __reduce_min_double(__vec16_d v) { return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); }
+static FORCEINLINE double __reduce_max_double(__vec16_d v) { return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); }
+#endif
+
+
+
+#if 0
+REDUCE_ADD (int64_t, __vec16_i32, __reduce_add_int32)
+REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <)
+REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >)
+REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <)
+REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >)
+#else
+static FORCEINLINE int64_t __reduce_add_int32 (__vec16_i32 v) { return _mm512_reduce_add_epi32(v);}
+static FORCEINLINE int32_t __reduce_min_int32 (__vec16_i32 v) { return _mm512_reduce_min_epi32(v);}
+static FORCEINLINE int32_t __reduce_max_int32 (__vec16_i32 v) { return _mm512_reduce_max_epi32(v);}
+static FORCEINLINE uint32_t __reduce_min_uint32 (__vec16_i32 v) { return _mm512_reduce_min_epu32(v);}
+static FORCEINLINE uint32_t __reduce_max_uint32 (__vec16_i32 v) { return _mm512_reduce_max_epu32(v);}
+#endif
+
+REDUCE_ADD ( int16_t, __vec16_i8, __reduce_add_int8)
+REDUCE_ADD ( int32_t, __vec16_i16, __reduce_add_int16)
+REDUCE_ADD ( int64_t, __vec16_i64, __reduce_add_int64)
+REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_min_int64, <)
+REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_max_int64, >)
+REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <)
+REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
+
+///////////////////////////////////////////////////////////////////////////
+// masked load/store
+
+static FORCEINLINE __vec16_i8 __masked_load_i8(void *p,
+ __vec16_i1 mask) {
+ __vec16_i8 ret;
+ int8_t *ptr = (int8_t *)p;
+ for (int i = 0; i < 16; ++i)
+ if ((mask.v & (1 << i)) != 0)
+ ret[i] = ptr[i];
+ return ret;
+}
+
+static FORCEINLINE __vec16_i16 __masked_load_i16(void *p,
+ __vec16_i1 mask) {
+ __vec16_i16 ret;
+ int16_t *ptr = (int16_t *)p;
+ for (int i = 0; i < 16; ++i)
+ if ((mask.v & (1 << i)) != 0)
+ ret[i] = ptr[i];
+ return ret;
+}
+
+#if 0
+static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
+ __vec16_i1 mask) {
+ __vec16_i32 ret;
+ int32_t *ptr = (int32_t *)p;
+ for (int i = 0; i < 16; ++i)
+ if ((mask.v & (1 << i)) != 0)
+ ret[i] = ptr[i];
+ return ret;
+}
+#else
+static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
+ return _mm512_mask_load_epi32(__vec16_i32(), mask, p);
+#else
+ __vec16_i32 tmp;
+ tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+ tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+ __vec16_i32 ret;
+ return _mm512_mask_mov_epi32(ret.v, mask, tmp.v);
+#endif
+}
+#endif
+
+#if 0
+static FORCEINLINE __vec16_f __masked_load_float(void *p,
+ __vec16_i1 mask) {
+ __vec16_f ret;
+ float *ptr = (float *)p;
+ for (int i = 0; i < 16; ++i)
+ if ((mask.v & (1 << i)) != 0)
+ ret[i] = ptr[i];
+ return ret;
+}
+#else
+static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
+ return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p);
+#else
+ __vec16_f tmp;
+ tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+ tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+ __vec16_f ret;
+ return _mm512_mask_mov_ps(ret.v, mask, tmp.v);
+#endif
+}
+#endif
+
+static FORCEINLINE
__vec16_i64 __masked_load_i64(void *p, + __vec16_i1 mask) { + __vec16_i64 ret; + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec16_d __masked_load_double(void *p, + __vec16_i1 mask) { + __vec16_d ret; + double *ptr = (double *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; +#else + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; +#endif +} +#endif + + +static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} +#endif + +#if 0 +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, + __vec16_i1 mask) { + float *ptr = (float *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, + __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, mask, val.v); +#else + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void 
__masked_store_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, + __vec16_i1 mask) { + double *ptr = (double *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, + __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); +#else + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, + __vec16_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, + __vec16_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +#else +static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
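+ // The gather itself works at epi32 granularity with a SINT8 upconvert,
+ // so each byte lands in its own 32-bit lane; the downconverting store
+ // below then packs the 16 lanes back into 16 int8 values in ret.data.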
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +/****************/ +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +#else +static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) +#else +static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +/****************/ +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) +#else +static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +#if 1 +/***********/ +GATHER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec16_f, float, __vec16_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec16_d, double, 
__vec16_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8); +GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16); +GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32); +GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64); +GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float); +GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double); +/***********/ +#endif + +// scatter + +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +#else +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) +#else +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +/*****************/ +#if 0 /* evghenii::to implement */ +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) +#else /* evghenii:testme */ +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_d val, __vec16_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static 
FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +#if 1 +/***********/ +SCATTER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16, __scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec16_f, float, __vec16_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec16_d, double, __vec16_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8) +SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16) +SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32) +SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float) +SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64) +SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double) +/***********/ +#endif + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +#if 0 +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec16_i32 *val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +#endif + +#if 0 +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#endif + +#if 0 +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec16_i32 *val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#endif + +#if 1 +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, + __vec16_i1 mask) { + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +#if 1 +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, + __vec16_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +#if 1 +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, + __vec16_i1 mask) { + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +#if 1 +static 
FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val,
+ __vec16_i1 mask) {
+ _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+ _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+ return _mm_countbits_32(uint32_t(mask));
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////
+// aos/soa
+
+static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2,
+ float *ptr) {
+ for (int i = 0; i < 16; ++i) {
+ *ptr++ = __extract_element(v0, i);
+ *ptr++ = __extract_element(v1, i);
+ *ptr++ = __extract_element(v2, i);
+ }
+}
+
+static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec16_f *out0, __vec16_f *out1,
+ __vec16_f *out2) {
+ for (int i = 0; i < 16; ++i) {
+ __insert_element(out0, i, *ptr++);
+ __insert_element(out1, i, *ptr++);
+ __insert_element(out2, i, *ptr++);
+ }
+}
+
+static FORCEINLINE void __soa_to_aos4_float(__vec16_f v0, __vec16_f v1, __vec16_f v2,
+ __vec16_f v3, float *ptr) {
+ for (int i = 0; i < 16; ++i) {
+ *ptr++ = __extract_element(v0, i);
+ *ptr++ = __extract_element(v1, i);
+ *ptr++ = __extract_element(v2, i);
+ *ptr++ = __extract_element(v3, i);
+ }
+}
+
+static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16_f *out1,
+ __vec16_f *out2, __vec16_f *out3) {
+ for (int i = 0; i < 16; ++i) {
+ __insert_element(out0, i, *ptr++);
+ __insert_element(out1, i, *ptr++);
+ __insert_element(out2, i, *ptr++);
+ __insert_element(out3, i, *ptr++);
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////
+// prefetch
+
+static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) {
+ _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$
+}
+
+static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) {
+ _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$
+}
+
+static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) {
+ // There is no L3$ on KNC; don't want to pollute the L2$ unnecessarily
+}
+
+static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
+ _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint
+ // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
+}
+
+///////////////////////////////////////////////////////////////////////////
+// atomics
+
+static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+ return InterlockedAdd((LONG volatile *)p, v) - v;
+#else
+ return __sync_fetch_and_add(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+ return InterlockedAdd((LONG volatile *)p, -v) + v;
+#else
+ return __sync_fetch_and_sub(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+ return InterlockedAnd((LONG volatile *)p, v);
+#else
+ return __sync_fetch_and_and(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+ return InterlockedOr((LONG volatile *)p, v);
+#else
+ return __sync_fetch_and_or(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+ return InterlockedXor((LONG volatile *)p, v);
+#else
+ return __sync_fetch_and_xor(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) {
+ int32_t old, min;
+ do {
+ old = *((volatile int32_t *)p);
+ min = (old < (int32_t)v) ? old : (int32_t)v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+ return old;
+}
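+// (note: x86 exposes no native atomic min/max, so the min/max/umin/umax
+// helpers here use a compare-and-swap loop: compute the candidate value,
+// then retry until the CAS observes an unchanged *p.)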
+static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) {
+ int32_t old, max;
+ do {
+ old = *((volatile int32_t *)p);
+ max = (old > (int32_t)v) ? old : (int32_t)v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) {
+ uint32_t old, min;
+ do {
+ old = *((volatile uint32_t *)p);
+ min = (old < v) ? old : v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) {
+ uint32_t old, max;
+ do {
+ old = *((volatile uint32_t *)p);
+ max = (old > v) ? old : v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+ return InterlockedExchange((LONG volatile *)p, v);
+#else
+ return __sync_lock_test_and_set(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval,
+ uint32_t newval) {
+#ifdef _MSC_VER
+ return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval);
+#else
+ return __sync_val_compare_and_swap(p, cmpval, newval);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedAdd64((LONGLONG volatile *)p, v) - v;
+#else
+ return __sync_fetch_and_add(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedAdd64((LONGLONG volatile *)p, -v) + v;
+#else
+ return __sync_fetch_and_sub(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedAnd64((LONGLONG volatile *)p, v);
+#else
+ return __sync_fetch_and_and(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedOr64((LONGLONG volatile *)p, v);
+#else
+ return __sync_fetch_and_or(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedXor64((LONGLONG volatile *)p, v);
+#else
+ return __sync_fetch_and_xor(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) {
+ int64_t old, min;
+ do {
+ old = *((volatile int64_t *)p);
+ min = (old < (int64_t)v) ? old : (int64_t)v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) {
+ int64_t old, max;
+ do {
+ old = *((volatile int64_t *)p);
+ max = (old > (int64_t)v) ? old : (int64_t)v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+ return old;
+}
+static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) {
+ uint64_t old, min;
+ do {
+ old = *((volatile uint64_t *)p);
+ min = (old < v) ? old : v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) {
+ uint64_t old, max;
+ do {
+ old = *((volatile uint64_t *)p);
+ max = (old > v) ? old : v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedExchange64((LONGLONG volatile *)p, v);
+#else
+ return __sync_lock_test_and_set(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval,
+ uint64_t newval) {
+#ifdef _MSC_VER
+ return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval);
+#else
+ return __sync_val_compare_and_swap(p, cmpval, newval);
+#endif
+}
+
+#ifdef WIN32
+#include <intrin.h>
+#define __clock __rdtsc
+#else // WIN32
+static FORCEINLINE uint64_t __clock() {
+ uint32_t low, high;
+#ifdef __x86_64
+ __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
+ ::: "%rax", "%rbx", "%rcx", "%rdx" );
+#else
+ __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
+ ::: "%eax", "%ebx", "%ecx", "%edx" );
+#endif
+ __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
+ return (uint64_t)high << 32 | low;
+}
+
+#endif // !WIN32
+
+#undef FORCEINLINE
+#undef PRE_ALIGN
+#undef POST_ALIGN
diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h
new file mode 100644
index 00000000..de9bddcc
--- /dev/null
+++ b/examples/intrinsics/knc-i1x8.h
@@ -0,0 +1,2862 @@
+/**
+ Copyright (c) 2010-2012, Intel Corporation
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdint.h>
+#include <math.h>
+#include <assert.h>
+#include <algorithm>
+#include <immintrin.h>
+#include <zmmintrin.h>
+
+#if 0
+#define __ZMM32BIT__
+#endif
+
+
+#ifdef _MSC_VER
+#define FORCEINLINE __forceinline
+#define PRE_ALIGN(x) /*__declspec(align(x))*/
+#define POST_ALIGN(x)
+#define roundf(x) (floorf(x + .5f))
+#define round(x) (floor(x + .5))
+#else
+#define FORCEINLINE __attribute__((always_inline))
+#define PRE_ALIGN(x)
+#define POST_ALIGN(x) __attribute__ ((aligned(x)))
+#endif
+
+#if 0
+#define KNC 1
+extern "C"
+{
+ int printf(const unsigned char *, ...);
+ int puts(unsigned char *);
+ unsigned int putchar(unsigned int);
+ int fflush(void *);
+ uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t);
+ uint8_t *memset(uint8_t *, uint8_t, uint64_t);
+ void memset_pattern16(void *, const void *, uint64_t);
+}
+#endif
+
+typedef float __vec1_f;
+typedef double __vec1_d;
+typedef int8_t __vec1_i8;
+typedef int16_t __vec1_i16;
+typedef int32_t __vec1_i32;
+typedef int64_t __vec1_i64;
+
+struct __vec8_i1 {
+ __vec8_i1() { }
+ __vec8_i1(const __mmask16 &vv) : v(vv) { }
+ __vec8_i1(bool v0, bool v1, bool v2, bool v3,
+ bool v4, bool v5, bool v6, bool v7) {
+ v = ((v0 & 1) |
+ ((v1 & 1) << 1) |
+ ((v2 & 1) << 2) |
+ ((v3 & 1) << 3) |
+ ((v4 & 1) << 4) |
+ ((v5 & 1) << 5) |
+ ((v6 & 1) << 6) |
+ ((v7 & 1) << 7) );
+ }
+
+ __mmask8 v;
+ FORCEINLINE operator __mmask8() const { return v; }
+};
+
+
+template <typename T>
+struct vec8 {
+ vec8() { }
+ vec8(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) {
+ data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3;
+ data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7;
+ }
+ T data[8];
+ FORCEINLINE const T& operator[](const int i) const { return data[i]; }
+ FORCEINLINE T& operator[](const int i) { return data[i]; }
+};
+
+/****************/
+
+#ifndef __ZMM32BIT__
+struct PRE_ALIGN(32) __vec8_i32 : public vec8<int32_t> {
+ __vec8_i32() { }
+ FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3,
+ int32_t v4, int32_t v5, int32_t v6, int32_t v7)
+ : vec8<int32_t>(v0, v1, v2, v3, v4, v5, v6, v7) { }
+ FORCEINLINE __vec8_i32(__m512i v)
+ {
+ union { __m512i v; int32_t s[8]; } val = {v};
+ data[0] = val.s[0];
+ data[1] = val.s[1];
+ data[2] = val.s[2];
+ data[3] = val.s[3];
+ data[4] = val.s[4];
+ data[5] = val.s[5];
+ data[6] = val.s[6];
+ data[7] = val.s[7];
+ }
+ FORCEINLINE operator __m512i() const
+ {
+ return _mm512_set_16to16_pi(
+ 0,0,0,0, 0,0,0,0,
+ data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]);
+ }
+} POST_ALIGN(32);
+#else /* __ZMM32BIT__ */
+struct PRE_ALIGN(32) __vec8_i32
+{
+ __m512i v;
+ FORCEINLINE operator __m512i() const { return v; }
+ FORCEINLINE __vec8_i32() : v(_mm512_undefined_epi32()) {}
+ FORCEINLINE __vec8_i32(const __m512i &in) : v(in) {}
+ FORCEINLINE __vec8_i32(const __vec8_i32 &o) : v(o.v) {}
+ FORCEINLINE __vec8_i32& operator =(const __vec8_i32 &o) { v=o.v; return *this; }
+ FORCEINLINE __vec8_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03,
+ int32_t v04, int32_t v05, int32_t v06, int32_t v07) :
+ v ( _mm512_set_16to16_pi(0,0,0,0,0,0,0,0,
+ v07, v06, v05, v04, v03, v02, v01, v00) ) {}
+ FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; }
+ FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; }
+} POST_ALIGN(32);
+#endif /* __ZMM32BIT__ */
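+// (note: with __ZMM32BIT__ defined, the 8-wide 32-bit types are kept
+// directly in a zmm register with the upper eight lanes unused; in the
+// default build they are plain 8-element arrays that convert to/from the
+// low lanes of a __m512i/__m512 on demand.)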
+
+#ifndef __ZMM32BIT__ /* __ZMM32BIT__ */
+PRE_ALIGN(32) struct __vec8_f : public vec8<float> {
+ __vec8_f() { }
+ FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3,
+ float v4, float v5, float v6, float v7)
+ : vec8<float>(v0, v1, v2, v3, v4, v5, v6, v7) { }
+ FORCEINLINE operator __m512() const
+ {
+ return _mm512_set_16to16_ps(
+ 0,0,0,0,0,0,0,0,
+ data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]);
+ }
+ FORCEINLINE __vec8_f(__m512 v)
+ {
+ union { __m512 v; float s[8]; } val = {v};
+ data[0] = val.s[0];
+ data[1] = val.s[1];
+ data[2] = val.s[2];
+ data[3] = val.s[3];
+ data[4] = val.s[4];
+ data[5] = val.s[5];
+ data[6] = val.s[6];
+ data[7] = val.s[7];
+ }
+} POST_ALIGN(32);
+#else /* __ZMM32BIT__ */
+PRE_ALIGN(32) struct __vec8_f
+{
+ __m512 v;
+ FORCEINLINE operator __m512() const { return v; }
+ FORCEINLINE __vec8_f() : v(_mm512_undefined_ps()) { }
+ FORCEINLINE __vec8_f(const __m512 &in) : v(in) {}
+ FORCEINLINE __vec8_f(const __vec8_f &o) : v(o.v) {}
+ FORCEINLINE __vec8_f& operator =(const __vec8_f &o) { v=o.v; return *this; }
+ FORCEINLINE __vec8_f(float v00, float v01, float v02, float v03,
+ float v04, float v05, float v06, float v07) :
+ v ( _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, v07, v06, v05, v04, v03, v02, v01, v00) ) {}
+ FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; }
+ FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; }
+} POST_ALIGN(32);
+#endif /* __ZMM32BIT__ */
+
+struct PRE_ALIGN(64) __vec8_d
+{
+ __m512d v;
+ FORCEINLINE __vec8_d() : v(_mm512_undefined_pd()) {}
+ FORCEINLINE __vec8_d(const __m512d _v) : v(_v) {}
+ FORCEINLINE __vec8_d(const __vec8_d &o) : v(o.v) {}
+ FORCEINLINE __vec8_d& operator =(const __vec8_d &o) { v=o.v; return *this; }
+ FORCEINLINE operator __m512d() const { return v; }
+ FORCEINLINE __vec8_d(double v00, double v01, double v02, double v03,
+ double v04, double v05, double v06, double v07) :
+ v ( _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00) ) {}
+ FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; }
+ FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; }
+} POST_ALIGN(64);
+
+/****************/
+
+PRE_ALIGN(64) struct __vec8_i64 : public vec8<int64_t> {
+ __vec8_i64() { }
+ __vec8_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3,
+ int64_t v4, int64_t v5, int64_t v6, int64_t v7)
+ : vec8<int64_t>(v0, v1, v2, v3, v4, v5, v6, v7) { }
+} POST_ALIGN(64);
+
+PRE_ALIGN(16) struct __vec8_i8 : public vec8<int8_t> {
+ __vec8_i8() { }
+ __vec8_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+ int8_t v4, int8_t v5, int8_t v6, int8_t v7)
+ : vec8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7) { }
+} POST_ALIGN(16);
+
+PRE_ALIGN(32) struct __vec8_i16 : public vec8<int16_t> {
+ __vec8_i16() { }
+ __vec8_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3,
+ int16_t v4, int16_t v5, int16_t v6, int16_t v7)
+ : vec8<int16_t>(v0, v1, v2, v3, v4, v5, v6, v7) { }
+} POST_ALIGN(32);
+
+static inline int32_t __extract_element(__vec8_i32, int);
+
+
+///////////////////////////////////////////////////////////////////////////
+// macros...
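+// The macros below generate scalar reference implementations that simply
+// loop over the eight lanes.  As an illustration (not a definition used
+// elsewhere in this file), UNARY_OP(__vec8_f, __example_abs, fabsf) would
+// expand to a __vec8_f __example_abs(__vec8_f) that applies fabsf() to
+// each lane in turn.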
+
+#define UNARY_OP(TYPE, NAME, OP) \
+static FORCEINLINE TYPE NAME(TYPE v) { \
+ TYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = OP(v[i]); \
+ return ret; \
+}
+
+#define BINARY_OP(TYPE, NAME, OP) \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
+ TYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = a[i] OP b[i]; \
+ return ret; \
+}
+
+#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
+ TYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \
+ return ret; \
+}
+
+#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
+ TYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = FUNC(a[i], b[i]); \
+ return ret; \
+}
+
+#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \
+static FORCEINLINE __vec8_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \
+ __vec8_i1 ret; \
+ ret.v = 0; \
+ for (int i = 0; i < 8; ++i) \
+ ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \
+ return ret; \
+} \
+static FORCEINLINE __vec8_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \
+ __vec8_i1 mask) { \
+ __vec8_i1 ret; \
+ ret.v = 0; \
+ for (int i = 0; i < 8; ++i) \
+ ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \
+ ret.v &= mask.v; \
+ return ret; \
+}
+
+#define INSERT_EXTRACT(VTYPE, STYPE) \
+static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \
+ return ((STYPE *)&v)[index]; \
+} \
+static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
+ ((STYPE *)v)[index] = val; \
+}
+
+#define LOAD_STORE(VTYPE, STYPE) \
+template <int ALIGN> \
+static FORCEINLINE VTYPE __load(const VTYPE *p) { \
+ STYPE *ptr = (STYPE *)p; \
+ VTYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = ptr[i]; \
+ return ret; \
+} \
+template <int ALIGN> \
+static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \
+ STYPE *ptr = (STYPE *)p; \
+ for (int i = 0; i < 8; ++i) \
+ ptr[i] = v[i]; \
+}
+
+#define LOADS(VTYPE, STYPE) \
+template <int ALIGN> \
+static FORCEINLINE VTYPE __load(const VTYPE *p) { \
+ STYPE *ptr = (STYPE *)p; \
+ VTYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = ptr[i]; \
+ return ret; \
+} \
+
+#define STORES(VTYPE, STYPE) \
+template <int ALIGN> \
+static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \
+ STYPE *ptr = (STYPE *)p; \
+ for (int i = 0; i < 8; ++i) \
+ ptr[i] = v[i]; \
+}
+
+#define REDUCE_ADD(TYPE, VTYPE, NAME) \
+static FORCEINLINE TYPE NAME(VTYPE v) { \
+ TYPE ret = v[0]; \
+ for (int i = 1; i < 8; ++i) \
+ ret = ret + v[i]; \
+ return ret; \
+}
+
+#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \
+static FORCEINLINE TYPE NAME(VTYPE v) { \
+ TYPE ret = v[0]; \
+ for (int i = 1; i < 8; ++i) \
+ ret = (ret OP (TYPE)v[i]) ? ret : (TYPE)v[i]; \
+ return ret; \
+}
+
+#define SELECT(TYPE) \
+static FORCEINLINE TYPE __select(__vec8_i1 mask, TYPE a, TYPE b) { \
+ TYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = (mask.v & (1<<i)) ? a[i] : b[i]; \
+ return ret; \
+} \
+static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
+ return cond ? a : b; \
+}
+
+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
+ TYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = (CAST)(a[i]) OP b; \
+ return ret; \
+}
+
+#define SMEAR(VTYPE, NAME, STYPE) \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
+ VTYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = v; \
+ return ret; \
+}
+
+#define SETZERO(VTYPE, NAME) \
+template <class RetVecType> VTYPE __setzero_##NAME(); \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
+ VTYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = 0; \
+ return ret; \
+}
+
+#define UNDEF(VTYPE, NAME) \
+template <class RetVecType> VTYPE __undef_##NAME(); \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
+ return VTYPE(); \
+}
+
+#define BROADCAST(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
+ VTYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = v[index & 0x7]; \
+ return ret; \
+} \
+
+#define ROTATE(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \
+ VTYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = v[(i+index) & 0x7]; \
+ return ret; \
+} \
+
+#define SHUFFLES(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec8_i32 index) { \
+ VTYPE ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = v[__extract_element(index, i) & 0x7]; \
+ return ret; \
+} \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \
+ VTYPE ret; \
+ for (int i = 0; i < 8; ++i) { \
+ int ii = __extract_element(index, i) & 0xf; \
+ ret[i] = (ii < 8) ? v0[ii] : v1[ii-8]; \
+ } \
+ return ret; \
+}
+
+#define SHUFFLE2(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \
+ VTYPE ret; \
+ for (int i = 0; i < 8; ++i) { \
+ int ii = __extract_element(index, i) & 0xf; \
+ ret[i] = (ii < 8) ?
v0[ii] : v1[ii-8]; \ + } \ + return ret; \ +} + +/////////////////////////////////////////////////////////////////////////// + +INSERT_EXTRACT(__vec1_i8, int8_t) +INSERT_EXTRACT(__vec1_i16, int16_t) +INSERT_EXTRACT(__vec1_i32, int32_t) +INSERT_EXTRACT(__vec1_i64, int64_t) +INSERT_EXTRACT(__vec1_f, float) +INSERT_EXTRACT(__vec1_d, double) + +/////////////////////////////////////////////////////////////////////////// +// mask ops + +static FORCEINLINE uint64_t __movmsk(__vec8_i1 mask) { + return (uint64_t)mask.v; +} + +static FORCEINLINE bool __any(__vec8_i1 mask) { + return (mask.v!=0); +} + +static FORCEINLINE bool __all(__vec8_i1 mask) { + return (mask.v==0xFF); +} + +static FORCEINLINE bool __none(__vec8_i1 mask) { + return (mask.v==0); +} + +static FORCEINLINE __vec8_i1 __equal_i1(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = (a.v & b.v) | (~a.v & ~b.v); + return r; +} + +static FORCEINLINE __vec8_i1 __and(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v & b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __xor(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v ^ b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __or(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v | b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __not(__vec8_i1 v) { + __vec8_i1 r; + r.v = ~v.v; + return r; +} + +static FORCEINLINE __vec8_i1 __and_not1(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = ~a.v & b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __and_not2(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v & ~b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __select(__vec8_i1 mask, __vec8_i1 a, + __vec8_i1 b) { + __vec8_i1 r; + r.v = (a.v & mask.v) | (b.v & ~mask.v); + return r; +} + +static FORCEINLINE __vec8_i1 __select(bool cond, __vec8_i1 a, __vec8_i1 b) { + return cond ? a : b; +} + +static FORCEINLINE bool __extract_element(__vec8_i1 vec, int index) { + return (vec.v & (1 << index)) ? 
true : false;
+}
+
+static FORCEINLINE void __insert_element(__vec8_i1 *vec, int index,
+                                         bool val) {
+    if (val == false)
+        vec->v &= ~(1 << index);
+    else
+        vec->v |= (1 << index);
+}
+
+template <int ALIGN> static FORCEINLINE __vec8_i1 __load(const __vec8_i1 *p) {
+    uint8_t *ptr = (uint8_t *)p;
+    __vec8_i1 r;
+    r.v = *ptr;
+    return r;
+}
+
+template <int ALIGN> static FORCEINLINE void __store(__vec8_i1 *p, __vec8_i1 v) {
+    uint8_t *ptr = (uint8_t *)p;
+    *ptr = v.v;
+}
+
+template <class RetVecType> RetVecType __smear_i1(int i);
+template <> static FORCEINLINE __vec8_i1 __smear_i1<__vec8_i1>(int i) {
+    return i ? 0xFF : 0x0;
+}
+
+template <class RetVecType> RetVecType __setzero_i1();
+template <> static FORCEINLINE __vec8_i1 __setzero_i1<__vec8_i1>() {
+    return 0;
+}
+
+template <class RetVecType> __vec8_i1 __undef_i1();
+template <> FORCEINLINE __vec8_i1 __undef_i1<__vec8_i1>() {
+    return __vec8_i1();
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// int8
+
+BINARY_OP(__vec8_i8, __add, +)
+BINARY_OP(__vec8_i8, __sub, -)
+BINARY_OP(__vec8_i8, __mul, *)
+
+BINARY_OP(__vec8_i8, __or, |)
+BINARY_OP(__vec8_i8, __and, &)
+BINARY_OP(__vec8_i8, __xor, ^)
+BINARY_OP(__vec8_i8, __shl, <<)
+
+BINARY_OP_CAST(__vec8_i8, uint8_t, __udiv, /)
+BINARY_OP_CAST(__vec8_i8, int8_t, __sdiv, /)
+
+BINARY_OP_CAST(__vec8_i8, uint8_t, __urem, %)
+BINARY_OP_CAST(__vec8_i8, int8_t, __srem, %)
+BINARY_OP_CAST(__vec8_i8, uint8_t, __lshr, >>)
+BINARY_OP_CAST(__vec8_i8, int8_t, __ashr, >>)
+
+SHIFT_UNIFORM(__vec8_i8, uint8_t, __lshr, >>)
+SHIFT_UNIFORM(__vec8_i8, int8_t, __ashr, >>)
+SHIFT_UNIFORM(__vec8_i8, int8_t, __shl, <<)
+
+CMP_OP(__vec8_i8, i8, int8_t, __equal, ==)
+CMP_OP(__vec8_i8, i8, int8_t, __not_equal, !=)
+CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_equal, <=)
+CMP_OP(__vec8_i8, i8, int8_t, __signed_less_equal, <=)
+CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_equal, >=)
+CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_than, <)
+CMP_OP(__vec8_i8, i8, int8_t, __signed_less_than, <)
+CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_than, >)
+CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_than, >)
+
+SELECT(__vec8_i8)
+INSERT_EXTRACT(__vec8_i8, int8_t)
+SMEAR(__vec8_i8, i8, int8_t)
+SETZERO(__vec8_i8, i8)
+UNDEF(__vec8_i8, i8)
+BROADCAST(__vec8_i8, i8, int8_t)
+ROTATE(__vec8_i8, i8, int8_t)
+SHUFFLES(__vec8_i8, i8, int8_t)
+LOAD_STORE(__vec8_i8, int8_t)
+
+///////////////////////////////////////////////////////////////////////////
+// int16
+
+BINARY_OP(__vec8_i16, __add, +)
+BINARY_OP(__vec8_i16, __sub, -)
+BINARY_OP(__vec8_i16, __mul, *)
+
+BINARY_OP(__vec8_i16, __or, |)
+BINARY_OP(__vec8_i16, __and, &)
+BINARY_OP(__vec8_i16, __xor, ^)
+BINARY_OP(__vec8_i16, __shl, <<)
+
+BINARY_OP_CAST(__vec8_i16, uint16_t, __udiv, /)
+BINARY_OP_CAST(__vec8_i16, int16_t, __sdiv, /)
+
+BINARY_OP_CAST(__vec8_i16, uint16_t, __urem, %)
+BINARY_OP_CAST(__vec8_i16, int16_t, __srem, %)
+BINARY_OP_CAST(__vec8_i16, uint16_t, __lshr, >>)
+BINARY_OP_CAST(__vec8_i16, int16_t, __ashr, >>)
+
+SHIFT_UNIFORM(__vec8_i16, uint16_t, __lshr, >>)
+SHIFT_UNIFORM(__vec8_i16, int16_t, __ashr, >>)
+SHIFT_UNIFORM(__vec8_i16, int16_t, __shl, <<)
+
+CMP_OP(__vec8_i16, i16, int16_t, __equal, ==)
+CMP_OP(__vec8_i16, i16, int16_t, __not_equal, !=)
+CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_less_equal, <=)
+CMP_OP(__vec8_i16, i16, int16_t, __signed_less_equal, <=)
+CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_equal, >=)
+CMP_OP(__vec8_i16, i16, uint16_t, 
__unsigned_less_than, <) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec8_i16) +INSERT_EXTRACT(__vec8_i16, int16_t) +SMEAR(__vec8_i16, i16, int16_t) +SETZERO(__vec8_i16, i16) +UNDEF(__vec8_i16, i16) +BROADCAST(__vec8_i16, i16, int16_t) +ROTATE(__vec8_i16, i16, int16_t) +SHUFFLES(__vec8_i16, i16, int16_t) +LOAD_STORE(__vec8_i16, int16_t) + +#if 0 /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec8_i32, __add, +) +BINARY_OP(__vec8_i32, __sub, -) +BINARY_OP(__vec8_i32, __mul, *) + +BINARY_OP(__vec8_i32, __or, |) +BINARY_OP(__vec8_i32, __and, &) +BINARY_OP(__vec8_i32, __xor, ^) +BINARY_OP(__vec8_i32, __shl, <<) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec8_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec8_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec8_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i32, int32_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i32, uint32_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __shl, <<) + +CMP_OP(__vec8_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec8_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_than, >) + +SELECT(__vec8_i32) +INSERT_EXTRACT(__vec8_i32, int32_t) +SMEAR(__vec8_i32, i32, int32_t) +SETZERO(__vec8_i32, i32) +UNDEF(__vec8_i32, i32) +BROADCAST(__vec8_i32, i32, int32_t) +ROTATE(__vec8_i32, i32, int32_t) +SHUFFLES(__vec8_i32, i32, int32_t) +LOAD_STORE(__vec8_i32, int32_t) + +#else /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +#define IZERO _mm512_setzero_epi32() +static FORCEINLINE __vec8_i32 __add(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_add_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sub(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sub_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __mul(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_mullo_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __udiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sdiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __urem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __srem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __or(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_or_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __and(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_and_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __xor(__vec8_i32 a, __vec8_i32 b) 
{ + return _mm512_mask_xor_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sllv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srlv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srav_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, int32_t n) { + return _mm512_mask_slli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srai_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i1 __equal_i32(const __vec8_i32 &a, const __vec8_i32 &b) { + return _mm512_mask_cmpeq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __equal_i32_and_mask(const __vec8_i32 &a, const __vec8_i32 &b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpneq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpgt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpgt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 
__signed_greater_than_i32(__vec8_i32 a, __vec8_i32 b) {
+    return _mm512_mask_cmpgt_epi32_mask(0xFF,a, b);
+}
+
+static FORCEINLINE __vec8_i1 __signed_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b,
+                                                                __vec8_i1 m) {
+    return _mm512_mask_cmpgt_epi32_mask(m, a, b);
+}
+
+static FORCEINLINE __vec8_i32 __select(__vec8_i1 mask,
+                                       __vec8_i32 a, __vec8_i32 b) {
+    return _mm512_mask_mov_epi32(b, mask, a);
+}
+
+static FORCEINLINE __vec8_i32 __select(bool cond, __vec8_i32 a, __vec8_i32 b) {
+    return cond ? a : b;
+}
+
+static FORCEINLINE int32_t __extract_element(__vec8_i32 v, int index) {
+    return ((int32_t *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec8_i32 *v, uint32_t index, int32_t val) {
+    ((int32_t *)v)[index] = val;
+}
+
+template <class RetVecType> RetVecType __smear_i32(int32_t i);
+template <> static FORCEINLINE __vec8_i32 __smear_i32<__vec8_i32>(int32_t i) {
+    return _mm512_set_16to16_epi32(0,0,0,0,0,0,0,0, i,i,i,i,i,i,i,i);
+}
+
+static const __vec8_i32 __ispc_one = __smear_i32<__vec8_i32>(1);
+static const __vec8_i32 __ispc_thirty_two = __smear_i32<__vec8_i32>(32);
+static const __vec8_i32 __ispc_ffffffff = __smear_i32<__vec8_i32>(-1);
+static const __vec8_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7);
+
+template <class RetVecType> RetVecType __setzero_i32();
+template <> static FORCEINLINE __vec8_i32 __setzero_i32<__vec8_i32>() {
+    return _mm512_setzero_epi32();
+}
+
+template <class RetVecType> RetVecType __undef_i32();
+template <> static FORCEINLINE __vec8_i32 __undef_i32<__vec8_i32>() {
+    return __vec8_i32();
+}
+
+static FORCEINLINE __vec8_i32 __broadcast_i32(__vec8_i32 v, int index) {
+    int32_t val = __extract_element(v, index & 0x7);
+    return _mm512_set1_epi32(val);
+}
+
+#if 0 /* evghenii::doesn't work */
+static FORCEINLINE __vec8_i32 __rotate_i32(__vec8_i32 v, int index) {
+    __vec8_i32 idx = __smear_i32<__vec8_i32>(index);
+    __vec8_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec8_i32>(0x7));
+    return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v);
+}
+#else
+ROTATE(__vec8_i32, i32, int32_t)
+#endif
+
+static FORCEINLINE __vec8_i32 __shuffle_i32(__vec8_i32 v, __vec8_i32 index) {
+    return _mm512_mask_permutevar_epi32(v, 0xffff, index, v);
+}
+SHUFFLE2(__vec8_i32, i32, int32_t) /* evghenii::to implement */
+
+template <int ALIGN> static FORCEINLINE __vec8_i32 __load(const __vec8_i32 *p) {
+    __vec8_i32 v;
+    v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    return __select(0xFF,v,IZERO);
+}
+
+
+template <int ALIGN> static FORCEINLINE void __store(__vec8_i32 *p, __vec8_i32 v) {
+    _mm512_mask_extpackstorelo_epi32(               p, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+}
+
+#if 0
+template <> static FORCEINLINE __vec8_i32 __load<64>(const __vec8_i32 *p) {
+    return _mm512_load_epi32(p);
+}
+template <> static FORCEINLINE void __store<64>(__vec8_i32 *p, __vec8_i32 v) {
+    _mm512_store_epi32(p, v);
+}
+#endif
+#endif /* evghenii::int32 */
+
+///////////////////////////////////////////////////////////////////////////
+// int64
+
+BINARY_OP(__vec8_i64, __add, +)
+BINARY_OP(__vec8_i64, __sub, -)
+BINARY_OP(__vec8_i64, __mul, *)
+
+BINARY_OP(__vec8_i64, __or, |)
+BINARY_OP(__vec8_i64, __and, &)
+BINARY_OP(__vec8_i64, __xor, ^)
+BINARY_OP(__vec8_i64, __shl, <<)
+
+BINARY_OP_CAST(__vec8_i64, uint64_t, __udiv, /)
+BINARY_OP_CAST(__vec8_i64, int64_t, __sdiv, /)
+
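+// For reference, a sketch of what the instantiation above roughly expands
+// to (disabled, like the other #if 0 blocks here); the CAST argument is
+// what selects unsigned vs. signed per-lane arithmetic:
+#if 0
+static FORCEINLINE __vec8_i64 __udiv(__vec8_i64 a, __vec8_i64 b) {
+    __vec8_i64 ret;
+    for (int i = 0; i < 8; ++i)
+        ret[i] = (uint64_t)(a[i]) / (uint64_t)(b[i]);
+    return ret;
+}
+#endif
+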
+BINARY_OP_CAST(__vec8_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec8_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec8_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __shl, <<) + +CMP_OP(__vec8_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec8_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec8_i64) +INSERT_EXTRACT(__vec8_i64, int64_t) +SMEAR(__vec8_i64, i64, int64_t) +SETZERO(__vec8_i64, i64) +UNDEF(__vec8_i64, i64) +BROADCAST(__vec8_i64, i64, int64_t) +ROTATE(__vec8_i64, i64, int64_t) +SHUFFLES(__vec8_i64, i64, int64_t) +LOAD_STORE(__vec8_i64, int64_t) + + +#if 0 /* evghenii::float */ +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec8_f, __add, +) +BINARY_OP(__vec8_f, __sub, -) +BINARY_OP(__vec8_f, __mul, *) +BINARY_OP(__vec8_f, __div, /) + +CMP_OP(__vec8_f, float, float, __equal, ==) +CMP_OP(__vec8_f, float, float, __not_equal, !=) +CMP_OP(__vec8_f, float, float, __less_than, <) +CMP_OP(__vec8_f, float, float, __less_equal, <=) +CMP_OP(__vec8_f, float, float, __greater_than, >) +CMP_OP(__vec8_f, float, float, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_f) +INSERT_EXTRACT(__vec8_f, float) +SMEAR(__vec8_f, float, float) +SETZERO(__vec8_f, float) +UNDEF(__vec8_f, float) +BROADCAST(__vec8_f, float, float) +ROTATE(__vec8_f, float, float) +SHUFFLES(__vec8_f, float, float) +LOAD_STORE(__vec8_f, float) +#else /* evghenii::float */ + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +#define FZERO _mm512_setzero_ps() +static FORCEINLINE __vec8_f __add(__vec8_f a, __vec8_f b) { + return _mm512_mask_add_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __sub(__vec8_f a, __vec8_f b) { + return _mm512_mask_sub_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __mul(__vec8_f a, __vec8_f b) { + return _mm512_mask_mul_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __div(__vec8_f a, __vec8_f b) { + return _mm512_mask_div_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpeq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpneq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmplt_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmple_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpord_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpunord_ps_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_f __select(__vec8_i1 mask, __vec8_f a, __vec8_f b) { + return _mm512_mask_mov_ps(b, mask & 0xFF, a); +} + +static FORCEINLINE __vec8_f __select(bool cond, __vec8_f a, __vec8_f b) { + return cond ? 
a : b;
+}
+
+static FORCEINLINE float __extract_element(__vec8_f v, uint32_t index) {
+    return v[index];
+    // return ((float *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec8_f *v, uint32_t index, float val) {
+    (*v)[index] = val;
+//    ((float *)v)[index] = val;
+}
+
+template <class RetVecType> RetVecType __smear_float(float f);
+template <> static FORCEINLINE __vec8_f __smear_float<__vec8_f>(float f) {
+    return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, f,f,f,f,f,f,f,f);
+}
+
+template <class RetVecType> RetVecType __setzero_float();
+template <> static FORCEINLINE __vec8_f __setzero_float<__vec8_f>() {
+    return _mm512_setzero_ps();
+}
+
+template <class RetVecType> RetVecType __undef_float();
+template <> static FORCEINLINE __vec8_f __undef_float<__vec8_f>() {
+    return __vec8_f();
+}
+
+static FORCEINLINE __vec8_f __broadcast_float(__vec8_f v, int index) {
+    float val = __extract_element(v, index & 0x7);
+    return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, val,val,val,val,val,val,val,val);
+}
+
+#if 1
+static FORCEINLINE __vec8_f __shuffle_float(__vec8_f v, __vec8_i32 index) {
+    return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v)));
+}
+#endif
+ROTATE(__vec8_f, float, float)
+SHUFFLE2(__vec8_f, float, float)
+
+#if 0
+LOADS(__vec8_f, float)
+#else
+template <int ALIGN> static FORCEINLINE __vec8_f __load(const __vec8_f *p) {
+    __vec8_f v;
+    v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+    v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+    return __select(0xFF,v,FZERO);
+}
+#endif
+
+#if 0
+STORES(__vec8_f, float)
+#else
+template <int ALIGN> static FORCEINLINE void __store(__vec8_f *p, __vec8_f v)
+{
+    _mm512_mask_extpackstorelo_ps(               p, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+    _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+}
+#endif
+
+#endif /* evghenii::float */
+
+static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);}
+static FORCEINLINE __vec8_f __exp_varying_float(__vec8_f v) { return _mm512_mask_exp_ps(FZERO, 0xFF, v); }
+
+
+static FORCEINLINE float __log_uniform_float(float v) { return logf(v);}
+static FORCEINLINE __vec8_f __log_varying_float(__vec8_f v) { return _mm512_mask_log_ps(FZERO, 0xFF, v); }
+
+static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);}
+static FORCEINLINE __vec8_f __pow_varying_float(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO, 0xFF, a,b); }
+
+
+static FORCEINLINE int __intbits(float v) {
+    union {
+        float f;
+        int i;
+    } u;
+    u.f = v;
+    return u.i;
+}
+
+static FORCEINLINE float __floatbits(int v) {
+    union {
+        float f;
+        int i;
+    } u;
+    u.i = v;
+    return u.f;
+}
+
+static FORCEINLINE float __half_to_float_uniform(int16_t h) {
+    static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
+
+    int32_t o = ((int32_t)(h & 0x7fff)) << 13;        // exponent/mantissa bits
+    uint32_t exp = shifted_exp & o;                   // just the exponent
+    o += (127 - 15) << 23;                            // exponent adjust
+
+    // handle exponent special cases
+    if (exp == shifted_exp)       // Inf/NaN?
+        o += (128 - 16) << 23;    // extra exp adjust
+    else if (exp == 0) {          // Zero/Denormal?
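+        // renormalize the subnormal: bump the stored exponent by one, then
+        // subtract 2^-14 (bit pattern 113 << 23) in float arithmetic, which
+        // shifts the mantissa into normalized position at the right scale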
+ o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } + + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +} + + +static FORCEINLINE __vec8_f __half_to_float_varying(__vec8_i16 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec8_i16 __float_to_half_varying(__vec8_f v) { + __vec8_i16 ret; + for (int i = 0; i < 8; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +#if 0 /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec8_d, __add, +) +BINARY_OP(__vec8_d, __sub, -) +BINARY_OP(__vec8_d, __mul, *) +BINARY_OP(__vec8_d, __div, /) + +CMP_OP(__vec8_d, double, double, __equal, ==) +CMP_OP(__vec8_d, double, double, __not_equal, !=) +CMP_OP(__vec8_d, double, double, __less_than, <) +CMP_OP(__vec8_d, double, double, __less_equal, <=) +CMP_OP(__vec8_d, double, double, __greater_than, >) +CMP_OP(__vec8_d, double, double, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_d) +INSERT_EXTRACT(__vec8_d, double) +SMEAR(__vec8_d, double, double) +SETZERO(__vec8_d, double) +UNDEF(__vec8_d, double) +BROADCAST(__vec8_d, double, double) +ROTATE(__vec8_d, double, double) +SHUFFLES(__vec8_d, double, double) +LOAD_STORE(__vec8_d, double) +#else /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_d __add(__vec8_d a, __vec8_d b) { + return _mm512_add_pd(a, b); +} +static FORCEINLINE __vec8_d __sub(__vec8_d a, __vec8_d b) { + return _mm512_sub_pd(a, b); +} +static FORCEINLINE __vec8_d __mul(__vec8_d a, __vec8_d b) { + return _mm512_mul_pd(a, b); +} + +static FORCEINLINE __vec8_d __div(__vec8_d a, __vec8_d b) { + return _mm512_div_pd(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpeq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpneq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmplt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmplt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmple_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmple_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnle_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnle_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnlt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnlt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpunord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_d __select(__vec8_i1 mask, __vec8_d a, __vec8_d b) { + return _mm512_mask_mov_pd(b, mask, a); +} + + +static FORCEINLINE __vec8_d __select(bool cond, __vec8_d a, __vec8_d b) { + return cond ? 
a : b;
+}
+
+static FORCEINLINE double __extract_element(__vec8_d v, uint32_t index) {
+    return ((double *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec8_d *v, uint32_t index, double val) {
+    ((double *)v)[index] = val;
+}
+
+template <class RetVecType> RetVecType __smear_double(double d);
+template <> static FORCEINLINE __vec8_d __smear_double<__vec8_d>(double d) { return _mm512_set1_pd(d); }
+
+template <class RetVecType> RetVecType __setzero_double();
+template <> static FORCEINLINE __vec8_d __setzero_double<__vec8_d>() { return _mm512_setzero_pd(); }
+
+template <class RetVecType> RetVecType __undef_double();
+template <> static FORCEINLINE __vec8_d __undef_double<__vec8_d>() { return __vec8_d();}
+
+static FORCEINLINE __vec8_d __broadcast_double(__vec8_d v, int index) {
+    double val = __extract_element(v, index & 0x7);
+    return _mm512_set1_pd(val);
+}
+
+ROTATE(__vec8_d, double, double)
+SHUFFLES(__vec8_d, double, double)
+
+template <int ALIGN> static FORCEINLINE __vec8_d __load(const __vec8_d *p) {
+    __vec8_d ret;
+    ret.v = _mm512_extloadunpacklo_pd(ret.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+    ret.v = _mm512_extloadunpackhi_pd(ret.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+    return ret;
+}
+
+template <int ALIGN> static FORCEINLINE void __store(__vec8_d *p, __vec8_d v) {
+    _mm512_extpackstorelo_pd(               p, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+}
+
+
+#if 0
+template <> static FORCEINLINE __vec8_d __load<64>(const __vec8_d *p) {
+    return _mm512_load_pd(p);
+}
+template <> static FORCEINLINE __vec8_d __load<128>(const __vec8_d *p) {
+    return __load<64>(p);
+}
+template <> static FORCEINLINE void __store<64>(__vec8_d *p, __vec8_d v) {
+    _mm512_store_pd(p, v.v);
+}
+template <> static FORCEINLINE void __store<128>(__vec8_d *p, __vec8_d v) {
+    __store<64>(p, v);
+}
+#endif
+#endif /* evghenii::double */
+
+///////////////////////////////////////////////////////////////////////////
+// casts
+
+
+#define CAST(TO, STO, FROM, SFROM, FUNC) \
+static FORCEINLINE TO FUNC(TO, FROM val) { \
+    TO ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = (STO)((SFROM)(val[i])); \
+    return ret; \
+}
+
+// sign extension conversions
+CAST(__vec8_i64, int64_t, __vec8_i32, int32_t, __cast_sext)
+CAST(__vec8_i64, int64_t, __vec8_i16, int16_t, __cast_sext)
+CAST(__vec8_i64, int64_t, __vec8_i8, int8_t, __cast_sext)
+CAST(__vec8_i32, int32_t, __vec8_i16, int16_t, __cast_sext)
+CAST(__vec8_i32, int32_t, __vec8_i8, int8_t, __cast_sext)
+CAST(__vec8_i16, int16_t, __vec8_i8, int8_t, __cast_sext)
+
+#define CAST_SEXT_I1(TYPE) \
+static FORCEINLINE TYPE __cast_sext(TYPE, __vec8_i1 v) { \
+    TYPE ret; \
+    for (int i = 0; i < 8; ++i) { \
+        ret[i] = 0; \
+        if (v.v & (1 << i)) \
+            ret[i] = ~ret[i]; \
+    } \
+    return ret; \
+}
+
+CAST_SEXT_I1(__vec8_i8)
+CAST_SEXT_I1(__vec8_i16)
+#if 0
+CAST_SEXT_I1(__vec8_i32)
+#else
+static FORCEINLINE __vec8_i32 __cast_sext(const __vec8_i32 &, const __vec8_i1 &val)
+{
+    __vec8_i32 ret = _mm512_setzero_epi32();
+    __vec8_i32 one = _mm512_set1_epi32(-1);
+    return _mm512_mask_mov_epi32(ret, 0xFF & val, one);
+}
+#endif
+CAST_SEXT_I1(__vec8_i64)
+
+// zero extension
+CAST(__vec8_i64, uint64_t, __vec8_i32, uint32_t, __cast_zext)
+CAST(__vec8_i64, uint64_t, __vec8_i16, uint16_t, __cast_zext)
+CAST(__vec8_i64, uint64_t, __vec8_i8, uint8_t, __cast_zext)
+CAST(__vec8_i32, uint32_t, __vec8_i16, uint16_t, __cast_zext)
+CAST(__vec8_i32, uint32_t, __vec8_i8, uint8_t, __cast_zext)
+CAST(__vec8_i16, uint16_t, __vec8_i8, uint8_t, __cast_zext)
+
+#define CAST_ZEXT_I1(TYPE) \
+static FORCEINLINE TYPE __cast_zext(TYPE, __vec8_i1 v) { \
+    TYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = (v.v & (1 << i)) ? 1 : 0; \
+    return ret; \
+}
+
+CAST_ZEXT_I1(__vec8_i8)
+CAST_ZEXT_I1(__vec8_i16)
+#if 0
+CAST_ZEXT_I1(__vec8_i32)
+#else
+static FORCEINLINE __vec8_i32 __cast_zext(const __vec8_i32 &, const __vec8_i1 &val)
+{
+    __vec8_i32 ret = _mm512_setzero_epi32();
+    __vec8_i32 one = _mm512_set1_epi32(1);
+    return _mm512_mask_mov_epi32(ret, 0xFF & val, one);
+}
+#endif
+CAST_ZEXT_I1(__vec8_i64)
+
+// truncations
+CAST(__vec8_i32, int32_t, __vec8_i64, int64_t, __cast_trunc)
+CAST(__vec8_i16, int16_t, __vec8_i64, int64_t, __cast_trunc)
+CAST(__vec8_i8, int8_t, __vec8_i64, int64_t, __cast_trunc)
+CAST(__vec8_i16, int16_t, __vec8_i32, int32_t, __cast_trunc)
+CAST(__vec8_i8, int8_t, __vec8_i32, int32_t, __cast_trunc)
+CAST(__vec8_i8, int8_t, __vec8_i16, int16_t, __cast_trunc)
+
+// signed int to float/double
+#if 0
+CAST(__vec8_f, float, __vec8_i8, int8_t, __cast_sitofp)
+CAST(__vec8_f, float, __vec8_i16, int16_t, __cast_sitofp)
+CAST(__vec8_f, float, __vec8_i32, int32_t, __cast_sitofp)
+#else
+static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i8  val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);}
+static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);}
+static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepi32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);}
+#endif
+CAST(__vec8_f, float, __vec8_i64, int64_t, __cast_sitofp)
+#if 0
+CAST(__vec8_d, double, __vec8_i8, int8_t, __cast_sitofp)
+CAST(__vec8_d, double, __vec8_i16, int16_t, __cast_sitofp)
+CAST(__vec8_d, double, __vec8_i32, int32_t, __cast_sitofp)
+#else
+static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i8 val) {
+    __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);
+    return _mm512_cvtepi32lo_pd(vi);
+}
+
+static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i16 val) {
+    __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);
+    return _mm512_cvtepi32lo_pd(vi);
+}
+
+static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i32 val) {
+    return _mm512_cvtepi32lo_pd(val);
+}
+#endif
+CAST(__vec8_d, double, __vec8_i64, int64_t, __cast_sitofp)
+
+// unsigned int to float/double
+#if 0
+CAST(__vec8_f, float, __vec8_i8, uint8_t, __cast_uitofp)
+CAST(__vec8_f, float, __vec8_i16, uint16_t, __cast_uitofp)
+CAST(__vec8_f, float, __vec8_i32, uint32_t, __cast_uitofp)
+#else
+static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i8  val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);}
+static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);}
+static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepu32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);}
+#endif
+CAST(__vec8_f, float, __vec8_i64, uint64_t, __cast_uitofp)
+#if 0
+CAST(__vec8_d, double, __vec8_i8, uint8_t, __cast_uitofp)
+CAST(__vec8_d, double, __vec8_i16, uint16_t, __cast_uitofp)
+CAST(__vec8_d, double, __vec8_i32, uint32_t, __cast_uitofp)
+#else
+static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i8 val) {
+    __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);
+    return _mm512_cvtepu32lo_pd(vi);
+}
+
+static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i16 val) {
+    __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);
+    return _mm512_cvtepu32lo_pd(vi);
+}
+
+static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i32 val) {
+    return _mm512_cvtepu32lo_pd(val);
+}
+#endif
+CAST(__vec8_d, double, __vec8_i64, uint64_t, __cast_uitofp)
+
+#if 0
+static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) {
+    __vec8_f ret;
+    for (int i = 0; i < 8; ++i)
+        ret[i] = (v.v & (1 << i)) ? 1. : 0.;
+    return ret;
+}
+#else
+static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v)
+{
+    const __m512 ret = _mm512_setzero_ps();
+    const __m512 one = _mm512_set1_ps(1.0);
+    return _mm512_mask_mov_ps(ret, v & 0xFF, one);
+}
+#endif
+
+// float/double to signed int
+CAST(__vec8_i8, int8_t, __vec8_f, float, __cast_fptosi)
+CAST(__vec8_i16, int16_t, __vec8_f, float, __cast_fptosi)
+#if 0
+CAST(__vec8_i32, int32_t, __vec8_f, float, __cast_fptosi)
+#else
+static FORCEINLINE __vec8_i32 __cast_fptosi(__vec8_i32, __vec8_f val) {
+    return _mm512_mask_cvtfxpnt_round_adjustps_epi32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE);
+}
+#endif
+CAST(__vec8_i64, int64_t, __vec8_f, float, __cast_fptosi)
+CAST(__vec8_i8, int8_t, __vec8_d, double, __cast_fptosi)
+CAST(__vec8_i16, int16_t, __vec8_d, double, __cast_fptosi)
+#if 1
+CAST(__vec8_i32, int32_t, __vec8_d, double, __cast_fptosi)
+#else
+#endif
+CAST(__vec8_i64, int64_t, __vec8_d, double, __cast_fptosi)
+
+// float/double to unsigned int
+CAST(__vec8_i8, uint8_t, __vec8_f, float, __cast_fptoui)
+CAST(__vec8_i16, uint16_t, __vec8_f, float, __cast_fptoui)
+#if 0
+CAST(__vec8_i32, uint32_t, __vec8_f, float, __cast_fptoui)
+#else
+static FORCEINLINE __vec8_i32 __cast_fptoui(__vec8_i32, __vec8_f val) {
+    return _mm512_mask_cvtfxpnt_round_adjustps_epu32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE);
+}
+#endif
+CAST(__vec8_i64, uint64_t, __vec8_f, float, __cast_fptoui)
+CAST(__vec8_i8, uint8_t, __vec8_d, double, __cast_fptoui)
+CAST(__vec8_i16, uint16_t, __vec8_d, double, __cast_fptoui)
+#if 1
+CAST(__vec8_i32, uint32_t, __vec8_d, double, __cast_fptoui)
+#else
+#endif
+CAST(__vec8_i64, uint64_t, __vec8_d, double, __cast_fptoui)
+
+// float/double conversions
+#if 0
+CAST(__vec8_f, float, __vec8_d, double, __cast_fptrunc)
+CAST(__vec8_d, double, __vec8_f, float, __cast_fpext)
+#else
+static FORCEINLINE __vec8_f __cast_fptrunc(__vec8_f, __vec8_d val) {
+    return _mm512_mask_cvtpd_pslo(FZERO, 0xFF, val);
+}
+static FORCEINLINE __vec8_d __cast_fpext(__vec8_d, __vec8_f val) {
+    return _mm512_cvtpslo_pd(val);
+}
+#endif
+
+typedef union {
+    int32_t i32;
+    float f;
+    int64_t i64;
+    double d;
+} BitcastUnion;
+
+#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \
+static FORCEINLINE TO __cast_bits(TO, FROM val) { \
+    TO r; \
+    for (int i = 0; i < 8; ++i) { \
+        BitcastUnion u; \
+        u.FROM_ELT = val[i]; \
+        r[i] = u.TO_ELT; \
+    } \
+    return r; \
+}
+
+#if 0
+CAST_BITS(__vec8_f, f, __vec8_i32, i32)
+CAST_BITS(__vec8_i32, i32, __vec8_f, f)
+#else
+static FORCEINLINE __vec8_f __cast_bits(__vec8_f, __vec8_i32 val) {
+    return _mm512_castsi512_ps(val);
+}
+static FORCEINLINE __vec8_i32 __cast_bits(__vec8_i32, __vec8_f val) {
+    return _mm512_castps_si512(val);
+}
+#endif
+
+#if 0
+CAST_BITS(__vec8_d, d, __vec8_i64, i64)
+CAST_BITS(__vec8_i64, i64, __vec8_d, d)
+#else
+static FORCEINLINE __vec8_i64 __cast_bits(__vec8_i64, __vec8_d val) {
+    return *(__vec8_i64*)&val;
+}
+static FORCEINLINE __vec8_d __cast_bits(__vec8_d, __vec8_i64 val) {
+    return *(__vec8_d*)&val;
+}
+#endif
+
+#define CAST_BITS_SCALAR(TO, FROM) \
+static FORCEINLINE TO __cast_bits(TO, FROM v) { \
+    union { \
+        TO to; \
+        FROM from; \
+    } u; \
+    u.from = v; \
+    return u.to; \
+}
+
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+
+static FORCEINLINE void __fastmath() {
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+    return roundf(v);
+}
+
+static FORCEINLINE float __floor_uniform_float(float v) {
+    return floorf(v);
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+    return ceilf(v);
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+    return round(v);
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+    return floor(v);
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+    return ceil(v);
+}
+
+#if 0
+UNARY_OP(__vec8_f, __round_varying_float, roundf)
+UNARY_OP(__vec8_f, __floor_varying_float, floorf)
+UNARY_OP(__vec8_f, __ceil_varying_float, ceilf)
+#else
+static FORCEINLINE __vec8_f __round_varying_float(__vec8_f v) {
+    return _mm512_mask_round_ps(FZERO, 0xFF, v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);
+}
+
+static FORCEINLINE __vec8_f __floor_varying_float(__vec8_f v) {
+    return _mm512_mask_floor_ps(FZERO, 0xFF, v);
+}
+
+static FORCEINLINE __vec8_f __ceil_varying_float(__vec8_f v) {
+    return _mm512_mask_ceil_ps(FZERO, 0xFF, v);
+}
+#endif
+
+#if 0
+UNARY_OP(__vec8_d, __round_varying_double, round)
+UNARY_OP(__vec8_d, __floor_varying_double, floor)
+UNARY_OP(__vec8_d, __ceil_varying_double, ceil)
+#else
+static FORCEINLINE __vec8_d __round_varying_double(__vec8_d v) {
+    return _mm512_svml_round_pd(v);
+}
+
+static FORCEINLINE __vec8_d __floor_varying_double(__vec8_d v) {
+    return _mm512_floor_pd(v);
+}
+
+static FORCEINLINE __vec8_d __ceil_varying_double(__vec8_d v) {
+    return _mm512_ceil_pd(v);
+}
+#endif
+
+
+// min/max
+
+static FORCEINLINE float __min_uniform_float(float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float __max_uniform_float(float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32(int32_t a, int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE uint32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE uint32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64(int64_t a, int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE uint64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE uint64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ? a : b; }
+
+
+#if 0
+BINARY_OP_FUNC(__vec8_f, __max_varying_float, __max_uniform_float)
+BINARY_OP_FUNC(__vec8_f, __min_varying_float, __min_uniform_float)
+#else
+static FORCEINLINE __vec8_f __max_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmax_ps(FZERO, 0xFF, v1, v2);}
+static FORCEINLINE __vec8_f __min_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmin_ps(FZERO, 0xFF, v1, v2);}
+#endif
+
+#if 0
+BINARY_OP_FUNC(__vec8_d, __max_varying_double, __max_uniform_double)
+BINARY_OP_FUNC(__vec8_d, __min_varying_double, __min_uniform_double)
+#else
+static FORCEINLINE __vec8_d __max_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmax_pd(v1,v2); }
+static FORCEINLINE __vec8_d __min_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmin_pd(v1,v2); }
+#endif
+
+#if 0
+BINARY_OP_FUNC(__vec8_i32, __max_varying_int32, __max_uniform_int32)
+BINARY_OP_FUNC(__vec8_i32, __min_varying_int32, __min_uniform_int32)
+BINARY_OP_FUNC(__vec8_i32, __max_varying_uint32, __max_uniform_uint32)
+BINARY_OP_FUNC(__vec8_i32, __min_varying_uint32, __min_uniform_uint32)
+#else
+static FORCEINLINE __vec8_i32 __max_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epi32(IZERO,0xFF, v1, v2);}
+static FORCEINLINE __vec8_i32 __min_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epi32(IZERO,0xFF, v1, v2);}
+static FORCEINLINE __vec8_i32 __max_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epu32(IZERO,0xFF, v1, v2);}
+static FORCEINLINE __vec8_i32 __min_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epu32(IZERO,0xFF, v1, v2);}
+#endif
+
+BINARY_OP_FUNC(__vec8_i64, __max_varying_int64, __max_uniform_int64)
+BINARY_OP_FUNC(__vec8_i64, __min_varying_int64, __min_uniform_int64)
+BINARY_OP_FUNC(__vec8_i64, __max_varying_uint64, __max_uniform_uint64)
+BINARY_OP_FUNC(__vec8_i64, __min_varying_uint64, __min_uniform_uint64)
+
+// sqrt/rsqrt/rcp
+
+static FORCEINLINE float __rsqrt_uniform_float(float v) {
+    return 1.f / sqrtf(v);
+}
+
+static FORCEINLINE float __rcp_uniform_float(float v) {
+    return 1.f / v;
+}
+
+static FORCEINLINE float __sqrt_uniform_float(float v) {
+    return sqrtf(v);
+}
+
+static FORCEINLINE double __sqrt_uniform_double(double v) {
+    return sqrt(v);
+}
+
+#if 0
+UNARY_OP(__vec8_f, __rcp_varying_float, __rcp_uniform_float)
+UNARY_OP(__vec8_f, __rsqrt_varying_float, __rsqrt_uniform_float)
+UNARY_OP(__vec8_f, __sqrt_varying_float, __sqrt_uniform_float)
+#else
+static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) {
+#ifdef ISPC_FAST_MATH
+    return _mm512_mask_rcp23_ps(FZERO, 0xFF, v); // Approximation with 23 bits of accuracy.
+#else
+    return _mm512_mask_recip_ps(FZERO, 0xFF, v);
+#endif
+}
+
+static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) {
+#ifdef ISPC_FAST_MATH
+    return _mm512_mask_rsqrt23_ps(FZERO,0xFF,v); // Approximation with 0.775ULP accuracy
+#else
+    return _mm512_mask_invsqrt_ps(FZERO,0xFF,v);
+#endif
+}
+static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);}
+#endif
+
+#if 0
+UNARY_OP(__vec8_d, __sqrt_varying_double, __sqrt_uniform_double)
+#else
+static FORCEINLINE __vec8_d __sqrt_varying_double(__vec8_d v) { return _mm512_sqrt_pd(v); }
+#endif
+
+///////////////////////////////////////////////////////////////////////////
+// svml
+///////////////////////////////////////////////////////////////////////////
+
+static FORCEINLINE __vec8_f __svml_logf(__vec8_f v) { return _mm512_mask_log_ps(FZERO,0xFF,v); }
+static FORCEINLINE __vec8_f __svml_expf(__vec8_f v) { return _mm512_mask_exp_ps(FZERO,0xFF,v); }
+static FORCEINLINE __vec8_f __svml_cosf(__vec8_f v) { return _mm512_mask_cos_ps(FZERO,0xFF,v); }
+static FORCEINLINE __vec8_f __svml_powf(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO,0xFF,a,b); }
+
+static FORCEINLINE __vec8_d __svml_logd(__vec8_d v) { return _mm512_log_pd(v); }
+static FORCEINLINE __vec8_d __svml_expd(__vec8_d v) { return _mm512_exp_pd(v); }
+static FORCEINLINE __vec8_d __svml_cosd(__vec8_d v) { return _mm512_cos_pd(v); }
+static FORCEINLINE __vec8_d __svml_powd(__vec8_d a, __vec8_d b) { return _mm512_pow_pd(a,b); }
+
+///////////////////////////////////////////////////////////////////////////
+// bit ops
+
+static FORCEINLINE int32_t __popcnt_int32(uint32_t v) {
+    int count = 0;
+    for (; v != 0; v >>= 1)
+        count += (v & 1);
+    return count;
+}
+
+static FORCEINLINE int32_t __popcnt_int64(uint64_t v) {
+    int count = 0;
+    for (; v != 0; v >>= 1)
+        count += (v & 1);
+    return count;
+}
+
+static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) {
+    if (v == 0)
+        return 32;
+
+    int count = 0;
+    while ((v & 1) == 0) {
+        ++count;
+        v >>= 1;
+    }
+    return count;
+}
+
+static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) {
+    if (v == 0)
+        return 64;
+
+    int count = 0;
+    while ((v & 1) == 0) {
+        ++count;
+        v >>= 1;
+    }
+    return count;
+}
+
+static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) {
+    if (v == 0)
+        return 32;
+
+    int count = 0;
+    while ((v & (1u<<31)) == 0) {
+        ++count;
+        v <<= 1;
+    }
+    return count;
+}
+
+static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) {
+    if (v == 0)
+        return 64;
+
+    int count = 0;
+    while ((v & (1ull<<63)) == 0) {
+        ++count;
+        v <<= 1;
+    }
+    return count;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// reductions
+
+#if 0
+REDUCE_ADD(float, __vec8_f, __reduce_add_float)
+REDUCE_MINMAX(float, __vec8_f, __reduce_min_float, <)
+REDUCE_MINMAX(float, __vec8_f, __reduce_max_float, >)
+#else
+static FORCEINLINE float __reduce_add_float(__vec8_f v) { return _mm512_mask_reduce_add_ps(0xFF,v); }
+static FORCEINLINE float __reduce_min_float(__vec8_f v) { return _mm512_mask_reduce_min_ps(0xFF,v); }
+static FORCEINLINE float __reduce_max_float(__vec8_f v) { return _mm512_mask_reduce_max_ps(0xFF,v); }
+#endif
+
+#if 0
+REDUCE_ADD(double, __vec8_d, __reduce_add_double)
+REDUCE_MINMAX(double, __vec8_d, __reduce_min_double, <)
+REDUCE_MINMAX(double, __vec8_d, __reduce_max_double, >)
+#else
+static FORCEINLINE double __reduce_add_double(__vec8_d v) { return _mm512_reduce_add_pd(v); }
+static FORCEINLINE double __reduce_min_double(__vec8_d v) { return _mm512_reduce_min_pd(v); }
+static FORCEINLINE double __reduce_max_double(__vec8_d v) { return _mm512_reduce_max_pd(v); }
+#endif
+
+
+
+#if 0
+REDUCE_ADD   (int64_t, __vec8_i32, __reduce_add_int32)
+REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_min_int32, <)
+REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_max_int32, >)
+REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_min_uint32, <)
+REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_max_uint32, >)
+#else
+static FORCEINLINE  int64_t __reduce_add_int32  (__vec8_i32 v) { return _mm512_mask_reduce_add_epi32(0xFF, v);}
+static FORCEINLINE  int32_t __reduce_min_int32  (__vec8_i32 v) { return _mm512_mask_reduce_min_epi32(0xFF, v);}
+static FORCEINLINE  int32_t __reduce_max_int32  (__vec8_i32 v) { return _mm512_mask_reduce_max_epi32(0xFF, v);}
+static FORCEINLINE uint32_t __reduce_min_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epu32(0xFF, v);}
+static FORCEINLINE uint32_t __reduce_max_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epu32(0xFF, v);}
+#endif
+
+REDUCE_ADD   ( int16_t, __vec8_i8,  __reduce_add_int8)
+REDUCE_ADD   ( int32_t, __vec8_i16, __reduce_add_int16)
+REDUCE_ADD   ( int64_t, __vec8_i64, __reduce_add_int64)
+REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_min_int64, <)
+REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_max_int64, >)
+REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_min_uint64, <)
+REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_max_uint64, >)
+
+///////////////////////////////////////////////////////////////////////////
+// masked load/store
+
+static FORCEINLINE __vec8_i8 __masked_load_i8(void *p,
+                                              __vec8_i1 mask) {
+    __vec8_i8 ret;
+    int8_t *ptr = (int8_t *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret[i] = ptr[i];
+    return ret;
+}
+
+static FORCEINLINE __vec8_i16 __masked_load_i16(void *p,
+                                                __vec8_i1 mask) {
+    __vec8_i16 ret;
+    int16_t *ptr = (int16_t *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret[i] = ptr[i];
+    return ret;
+}
+
+#if 0
+static FORCEINLINE __vec8_i32 __masked_load_i32(void *p,
+                                                __vec8_i1 mask) {
+    __vec8_i32 ret;
+    int32_t *ptr = (int32_t *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret[i] = ptr[i];
+    return ret;
+}
+#else
+static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, __vec8_i1 mask) {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
+    return _mm512_mask_load_epi32(__vec8_i32(), mask, p);
+#else
+    __vec8_i32 tmp;
+    tmp = _mm512_mask_extloadunpacklo_epi32(tmp, 0xFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    tmp = _mm512_mask_extloadunpackhi_epi32(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    __vec8_i32 ret;
+    return _mm512_mask_mov_epi32(ret, 0xFF & mask, tmp);
+#endif
+}
+#endif
+
+#if 0
+static FORCEINLINE __vec8_f __masked_load_float(void *p,
+                                                __vec8_i1 mask) {
+    __vec8_f ret;
+    float *ptr = (float *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret[i] = ptr[i];
+    return ret;
+}
+#else
+static FORCEINLINE __vec8_f __masked_load_float(void *p, __vec8_i1 mask) {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
+    return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p);
+#else
+    __vec8_f tmp;
+    tmp = _mm512_mask_extloadunpacklo_ps(tmp, 0xFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+    tmp = _mm512_mask_extloadunpackhi_ps(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+    __vec8_f ret;
+    return _mm512_mask_mov_ps(ret, 0xFF & mask, tmp);
+#endif
+}
+#endif
+
+static FORCEINLINE __vec8_i64 __masked_load_i64(void *p,
+                                                __vec8_i1 mask) {
+    __vec8_i64 ret;
+    int64_t *ptr = (int64_t *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret[i] = ptr[i];
+    return ret;
+}
+
+#if 0
+static FORCEINLINE __vec8_d __masked_load_double(void *p,
+                                                 __vec8_i1 mask) {
+    __vec8_d ret;
+    double *ptr = (double *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret[i] = ptr[i];
+    return ret;
+}
+#else
+static FORCEINLINE __vec8_d __masked_load_double(void *p, __vec8_i1 mask) {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
+    __vec8_d ret = _mm512_setzero_pd();
+    ret = _mm512_mask_load_pd(ret, 0xFF & mask, p);
+    return ret;
+#else
+    __vec8_d tmp = _mm512_setzero_pd();
+    tmp.v = _mm512_mask_extloadunpacklo_pd(tmp.v, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+    tmp.v = _mm512_mask_extloadunpackhi_pd(tmp.v, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+    __vec8_d ret = _mm512_setzero_pd();
+    ret.v = _mm512_mask_mov_pd(ret.v, mask, tmp.v);
+    return ret;
+#endif
+}
+#endif
+
+
+static FORCEINLINE void __masked_store_i8(void *p, __vec8_i8 val,
+                                          __vec8_i1 mask) {
+    int8_t *ptr = (int8_t *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val[i];
+}
+
+static FORCEINLINE void __masked_store_i16(void *p, __vec8_i16 val,
+                                           __vec8_i1 mask) {
+    int16_t *ptr = (int16_t *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val[i];
+}
+
+#if 0
+static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val,
+                                           __vec8_i1 mask) {
+    int32_t *ptr = (int32_t *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val[i];
+}
+#else
+static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, __vec8_i1 mask) {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
+    _mm512_mask_store_epi32(p, mask, val.v);
+#else
+    __vec8_i32 tmp;
+    tmp = _mm512_extloadunpacklo_epi32(tmp, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    tmp = _mm512_extloadunpackhi_epi32(tmp, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    tmp = _mm512_mask_mov_epi32(tmp, 0xFF & mask, val);
+    _mm512_mask_extpackstorelo_epi32(               p, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+#endif
+}
+#endif
+
+#if 0
+static FORCEINLINE void __masked_store_float(void *p, __vec8_f val,
+                                             __vec8_i1 mask) {
+    float *ptr = (float *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val[i];
+}
+#else
+static FORCEINLINE void __masked_store_float(void *p, __vec8_f val,
+                                             __vec8_i1 mask) {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
+    _mm512_mask_store_ps(p, 0xFF & mask, val.v);
+#else
+    __vec8_f tmp = FZERO;
+    tmp = _mm512_extloadunpacklo_ps(tmp, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+    tmp = _mm512_extloadunpackhi_ps(tmp, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+    tmp = _mm512_mask_mov_ps(tmp, 0xFF & mask, val);
+    _mm512_mask_extpackstorelo_ps(               p, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+    _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+#endif
+}
+#endif
+
+static FORCEINLINE void __masked_store_i64(void *p, __vec8_i64 val,
+                                           __vec8_i1 mask) {
+    int64_t *ptr = (int64_t *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val[i];
+}
+
+#if 0
+static FORCEINLINE void __masked_store_double(void *p, __vec8_d val,
+                                              __vec8_i1 mask) {
+    double *ptr = (double *)p;
+    for (int i = 0; i < 8; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val[i];
+}
+#else
+static FORCEINLINE void __masked_store_double(void *p, __vec8_d val,
+                                              __vec8_i1 mask) {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
+    
_mm512_mask_store_pd(p, mask, val.v); +#else + __vec8_d tmp; + tmp.v = _mm512_extloadunpacklo_pd(tmp.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_pd(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_pd(tmp.v, mask, val.v); + _mm512_extpackstorelo_pd(p, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec8_f val, + __vec8_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec8_d val, + __vec8_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec8_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __gather_base_offsets32_i8) +#else +static FORCEINLINE __vec8_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
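+    // For reference: _MM_UPCONV_EPI32_SINT8 below makes the gather read one
+    // byte per lane and sign-extend it to 32 bits; the extstore afterwards
+    // narrows the 32-bit lanes back down to int8.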
+ __vec8_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec8_i8 ret; + _mm512_mask_extstore_epi32(ret.data,0xFF,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __gather_base_offsets64_i8) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __gather_base_offsets64_i16) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __gather_base_offsets32_i32) +#else +static FORCEINLINE __vec8_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __gather_base_offsets64_i32) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __gather_base_offsets32_float) +#else +static FORCEINLINE __vec8_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), 0xFF & mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __gather_base_offsets64_float) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __gather_base_offsets64_i64) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __gather_base_offsets32_double) +#else +static FORCEINLINE __vec8_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + __vec8_d ret; + ret.v = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#if 0 + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#endif + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __gather_base_offsets64_double) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +#if 1 +/***********/ +GATHER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec8_f, float, __vec8_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec8_d, double, 
__vec8_i32, __gather32_double, __gather_base_offsets32_double)
+/***********/
+GATHER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __gather64_i8);
+GATHER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __gather64_i16);
+GATHER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __gather64_i32);
+GATHER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __gather64_i64);
+GATHER_GENERAL(__vec8_f, float, __vec8_i64, __gather64_float);
+GATHER_GENERAL(__vec8_d, double, __vec8_i64, __gather64_double);
+/***********/
+#endif
+
+// scatter
+
+#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                 \
+static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale,          \
+                             OTYPE offset, VTYPE val,                   \
+                             __vec8_i1 mask) {                          \
+    int8_t *base = (int8_t *)b;                                         \
+    for (int i = 0; i < 8; ++i)                                         \
+        if ((mask.v & (1 << i)) != 0) {                                 \
+            STYPE *ptr = (STYPE *)(base + scale * offset[i]);           \
+            *ptr = val[i];                                              \
+        }                                                               \
+}
+
+
+/*****************/
+SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __scatter_base_offsets32_i8)
+SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __scatter_base_offsets64_i8)
+/*****************/
+SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __scatter_base_offsets32_i16)
+SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __scatter_base_offsets64_i16)
+/*****************/
+#if 0
+SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __scatter_base_offsets32_i32)
+#else
+static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec8_i32 offsets, __vec8_i32 val, __vec8_i1 mask)
+{
+    _mm512_mask_i32extscatter_epi32(b, 0xFF & mask, offsets, val,
+                                    _MM_DOWNCONV_EPI32_NONE, scale,
+                                    _MM_HINT_NONE);
+}
+#endif
+SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __scatter_base_offsets64_i32)
+/*****************/
+#if 0
+SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __scatter_base_offsets32_float)
+#else
+static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec8_i32 offsets,
+                                                       __vec8_f val, __vec8_i1 mask)
+{
+    _mm512_mask_i32extscatter_ps(base, 0xFF & mask, offsets, val,
+                                 _MM_DOWNCONV_PS_NONE, scale,
+                                 _MM_HINT_NONE);
+}
+#endif
+SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __scatter_base_offsets64_float)
+/*****************/
+SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __scatter_base_offsets32_i64)
+SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __scatter_base_offsets64_i64)
+/*****************/
+#if 0 /* evghenii::to implement */
+SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __scatter_base_offsets32_double)
+#else /* evghenii:testme */
+static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec8_i32 offsets,
+                                                        __vec8_d val, __vec8_i1 mask)
+{
+    _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v,
+                                   _MM_DOWNCONV_PD_NONE, scale,
+                                   _MM_HINT_NONE);
+}
+#endif
+SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __scatter_base_offsets64_double)
+
+#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)                    \
+static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \
+    for (int i = 0; i < 8; ++i)                                         \
+        if ((mask.v & (1 << i)) != 0) {                                 \
+            STYPE *ptr = (STYPE *)ptrs[i];                              \
+            *ptr = val[i];                                              \
+        }                                                               \
+}
+#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1)             \
+static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \
+    FUNC1(0, 1, ptrs, val, mask);                                       \
+}
+
+#if 1
+/***********/
+SCATTER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __scatter32_i8, __scatter_base_offsets32_i8)
+SCATTER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __scatter32_i16, 
__scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec8_f, float, __vec8_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec8_d, double, __vec8_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __scatter64_i8) +SCATTER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __scatter64_i16) +SCATTER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __scatter64_i32) +SCATTER_GENERAL(__vec8_f, float, __vec8_i64, __scatter64_float) +SCATTER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __scatter64_i64) +SCATTER_GENERAL(__vec8_d, double, __vec8_i64, __scatter64_double) +/***********/ +#endif + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +#if 0 +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#else +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +#endif + 
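+// For reference, the loadunpack/packstore pairs above compress and expand
+// the active lanes contiguously; e.g. (illustrative) with mask = 0b00001010,
+// __packed_store_active writes val[1] and val[3] to p[0] and p[1] and
+// returns 2, matching the scalar reference code in the #if 0 branch above.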
+
+///////////////////////////////////////////////////////////////////////////
+// aos/soa
+
+static FORCEINLINE void __soa_to_aos3_float(__vec8_f v0, __vec8_f v1, __vec8_f v2,
+                                            float *ptr) {
+    for (int i = 0; i < 8; ++i) {
+        *ptr++ = __extract_element(v0, i);
+        *ptr++ = __extract_element(v1, i);
+        *ptr++ = __extract_element(v2, i);
+    }
+}
+
+static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec8_f *out0, __vec8_f *out1,
+                                            __vec8_f *out2) {
+    for (int i = 0; i < 8; ++i) {
+        __insert_element(out0, i, *ptr++);
+        __insert_element(out1, i, *ptr++);
+        __insert_element(out2, i, *ptr++);
+    }
+}
+
+static FORCEINLINE void __soa_to_aos4_float(__vec8_f v0, __vec8_f v1, __vec8_f v2,
+                                            __vec8_f v3, float *ptr) {
+    for (int i = 0; i < 8; ++i) {
+        *ptr++ = __extract_element(v0, i);
+        *ptr++ = __extract_element(v1, i);
+        *ptr++ = __extract_element(v2, i);
+        *ptr++ = __extract_element(v3, i);
+    }
+}
+
+static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec8_f *out0, __vec8_f *out1,
+                                            __vec8_f *out2, __vec8_f *out3) {
+    for (int i = 0; i < 8; ++i) {
+        __insert_element(out0, i, *ptr++);
+        __insert_element(out1, i, *ptr++);
+        __insert_element(out2, i, *ptr++);
+        __insert_element(out3, i, *ptr++);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////
+// prefetch
+
+static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) {
+    _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$
+}
+
+static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) {
+    _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$
+}
+
+static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) {
+    // There is no L3$ on KNC; don't pollute the L2$ unnecessarily
+}
+
+static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
+    _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint
+    // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
+}
+
+///////////////////////////////////////////////////////////////////////////
+// atomics
+
+static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+    return InterlockedAdd((LONG volatile *)p, v) - v;
+#else
+    return __sync_fetch_and_add(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+    return InterlockedAdd((LONG volatile *)p, -v) + v;
+#else
+    return __sync_fetch_and_sub(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+    return InterlockedAnd((LONG volatile *)p, v);
+#else
+    return __sync_fetch_and_and(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+    return InterlockedOr((LONG volatile *)p, v);
+#else
+    return __sync_fetch_and_or(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+    return InterlockedXor((LONG volatile *)p, v);
+#else
+    return __sync_fetch_and_xor(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) {
+    int32_t old, min;
+    do {
+        old = *((volatile int32_t *)p);
+        min = (old < (int32_t)v) ? old : (int32_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) {
+    int32_t old, max;
+    do {
+        old = *((volatile int32_t *)p);
+        max = (old > (int32_t)v) ? 
old : (int32_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) {
+    uint32_t old, min;
+    do {
+        old = *((volatile uint32_t *)p);
+        min = (old < v) ? old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) {
+    uint32_t old, max;
+    do {
+        old = *((volatile uint32_t *)p);
+        max = (old > v) ? old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+    return InterlockedExchange((LONG volatile *)p, v);
+#else
+    return __sync_lock_test_and_set(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval,
+                                             uint32_t newval) {
+#ifdef _MSC_VER
+    return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval);
+#else
+    return __sync_val_compare_and_swap(p, cmpval, newval);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedAdd64((LONGLONG volatile *)p, v) - v;
+#else
+    return __sync_fetch_and_add(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedAdd64((LONGLONG volatile *)p, -v) + v;
+#else
+    return __sync_fetch_and_sub(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedAnd64((LONGLONG volatile *)p, v);
+#else
+    return __sync_fetch_and_and(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedOr64((LONGLONG volatile *)p, v);
+#else
+    return __sync_fetch_and_or(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedXor64((LONGLONG volatile *)p, v);
+#else
+    return __sync_fetch_and_xor(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) {
+    int64_t old, min;
+    do {
+        old = *((volatile int64_t *)p);
+        min = (old < (int64_t)v) ? old : (int64_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) {
+    int64_t old, max;
+    do {
+        old = *((volatile int64_t *)p);
+        max = (old > (int64_t)v) ? old : (int64_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) {
+    uint64_t old, min;
+    do {
+        old = *((volatile uint64_t *)p);
+        min = (old < v) ? 
old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) {
+    uint64_t old, max;
+    do {
+        old = *((volatile uint64_t *)p);
+        max = (old > v) ? old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedExchange64((LONGLONG volatile *)p, v);
+#else
+    return __sync_lock_test_and_set(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval,
+                                             uint64_t newval) {
+#ifdef _MSC_VER
+    return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval);
+#else
+    return __sync_val_compare_and_swap(p, cmpval, newval);
+#endif
+}
+
+#ifdef WIN32
+#include <intrin.h>
+#define __clock __rdtsc
+#else // WIN32
+static FORCEINLINE uint64_t __clock() {
+    uint32_t low, high;
+#ifdef __x86_64
+    __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
+                          ::: "%rax", "%rbx", "%rcx", "%rdx" );
+#else
+    __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
+                          ::: "%eax", "%ebx", "%ecx", "%edx" );
+#endif
+    __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
+    return (uint64_t)high << 32 | low;
+}
+
+#endif // !WIN32
+
+#undef FORCEINLINE
+#undef PRE_ALIGN
+#undef POST_ALIGN
diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h
new file mode 100644
index 00000000..55d97566
--- /dev/null
+++ b/examples/intrinsics/knc-i1x8unsafe_fast.h
@@ -0,0 +1,2 @@
+#define __ZMM32BIT__
+#include "knc-i1x8.h"
diff --git a/run_tests.py b/run_tests.py
index 9729930f..2cca983e 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -362,10 +362,13 @@ def run_test(testname):
         gcc_isa=""
         if options.target == 'generic-4':
             gcc_isa = '-msse4.2'
-        if options.target == 'generic-8':
+        if (options.target == 'generic-8'):
+            if (options.include_file.find("knc-i1x8.h")!=-1 or options.include_file.find("knc-i1x8unsafe_fast.h")!=-1):
+                gcc_isa = '-mmic'
+            else:
                 gcc_isa = '-mavx'
         if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \
-            and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1):
+            and (options.include_file.find("knc-i1x16.h")!=-1 or options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1):
             gcc_isa = '-mmic'
 
         cc_cmd = "%s -O2 -I. %s %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \

From 4b1a0b4bc46f6a4503c1ebec8cbfa7b74ffc78a3 Mon Sep 17 00:00:00 2001
From: evghenii
Date: Wed, 18 Sep 2013 18:41:22 +0300
Subject: [PATCH 025/159] added fails

---
 examples/intrinsics/knc-i1x8unsafe_fast.h | 67 +++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h
index 55d97566..ce66ea11 100644
--- a/examples/intrinsics/knc-i1x8unsafe_fast.h
+++ b/examples/intrinsics/knc-i1x8unsafe_fast.h
@@ -1,2 +1,69 @@
 #define __ZMM32BIT__
 #include "knc-i1x8.h"
+
+/* the following tests fail because vec8_i32 and vec8_float are 512 and not 256 bits in size.
+ * not sure how it is possible to fix this, any suggestions? 
+33 / 1206 tests FAILED execution:
+    ./tests/array-gather-simple.ispc
+    ./tests/array-gather-vary.ispc
+    ./tests/array-multidim-gather-scatter.ispc
+    ./tests/array-scatter-vary.ispc
+    ./tests/atomics-5.ispc
+    ./tests/atomics-swap.ispc
+    ./tests/cfor-array-gather-vary.ispc
+    ./tests/cfor-gs-improve-varying-1.ispc
+    ./tests/cfor-struct-gather-2.ispc
+    ./tests/cfor-struct-gather-3.ispc
+    ./tests/cfor-struct-gather.ispc
+    ./tests/gather-struct-vector.ispc
+    ./tests/global-array-4.ispc
+    ./tests/gs-improve-varying-1.ispc
+    ./tests/half-1.ispc
+    ./tests/half-3.ispc
+    ./tests/half.ispc
+    ./tests/launch-3.ispc
+    ./tests/launch-4.ispc
+    ./tests/masked-scatter-vector.ispc
+    ./tests/masked-struct-scatter-varying.ispc
+    ./tests/new-delete-6.ispc
+    ./tests/ptr-24.ispc
+    ./tests/ptr-25.ispc
+    ./tests/short-vec-15.ispc
+    ./tests/struct-gather-2.ispc
+    ./tests/struct-gather-3.ispc
+    ./tests/struct-gather.ispc
+    ./tests/struct-ref-lvalue.ispc
+    ./tests/struct-test-118.ispc
+    ./tests/struct-vary-index-expr.ispc
+    ./tests/typedef-2.ispc
+    ./tests/vector-varying-scatter.ispc
+*/
+
+/* knc-i1x8.h has the following fails:
+3 / 1206 tests FAILED execution:
+    ./tests/half-1.ispc
+    ./tests/half-3.ispc
+    ./tests/half.ispc
+*/
+
+/* knc-i1x16.h has the following fails:
+5 / 1206 tests FAILED execution:
+    ./tests/assert-3.ispc
+    ./tests/half-1.ispc
+    ./tests/half-3.ispc
+    ./tests/half.ispc
+    ./tests/test-141.ispc
+*/
+
+/* generic-16, from which these knc-i1x*.h are derived, has the following fails:
+6 / 1206 tests FAILED execution:
+    ./tests/func-overload-max.ispc
+    ./tests/half-1.ispc
+    ./tests/half-3.ispc
+    ./tests/half.ispc
+    ./tests/test-141.ispc
+    ./tests/test-143.ispc
+*/
+
+
+

From e4b1f585952d4748818d01995f24c04d35c4c0b0 Mon Sep 17 00:00:00 2001
From: evghenii
Date: Wed, 18 Sep 2013 19:14:41 +0300
Subject: [PATCH 026/159] performance fix.. 
still some issues left with equal_i1 for __vec8_i1 --- examples/intrinsics/knc-i1x16.h | 52 ++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 8b1a2bb9..ebffa4d6 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -480,46 +480,63 @@ INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) { - return (uint64_t)mask.v; +static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { + return _mm512_kmov(mask); } static FORCEINLINE bool __any(__vec16_i1 mask) { - return (mask.v!=0); + return !_mm512_kortestz(mask, mask); } static FORCEINLINE bool __all(__vec16_i1 mask) { - return (mask.v==0xFFFF); + return _mm512_kortestc(mask, mask); } static FORCEINLINE bool __none(__vec16_i1 mask) { - return (mask.v==0); + return _mm512_kortestz(mask, mask); } +#if 0 +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { +#if 0 + return _mm512_kand(a,b); /* this fails some short circut tests */ +#else + return _mm512_knot( _mm512_kandn(a, b)); /* this fails some asin test */ +#endif +} +#else /* passes all the tests */ static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { __vec16_i1 r; r.v = (a.v & b.v) | (~a.v & ~b.v); return r; } +#endif static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v & b.v; - return r; + return _mm512_kand(a, b); } static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v ^ b.v; - return r; + return _mm512_kxor(a, b); } static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v | b.v; - return r; + return _mm512_kor(a, b); } +#if 0 +static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { + return _mm512_knot(a); +} + +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { + return _mm512_kandn(a, b); +} + +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { + return _mm512_kandnr(a, b); +} +#else static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) { __vec16_i1 r; r.v = ~v.v; @@ -537,18 +554,19 @@ static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { r.v = a.v & ~b.v; return r; } +#endif static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = (a.v & mask.v) | (b.v & ~mask.v); - return r; + return ((a & mask) | (b & ~mask)); + //return __or(__and(a, mask), __andnr(b, mask)); } static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ? a : b; } + static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? 
true : false; } From 3cf63362a4885056bf72e6daaad7ffc67d7a93dc Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 20:03:08 +0300 Subject: [PATCH 027/159] small tuning --- examples/intrinsics/knc-i1x16.h | 39 +++------------------------------ 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ebffa4d6..b7d3a7f1 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -496,22 +496,9 @@ static FORCEINLINE bool __none(__vec16_i1 mask) { return _mm512_kortestz(mask, mask); } -#if 0 static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { -#if 0 - return _mm512_kand(a,b); /* this fails some short circut tests */ -#else - return _mm512_knot( _mm512_kandn(a, b)); /* this fails some asin test */ -#endif + return _mm512_kxnor(a,b); } -#else /* passes all the tests */ -static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = (a.v & b.v) | (~a.v & ~b.v); - return r; -} -#endif - static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { return _mm512_kand(a, b); } @@ -524,7 +511,6 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { return _mm512_kor(a, b); } -#if 0 static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { return _mm512_knot(a); } @@ -536,30 +522,11 @@ static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandnr(a, b); } -#else -static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) { - __vec16_i1 r; - r.v = ~v.v; - return r; -} - -static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = ~a.v & b.v; - return r; -} - -static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v & ~b.v; - return r; -} -#endif static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { - return ((a & mask) | (b & ~mask)); - //return __or(__and(a, mask), __andnr(b, mask)); +// return ((a & mask) | (b & ~mask)); + return __or(__and(a, mask), __and_not2(b, mask)); } static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { From 406e2eb8d0e9eaac0c1923c8a91837882b8f4610 Mon Sep 17 00:00:00 2001 From: egaburov Date: Thu, 19 Sep 2013 09:16:37 +0200 Subject: [PATCH 028/159] fix double precision input to support .123d321 type of input --- lex.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lex.ll b/lex.ll index f1dcaa6f..3655220f 100644 --- a/lex.ll +++ b/lex.ll @@ -345,7 +345,7 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) 
-FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)) +FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)|(\.[0-9]*[dD][-+]?[0-9]+)) From 00cd90c6b0c31a0d709c368db8b0dc42501577cc Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 17 Sep 2013 17:30:34 +0400 Subject: [PATCH 029/159] test system --- alloy.py | 600 ++++++++++++++++++ check_env.py | 102 +++ common.py | 120 ++++ examples/noise/Makefile | 2 +- examples/perf.py | 374 ----------- fail_db.txt | 1 + ...ER.patch => 3_3_r183327-AVX2-GATHER.patch} | 0 ...hift.patch => 3_3_r184575-x86-shift.patch} | 0 examples/perf.ini => perf.ini | 24 +- perf.py | 489 ++++++++++++++ run_tests.py | 570 +++++++++++------ 11 files changed, 1711 insertions(+), 571 deletions(-) create mode 100755 alloy.py create mode 100755 check_env.py create mode 100644 common.py delete mode 100755 examples/perf.py create mode 100644 fail_db.txt rename llvm_patches/{r183327-AVX2-GATHER.patch => 3_3_r183327-AVX2-GATHER.patch} (100%) rename llvm_patches/{r184575-x86-shift.patch => 3_3_r184575-x86-shift.patch} (100%) rename examples/perf.ini => perf.ini (84%) create mode 100755 perf.py diff --git a/alloy.py b/alloy.py new file mode 100755 index 00000000..67f534ca --- /dev/null +++ b/alloy.py @@ -0,0 +1,600 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
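+# Typical invocations (illustrative; see the option parser at the bottom
+# of this file):
+#   alloy.py -b --version=3.3           # build LLVM 3.3 under $LLVM_HOME
+#   alloy.py -r --only="stability -O2"  # run the stability validation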
+
+# // Author: Filippov Ilia
+
+def attach_mail_file(msg, filename, name):
+    if os.path.exists(filename):
+        fp = open(filename, "rb")
+        to_attach = MIMEBase("application", "octet-stream")
+        to_attach.set_payload(fp.read())
+        encode_base64(to_attach)
+        to_attach.add_header("Content-Disposition", "attachment", filename=name)
+        fp.close()
+        msg.attach(to_attach)
+
+def setting_paths(llvm, ispc, sde):
+    if llvm != "":
+        os.environ["LLVM_HOME"]=llvm
+    if ispc != "":
+        os.environ["ISPC_HOME"]=ispc
+    if sde != "":
+        os.environ["SDE_HOME"]=sde
+
+def check_LLVM(which_LLVM):
+    answer = []
+    if which_LLVM[0] == " ":
+        return answer
+    p = os.environ["LLVM_HOME"]
+    for i in range(0,len(which_LLVM)):
+        if not os.path.exists(p + os.sep + "bin-" + which_LLVM[i] + os.sep + "bin"):
+            answer.append(which_LLVM[i])
+    return answer
+
+def try_do_LLVM(text, command, from_validation):
+    if from_validation == True:
+        text = text + "\n"
+    print_debug("Trying to " + text, from_validation, alloy_build)
+    if os.system(command + " >> " + alloy_build + " 2>> " + alloy_build) != 0:
+        print_debug("ERROR.\n", from_validation, alloy_build)
+        error("can't " + text, 1)
+    print_debug("DONE.\n", from_validation, alloy_build)
+
+def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force):
+    print_debug("Building LLVM. Version: " + version_LLVM + ". ", from_validation, alloy_build)
+    if revision != "":
+        print_debug("Revision: " + revision + ".\n", from_validation, alloy_build)
+    else:
+        print_debug("\n", from_validation, alloy_build)
+    # Figure out what we want to build and where to put it
+    current_path = os.getcwd()
+    llvm_home = os.environ["LLVM_HOME"]
+    os.chdir(llvm_home)
+    FOLDER_NAME=version_LLVM
+    if version_LLVM == "head":
+        SVN_PATH="trunk"
+    if version_LLVM == "3.3":
+        SVN_PATH="tags/RELEASE_33/final"
+        version_LLVM = "3_3"
+    if version_LLVM == "3.2":
+        SVN_PATH="tags/RELEASE_32/final"
+        version_LLVM = "3_2"
+    if version_LLVM == "3.1":
+        SVN_PATH="tags/RELEASE_31/final"
+        version_LLVM = "3_1"
+    if revision != "":
+        FOLDER_NAME = FOLDER_NAME + "_" + revision
+        revision = "-" + revision
+    if folder == "":
+        folder = FOLDER_NAME
+    LLVM_SRC="llvm-" + folder
+    LLVM_BUILD="build-" + folder
+    LLVM_BIN="bin-" + folder
+    if os.path.exists(LLVM_BIN) and not force:
+        print_debug("You have folder " + LLVM_BIN + ". 
If you want to rebuild use --force\n", False, "")
+        exit(0)
+    LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp"
+    LLVM_BIN_selfbuild = LLVM_BIN + "_temp"
+    common.remove_if_exists(LLVM_SRC)
+    common.remove_if_exists(LLVM_BUILD)
+    common.remove_if_exists(LLVM_BIN)
+    if selfbuild:
+        common.remove_if_exists(LLVM_BUILD_selfbuild)
+        common.remove_if_exists(LLVM_BIN_selfbuild)
+    MAKE = "gmake"
+    print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " +
+        llvm_home + "\n", from_validation, alloy_build)
+    # load llvm
+    if tarball == "":
+        try_do_LLVM("load LLVM from http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " ",
+                    "svn co " + revision + " http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " " + LLVM_SRC,
+                    from_validation)
+        os.chdir(LLVM_SRC + "/tools")
+        try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ",
+                    "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang",
+                    from_validation)
+        os.chdir("../")
+    else:
+        tar = tarball.split(" ")
+        os.makedirs(LLVM_SRC)
+        os.chdir(LLVM_SRC)
+        try_do_LLVM("untar LLVM from " + tar[0] + " ",
+                    "tar -xvzf " + tar[0] + " --strip-components 1", from_validation)
+        os.chdir("./tools")
+        os.makedirs("clang")
+        os.chdir("./clang")
+        try_do_LLVM("untar clang from " + tar[1] + " ",
+                    "tar -xvzf " + tar[1] + " --strip-components 1", from_validation)
+        os.chdir("../../")
+    # patching llvm
+    patches = glob.glob(os.environ["ISPC_HOME"] + "/llvm_patches/*.*")
+    for patch in patches:
+        if version_LLVM in os.path.basename(patch):
+            try_do_LLVM("patch LLVM with patch " + patch + " ", "patch -p0 < " + patch, from_validation)
+    os.chdir("../")
+    # configuring llvm, build first part of selfbuild
+    os.makedirs(LLVM_BUILD)
+    os.makedirs(LLVM_BIN)
+    selfbuild_compiler = ""
+    if selfbuild:
+        print_debug("Making a selfbuild using folders " + LLVM_BUILD_selfbuild + " and " +
+            LLVM_BIN_selfbuild + "\n", from_validation, alloy_build)
+        os.makedirs(LLVM_BUILD_selfbuild)
+        os.makedirs(LLVM_BIN_selfbuild)
+        os.chdir(LLVM_BUILD_selfbuild)
+        try_do_LLVM("configure release version for selfbuild ",
+                    "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" +
+                    LLVM_BIN_selfbuild + " --enable-optimized",
+                    from_validation)
+        try_do_LLVM("build release version for selfbuild ",
+                    MAKE + " -j32", from_validation)
+        try_do_LLVM("install release version for selfbuild ",
+                    MAKE + " install",
+                    from_validation)
+        os.chdir("../")
+        selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang"
+        print_debug("Now we have a compiler for the selfbuild: " + selfbuild_compiler + "\n", from_validation, alloy_build)
+    os.chdir(LLVM_BUILD)
+    if debug == False:
+        try_do_LLVM("configure release version ",
+                    "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" +
+                    LLVM_BIN + " --enable-optimized" + selfbuild_compiler,
+                    from_validation)
+    else:
+        try_do_LLVM("configure debug version ",
+                    "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN +
+                    " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler,
+                    from_validation)
+    # building llvm
+    try_do_LLVM("build LLVM ", MAKE + " -j32", from_validation)
+    try_do_LLVM("install LLVM ", MAKE + " install", from_validation)
+    os.chdir(current_path)
+
+def check_targets():
+    answer = []
+    answer_sde = []
+    SSE2 = False;
+    SSE4 = False;
+    AVX = False;
+    AVX11 = False;
+    AVX2 = False;
+    cpu = open("/proc/cpuinfo")
+    f_lines = cpu.readlines()
+    cpu.close()
+    # check which native targets we have
+    for i in 
range(0,len(f_lines)):
+        if SSE2 == False and "sse2" in f_lines[i]:
+            SSE2 = True;
+            answer = answer + ["sse2-i32x4", "sse2-i32x8"]
+        if SSE4 == False and "sse4_1" in f_lines[i]:
+            SSE4 = True;
+            answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"]
+        if AVX == False and "avx" in f_lines[i]:
+            AVX = True;
+            answer = answer + ["avx1-i32x8", "avx1-i32x16"]
+        if AVX11 == False and "rdrand" in f_lines[i]:
+            AVX11 = True;
+            answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"]
+        if AVX2 == False and "avx2" in f_lines[i]:
+            AVX2 = True;
+            answer = answer + ["avx2-i32x8", "avx2-i32x16"]
+    answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"]
+    # now check what targets we have with the help of SDE
+    sde_exists = ""
+    PATH_dir = string.split(os.getenv("PATH"), os.pathsep)
+    for counter in PATH_dir:
+        if os.path.exists(counter + os.sep + "sde") and sde_exists == "":
+            sde_exists = counter + os.sep + "sde"
+    if os.environ.get("SDE_HOME") != None:
+        if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"):
+            sde_exists = os.environ.get("SDE_HOME") + os.sep + "sde"
+    if sde_exists == "":
+        error("you don't have sde in either SDE_HOME or your PATH.\n" +
+              "To test all platforms please set SDE_HOME to the path containing SDE.\n" +
+              "Please refer to http://www.intel.com/software/sde for SDE download information.", 2)
+        return [answer, answer_sde]
+    # here we have SDE
+    os.system(sde_exists + " -help > " + temp_alloy_file)
+    cpu = open(temp_alloy_file)
+    f_lines = cpu.readlines()
+    cpu.close()
+    for i in range(0,len(f_lines)):
+        if SSE4 == False and "wsm" in f_lines[i]:
+            answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]]
+        if AVX == False and "snb" in f_lines[i]:
+            answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"]]
+        if AVX11 == False and "ivb" in f_lines[i]:
+            answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]]
+        if AVX2 == False and "hsw" in f_lines[i]:
+            answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]]
+    return [answer, answer_sde]
+
+def build_ispc(version_LLVM):
+    current_path = os.getcwd()
+    os.chdir(os.environ["ISPC_HOME"])
+    p_temp = os.getenv("PATH")
+    os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"]
+    os.system("make clean >> " + alloy_build)
+    try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", "make -j32", True)
+    os.environ["PATH"] = p_temp
+    os.chdir(current_path)
+
+def execute_stability(stability, R, print_version):
+    stability1 = copy.deepcopy(stability)
+    temp = run_tests.run_tests(stability1, [], print_version)
+    for j in range(0,4):
+        R[j][0] = R[j][0] + temp[j]
+        for i in range(0,len(temp[j])):
+            R[j][1].append(temp[4])
+    number_of_fails = temp[5]
+    number_of_new_fails = len(temp[0]) + len(temp[1])
+    if number_of_fails == 0:
+        str_fails = ". No fails"
+    else:
+        str_fails = ". 
Fails: " + str(number_of_fails) + if number_of_new_fails == 0: + str_new_fails = ", No new fails.\n" + else: + str_new_fails = ", New fails: " + str(number_of_new_fails) + ".\n" + print_debug(temp[4][1:-3] + str_fails + str_new_fails, False, stability_log) + +def run_special_tests(): + i = 5 + +def validation_run(only, only_targets, reference_branch, notify, update): + current_path = os.getcwd() + os.chdir(os.environ["ISPC_HOME"]) + os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] + if options.notify != "": + if os.environ.get("SMTP_ISPC") == None: + error("you have no SMTP_ISPC in your environment for option notify", 1) + common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt") + smtp_server = os.environ["SMTP_ISPC"] + msg = MIMEMultipart() + msg['Subject'] = 'ISPC test system results' + msg['From'] = 'ISPC_test_system' + msg['To'] = options.notify + print_debug("Command: " + ' '.join(sys.argv) + "\n", False, "") + print_debug("Folder: " + os.environ["ISPC_HOME"] + "\n", False, "") + date = datetime.datetime.now() + print_debug("Date: " + date.strftime('%H:%M %d/%m/%Y') + "\n", False, "") + class options_for_drivers: + pass +# *** *** *** +# Stability validation run +# *** *** *** + if ((("stability" in only) == True) or ("performance" in only) == False): + print_debug("\n\nStability validation run\n\n", False, "") + stability = options_for_drivers() +# stability constant options + stability.random = False + stability.ispc_flags = "" + stability.compiler_exe = None + stability.num_jobs = 1024 + stability.verbose = False + stability.time = False + stability.non_interactive = True + stability.update = update + stability.include_file = None + stability.silent = True + stability.in_file = "." + os.sep + f_date + os.sep + "run_tests_log.log" + stability.verify = False +# stability varying options + stability.target = "" + stability.arch = "" + stability.no_opt = False + stability.wrapexe = "" +# prepare parameters of run + common.check_tools(1) + [targets_t, sde_targets_t] = check_targets() + rebuild = True + opts = [] + archs = [] + LLVM = [] + targets = [] + sde_targets = [] +# parsing option only, update parameters of run + if "-O2" in only: + opts.append(False) + if "-O0" in only: + opts.append(True) + if "x86" in only and not ("x86-64" in only): + archs.append("x86") + if "x86-64" in only: + archs.append("x86-64") + if "native" in only: + sde_targets_t = [] + for i in ["3.1", "3.2", "3.3", "head"]: + if i in only: + LLVM.append(i) + if "current" in only: + LLVM = [" "] + rebuild = False + if only_targets != "": + only_targets_t = only_targets.split(" ") + for i in only_targets_t: + err = True + for j in range(0,len(targets_t)): + if i in targets_t[j]: + targets.append(targets_t[j]) + err = False + for j in range(0,len(sde_targets_t)): + if i in sde_targets_t[j][1]: + sde_targets.append(sde_targets_t[j]) + err = False + if err == True: + error("You haven't sde for target " + i, 1) + else: + targets = targets_t[:-4] + sde_targets = sde_targets_t + if "build" in only: + targets = [] + sde_targets = [] + only = only + " stability " +# finish parameters of run, prepare LLVM + if len(opts) == 0: + opts = [False] + if len(archs) == 0: + archs = ["x86", "x86-64"] + if len(LLVM) == 0: + LLVM = ["3.1", "3.2", "3.3", "head"] + gen_archs = ["x86-64"] + need_LLVM = check_LLVM(LLVM) + for i in range(0,len(need_LLVM)): + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) +# begin validation run for stabitily + 
+        common.remove_if_exists(stability.in_file)
+        R = [[[],[]],[[],[]],[[],[]],[[],[]]]
+        print_debug("\n_________________________STABILITY REPORT_________________________\n", False, stability_log)
+        for i in range(0,len(LLVM)):
+            print_version = 2
+            if rebuild:
+                build_ispc(LLVM[i])
+            for j in range(0,len(targets)):
+                stability.target = targets[j]
+                stability.wrapexe = ""
+                if "generic" in targets[j]:
+                    arch = gen_archs
+                else:
+                    arch = archs
+                for i1 in range(0,len(arch)):
+                    for i2 in range(0,len(opts)):
+                        stability.arch = arch[i1]
+                        stability.no_opt = opts[i2]
+                        execute_stability(stability, R, print_version)
+                        print_version = 0
+            for j in range(0,len(sde_targets)):
+                stability.target = sde_targets[j][1]
+                stability.wrapexe = os.environ["SDE_HOME"] + "/sde " + sde_targets[j][0] + " -- "
+                for i1 in range(0,len(archs)):
+                    for i2 in range(0,len(opts)):
+                        stability.arch = archs[i1]
+                        stability.no_opt = opts[i2]
+                        execute_stability(stability, R, print_version)
+                        print_version = 0
+# run special tests like embree
+#
+        run_special_tests()
+        ttt = ["NEW RUNFAILS: ", "NEW COMPFAILS: ", "NEW PASSES RUNFAILS: ", "NEW PASSES COMPFAILS: "]
+        for j in range(0,4):
+            if len(R[j][0]) == 0:
+                print_debug("NO " + ttt[j][:-2] + "\n", False, stability_log)
+            else:
+                print_debug(ttt[j] + str(len(R[j][0])) + "\n", False, stability_log)
+                temp5 = [[],[]]
+                for i in range(0,len(R[j][0])):
+                    er = True
+                    for k in range(0,len(temp5[0])):
+                        if R[j][0][i] == temp5[0][k]:
+                            temp5[1][k].append(R[j][1][i])
+                            er = False
+                    if er == True:
+                        temp5[0].append(R[j][0][i])
+                        temp5[1].append([R[j][1][i]])
+                for i in range(0,len(temp5[0])):
+                    print_debug("\t" + temp5[0][i] + "\n", True, stability_log)
+                    for k in range(0,len(temp5[1][i])):
+                        print_debug("\t\t\t" + temp5[1][i][k], True, stability_log)
+        print_debug("__________________Watch stability.log for details_________________\n", False, stability_log)
+        if options.notify != "":
+            attach_mail_file(msg, stability.in_file, "run_tests_log.log")
+            attach_mail_file(msg, stability_log, "stability.log")
+
+# *** *** ***
+# Performance validation run
+# *** *** ***
+    if ((("performance" in only) == True) or ("stability" in only) == False):
+        print_debug("\n\nPerformance validation run\n\n", False, "")
+        performance = options_for_drivers()
+# performance constant options
+        performance.number = 5
+        performance.config = "./perf.ini"
+        performance.path = "./"
+        performance.silent = True
+        performance.output = ""
+        performance.compiler = ""
+        performance.ref = "ispc_ref"
+        performance.in_file = "." + os.sep + f_date + os.sep + "performance.log"
+# prepare LLVM 3.3 as the newest LLVM
+        need_LLVM = check_LLVM(["3.3"])
+        if len(need_LLVM) != 0:
+            build_LLVM(need_LLVM[0], "", "", "", False, False, True, False)
+# prepare the reference point: build both the test and the reference compiler
+        os.system("git branch > " + temp_alloy_file)
+        br = open(temp_alloy_file)
+        temp4 = br.readlines()
+        br.close()
+        for line in temp4:
+            if "*" in line:
+                current_branch = line[2:-1]
+        stashing = True
+        sys.stdout.write("Please don't interrupt the script here! 
Interrupting now can leave your git status out of sync!\n")
+        if "No local changes" in detect_version("git stash"):
+            stashing = False
+        #try_do_LLVM("stash current branch ", "git stash", True)
+        try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True)
+        sys.stdout.write(".\n")
+        build_ispc("3.3")
+        sys.stdout.write(".\n")
+        os.rename("ispc", "ispc_ref")
+        try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True)
+        if stashing:
+            try_do_LLVM("return current branch ", "git stash pop", True)
+        sys.stdout.write("You can interrupt the script now.\n")
+        build_ispc("3.3")
+# begin validation run for performance; the output is handled inside perf()
+        perf.perf(performance, [])
+        if options.notify != "":
+            attach_mail_file(msg, performance.in_file, "performance.log")
+            attach_mail_file(msg, "." + os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log")
+
+    print_debug("Logs are in alloy_results_[date]", False, "")
+
+# sending e-mail with results
+    if options.notify != "":
+        fp = open(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", 'rb')
+        f_lines = fp.readlines()
+        fp.close()
+        line = ""
+        for i in range(0,len(f_lines)):
+            line = line + f_lines[i][:-1]
+            line = line + ' \n'
+        text = MIMEText(line, "", "KOI-8")
+        msg.attach(text)
+        attach_mail_file(msg, alloy_build, "alloy_build.log")
+        s = smtplib.SMTP(smtp_server)
+        s.sendmail('ISPC_test_system', options.notify, msg.as_string())
+        s.quit()
+# exit of validation routine
+    common.remove_if_exists(temp_alloy_file)
+    os.chdir(current_path)
+
+def Main():
+    if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True:
+        error("Windows isn't supported now", 1)
+    if (options.build_llvm == False and
+        options.validation_run == False and
+        options.llvm_home == "" and
+        options.ispc_home == "" and
+        options.sde_home == ""):
+        parser.print_help()
+        exit(0)
+    global f_date
+    f_date = "logs"
+    common.remove_if_exists(f_date)
+    os.makedirs(f_date)
+    global temp_alloy_file
+    temp_alloy_file = os.getcwd() + os.sep + f_date + os.sep + "temp_detect_version"
+    global alloy_build
+    alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log"
+    common.remove_if_exists(alloy_build)
+    global stability_log
+    stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log"
+    common.remove_if_exists(stability_log)
+    setting_paths(options.llvm_home, options.ispc_home, options.sde_home)
+    if os.environ.get("LLVM_HOME") == None:
+        error("you have no LLVM_HOME", 1)
+    if os.environ.get("ISPC_HOME") == None:
+        error("you have no ISPC_HOME", 1)
+    if options.build_llvm:
+        build_LLVM(options.version, options.revision, options.folder, options.tarball,
+                   options.debug, options.selfbuild, False, options.force)
+    if options.validation_run:
+        validation_run(options.only, options.only_targets, options.branch, options.notify, options.update)
+    os.rename(f_date, "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y'))
+
+###Main###
+from optparse import OptionParser
+import sys
+import os
+import operator
+import time
+import glob
+import string
+import platform
+import smtplib
+import datetime
+import copy
+from email.MIMEMultipart import MIMEMultipart
+from email.MIMEBase import MIMEBase
+from email.mime.text import MIMEText
+from email.Encoders import encode_base64
+# our drivers
+import run_tests
+import perf
+import common
+error = common.error
+detect_version = common.detect_version
+print_debug = common.print_debug
+# parsing options
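+# The options below fall into three groups: building LLVM, pointing the
+# scripts at the LLVM/ISPC/SDE installs, and driving a validation run.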
+parser = OptionParser()
+# options for activity "build LLVM"
+parser.add_option('-b', '--build-llvm', dest='build_llvm',
+                  help='ask to build LLVM', default=False, action="store_true")
+parser.add_option('--version', dest='version',
+                  help='version of llvm to build', default="head")
+parser.add_option('--revision', dest='revision',
+                  help='revision of llvm to build', default="")
+parser.add_option('--debug', dest='debug',
+                  help='debug build of LLVM?', default=False, action="store_true")
+parser.add_option('--folder', dest='folder',
+                  help='folder to build LLVM in', default="")
+parser.add_option('--tarball', dest='tarball',
+                  help='"llvm_tarball clang_tarball"', default="")
+parser.add_option('--selfbuild', dest='selfbuild',
+                  help='make selfbuild of LLVM and clang', default=False, action="store_true")
+parser.add_option('--force', dest='force',
+                  help='rebuild LLVM', default=False, action='store_true')
+# options for activity "setup PATHS"
+parser.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="")
+parser.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="")
+parser.add_option('--sde_home', dest='sde_home',help='path to SDE',default="")
+# options for activity "validation run"
+parser.add_option('-r', '--run', dest='validation_run',
+                  help='ask for validation run', default=False, action="store_true")
+parser.add_option('--compare-with', dest='branch',
+                  help='set performance reference point', default="master")
+parser.add_option('--only-targets', dest='only_targets',
+                  help='set list of targets to test. Possible values - all subnames of targets.\n' +
+                  'Example: --only-targets="avx2-i32x8 sse4 i32x16 sse2"', default="")
+parser.add_option('--notify', dest='notify',
+                  help='send results to email', default="")
+parser.add_option('--only', dest='only',
+                  help='set types of tests. Possible values:\n' +
+                  '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' +
+                  'build (only build with different LLVM), 3.1, 3.2, 3.3, head, native (do not use SDE), current (do not rebuild ISPC).\n' +
+                  'Example: --only="3.2 -O0 stability 3.3"', default="")
+parser.add_option('--update-errors', dest='update',
+                  help='rewrite fail_db.txt file according to received results (F or FP)', default="")
+(options, args) = parser.parse_args()
+Main()
diff --git a/check_env.py b/check_env.py
new file mode 100755
index 00000000..98deb235
--- /dev/null
+++ b/check_env.py
@@ -0,0 +1,102 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#   * Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#
+#   * Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#
+#   * Neither the name of Intel Corporation nor the names of its
+#     contributors may be used to endorse or promote products derived from
+#     this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER
+# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# // Author: Filippov Ilia
+
+import common
+import sys
+import os
+import string
+print_debug = common.print_debug
+error = common.error
+detect_version = common.detect_version
+
+exists = [False, False, False, False, False, False, False, False]
+names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"]
+
+PATH_dir = string.split(os.getenv("PATH"), os.pathsep)
+for counter in PATH_dir:
+    for i in range(0,8):
+        if os.path.exists(counter + os.sep + names[i]):
+            exists[i] = True
+
+print_debug("=== in PATH: ===\n", False, "")
+print_debug("Tools:\n", False, "")
+for i in range(0,3):
+    if exists[i]:
+        print_debug(detect_version(names[i] + " --version"), False, "")
+    else:
+        error("you don't have " + names[i], 0)
+if exists[0] and exists[1] and exists[2]:
+    if common.check_tools(2):
+        print_debug("versions are ok\n", False, "")
+print_debug("\nSDE:\n", False, "")
+if exists[3]:
+    print_debug(detect_version(names[3] + " --version"), False, "")
+else:
+    error("you don't have " + names[3], 2)
+print_debug("\nISPC:\n", False, "")
+if exists[4]:
+    print_debug(detect_version(names[4] + " --version"), False, "")
+else:
+    error("you don't have " + names[4], 2)
+print_debug("\nC/C++ compilers:\n", False, "")
+for i in range(5,8):
+    if exists[i]:
+        print_debug(detect_version(names[i] + " --version"), False, "")
+    else:
+        error("you don't have " + names[i], 2)
+
+print_debug("\n=== in ISPC specific environment variables: ===\n", False, "")
+if os.environ.get("LLVM_HOME") == None:
+    error("you have no LLVM_HOME", 2)
+else:
+    print_debug("Your LLVM_HOME:" + os.environ.get("LLVM_HOME") + "\n", False, "")
+if os.environ.get("ISPC_HOME") == None:
+    error("you have no ISPC_HOME", 2)
+else:
+    print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "")
+    if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"):
+        print_debug("You have ISPC in your ISPC_HOME: " +
+            detect_version(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version"), False, "")
+    else:
+        error("you don't have ISPC in your ISPC_HOME", 2)
+if os.environ.get("SDE_HOME") == None:
+    error("You have no SDE_HOME", 2)
+else:
+    print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "")
+    if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"):
+        print_debug("You have sde in your SDE_HOME: " +
+            detect_version(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version"), False, "")
+    else:
+        error("you don't have any SDE in your SDE_HOME", 2)
diff --git a/common.py b/common.py
new file mode 100644
index 00000000..dd8fb388
--- /dev/null
+++ b/common.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#   * Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer. 
diff --git a/common.py b/common.py
new file mode 100644
index 00000000..dd8fb388
--- /dev/null
+++ b/common.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#   * Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#
+#   * Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#
+#   * Neither the name of Intel Corporation nor the names of its
+#     contributors may be used to endorse or promote products derived from
+#     this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# // Author: Filippov Ilia
+import sys
+import os
+import shutil
+
+def write_to_file(filename, line):
+    f = open(filename, 'a')
+    f.writelines(line)
+    f.close()
+
+# remove a file or directory if it exists
+def remove_if_exists(filename):
+    if os.path.exists(filename):
+        if os.path.isdir(filename):
+            shutil.rmtree(filename)
+        else:
+            os.remove(filename)
+
+# detect the version string which a command prints
+def detect_version(command):
+    os.system(command + " > " + "temp_detect_version")
+    version = open("temp_detect_version")
+    answer = version.readline()
+    version.close()
+    remove_if_exists("temp_detect_version")
+    return answer
+
+# print versions of compilers
+def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows):
+    print_debug("\nUsing test compiler: " + detect_version(ispc_test + " --version"), s, perf_log)
+    if ispc_ref != "":
+        print_debug("Using ref compiler: " + detect_version(ispc_ref + " --version"), s, perf_log)
+    if is_windows == False:
+        temp1 = detect_version(ref_compiler + " --version")
+    else:
+        os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" )
+        version = open("temp_detect_version")
+        temp1 = version.readline()
+        version.close()
+        remove_if_exists("temp_detect_version")
+        remove_if_exists("temp_detect_version1")
+    print_debug("Using C/C++ compiler: " + temp1 + "\n", s, perf_log)
+
+# print regular (non-error) output from the scripts
+def print_debug(line, silent, filename):
+    if silent == False:
+        sys.stdout.write(line)
+        sys.stdout.flush()
+    if os.environ.get("ISPC_HOME") != None:
+        write_to_file(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", line)
+    if filename != "":
+        write_to_file(filename, line)
+
+# print errors from scripts
+# type 1 for a fatal error in the environment
+# type 2 for a warning
+# type 0 for an error of a compiler or test which isn't the goal of the script
+def error(line, error_type):
+    line = line + "\n"
+    if error_type == 1:
+        sys.stderr.write("Fatal error: " + line)
+        sys.exit(1)
+    if error_type == 2:
+        sys.stderr.write("Warning: " + line)
+    if error_type == 0:
+        print_debug("FIND ERROR: " + line, False, "")
+
+def check_tools(m):
+    input_tools=[[[1,4], "m4 --version", "bad m4 version"],
+                 [[2,4], "bison --version", "bad bison version"],
+                 [[2,5], "flex --version", "bad flex version"]]
+
+    for t in range(0,len(input_tools)):
+        t1 = 
((detect_version(input_tools[t][1]))[:-1].split(" ")) + for i in range(0,len(t1)): + t11 = t1[i].split(".") + f = True + for j in range(0,len(t11)): + if not t11[j].isdigit(): + f = False + if f == True: + for j in range(0,len(t11)): + if j < len(input_tools[t][0]): + if int(t11[j])> "+build_log) - return os.system("make CXX="+ref_compiler+" CC="+refc_compiler+" >> "+build_log+" 2>> "+build_log) - else: - os.system("msbuild /t:clean >> " + build_log) - return os.system("msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log) - -def execute_test(command): - global perf_temp - r = 0 - if os.path.exists(perf_temp): - os.remove(perf_temp) - for k in range(int(options.number)): - r = r + os.system(command) - return r - -#gathers all tests results and made an item test from answer structure -def run_test(command, c1, c2, test, b_serial): - global perf_temp - if build_test() != 0: - sys.stdout.write("ERROR: Compilation fails\n") - return - if execute_test(command) != 0: - sys.stdout.write("ERROR: Execution fails\n") - return - tasks = [] #list of results with tasks, it will be test[2] - ispc = [] #list of results without tasks, it will be test[1] - absolute_tasks = [] #list of absolute results with tasks, it will be test[4] - absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] - serial = [] #list serial times, it will be test[5] - j = 1 - for line in open(perf_temp): # we take test output - if "speedup" in line: # we are interested only in lines with speedup - if j == c1: # we are interested only in lines with c1 numbers - line = line.expandtabs(0) - line = line.replace("("," ") - line = line.split(",") - for i in range(len(line)): - subline = line[i].split(" ") - number = float(subline[1][:-1]) - if "speedup from ISPC + tasks" in line[i]: - tasks.append(number) - else: - ispc.append(number) - c1 = c1 + c2 - j+=1 - if "million cycles" in line: - if j == c1: - line = line.replace("]","[") - line = line.split("[") - number = float(line[3]) - if "tasks" in line[1]: - absolute_tasks.append(number) - else: - if "ispc" in line[1]: - absolute_ispc.append(number) - if "serial" in line[1]: - serial.append(number) - - if len(ispc) != 0: - if len(tasks) != 0: - print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % - (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i])) - else: - print_debug("ISPC speedup / ISPC time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i])) - else: - if len(tasks) != 0: - print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i])) - - test[1] = test[1] + ispc - test[2] = test[2] + tasks - test[3] = test[3] + absolute_ispc - test[4] = test[4] + absolute_tasks - if b_serial == True: - #if we concatenate outputs we should use only the first serial answer. 
- test[5] = test[5] + serial - -def cpu_get(): - p = open("/proc/stat", 'r') - cpu = p.readline() - p.close() - cpu = cpu.split(" ") - cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) - cpu_all = cpu_usage + int(cpu[5]) - return [cpu_usage, cpu_all] - -#returns cpu_usage -def cpu_check(): - if is_windows == False: - if is_mac == False: - cpu1 = cpu_get() - time.sleep(1) - cpu2 = cpu_get() - cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 - else: - os.system("sysctl -n vm.loadavg > cpu_temp") - c = open("cpu_temp", 'r') - c_line = c.readline() - c.close - os.remove("cpu_temp") - R = c_line.split(' ') - cpu_percent = float(R[1]) * 3 - else: - os.system("wmic cpu get loadpercentage /value > cpu_temp") - c = open("cpu_temp", 'r') - c_lines = c.readlines() - c.close() - os.remove("cpu_temp") - t = "0" - for i in c_lines[2]: - if i.isdigit(): - t = t + i - cpu_percent = int(t) - return cpu_percent - -#returns geomean of list -def geomean(par): - temp = 1 - l = len(par) - for i in range(l): - temp = temp * par[i] - temp = temp ** (1.0/l) - return round(temp, 2) - -#takes an answer struct and print it. -#answer struct: list answer contains lists test -#test[0] - name of test -#test[1] - list of results without tasks -#test[2] - list of results with tasks -#test[3] - list of absolute results without tasks -#test[4] - list of absolute results with tasks -#test[5] - list of absolute time without ISPC (serial) -#test[1..4] may be empty -def print_answer(answer): - filelist = [] - print_debug("--------------------------------------------------------------------------\n") - print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + - "ISPC time: ISPC + tasks time: serial:\n") - filelist.append("test name,ISPC speedup,diff," + - "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") - max_t = [0,0,0,0,0] - diff_t = [0,0,0,0,0] - geomean_t = [0,0,0,0,0] - list_of_max = [[],[],[],[],[]] - for i in range(len(answer)): - for t in range(1,6): - if len(answer[i][t]) == 0: - max_t[t-1] = "n/a" - diff_t[t-1] = "n/a" - else: - if t < 3: - mm = max(answer[i][t]) - else: - mm = min(answer[i][t]) - max_t[t-1] = '%.2f' % mm - list_of_max[t-1].append(mm) - diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) - print_debug("%s:\n" % answer[i][0]) - print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4])) - print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4])) - for t in range(0,5): - if max_t[t] == "n/a": - max_t[t] = "" - if diff_t[t] == "n/a": - diff_t[t] = "" - filelist.append(answer[i][0] + "," + - max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + - max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + - max_t[4] + "," + diff_t[4] + "\n") - for i in range(0,5): - geomean_t[i] = geomean(list_of_max[i]) - print_debug("---------------------------------------------------------------------------------\n") - print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4])) - filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) - + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") - print_file(filelist) - - -###Main### -# parsing options -parser = OptionParser() -parser.add_option('-n', '--number', dest='number', - help='number of repeats', default="3") 
-parser.add_option('-c', '--config', dest='config', - help='config file of tests', default="./perf.ini") -parser.add_option('-p', '--path', dest='path', - help='path to examples directory', default="./") -parser.add_option('-s', '--silent', dest='silent', - help='silent mode, only table output', default=False, action="store_true") -parser.add_option('-o', '--output', dest='output', - help='output file for script reading', default="") -parser.add_option('--compiler', dest='compiler', - help='reference compiler', default="") -(options, args) = parser.parse_args() - -global is_windows -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) -global is_mac -is_mac = (platform.system() == 'Darwin') - -# save corrent path -pwd = os.getcwd() -pwd = pwd + os.sep -if is_windows: - pwd = "..\\" - -# check if cpu usage is low now -cpu_percent = cpu_check() -if cpu_percent > 20: - sys.stdout.write("Warning: CPU Usage is very high.\n") - sys.stdout.write("Close other applications.\n") - -# check that required compilers exist -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False -ref_compiler_exists = False -if is_windows == False: - compiler = "ispc" - ref_compiler = "g++" - refc_compiler = "gcc" - if options.compiler != "": - if options.compiler == "clang" or options.compiler == "clang++": - ref_compiler = "clang++" - refc_compiler = "clang" - if options.compiler == "icc" or options.compiler == "icpc": - ref_compiler = "icpc" - refc_compiler = "icc" -else: - compiler = "ispc.exe" - ref_compiler = "cl.exe" -for counter in PATH_dir: - if os.path.exists(counter + os.sep + compiler): - compiler_exists = True - if os.path.exists(counter + os.sep + ref_compiler): - ref_compiler_exists = True -if not compiler_exists: - sys.stderr.write("Fatal error: ISPC compiler not found.\n") - sys.stderr.write("Added path to ispc compiler to your PATH variable.\n") - sys.exit() -if not ref_compiler_exists: - sys.stderr.write("Fatal error: reference compiler %s not found.\n" % ref_compiler) - sys.stderr.write("Added path to %s compiler to your PATH variable.\n" % ref_compiler) - sys.exit() - -# checks that config file exists -path_config = os.path.normpath(options.config) -if os.path.exists(path_config) == False: - sys.stderr.write("Fatal error: config file not found: %s.\n" % options.config) - sys.stderr.write("Set path to your config file in --config.\n") - sys.exit() - -# read lines from config file except comments -f = open(path_config, 'r') -f_lines = f.readlines() -f.close() -lines =[] -for i in range(len(f_lines)): - if f_lines[i][0] != "%": - lines.append(f_lines[i]) -length = len(lines) - -# prepare build.log and perf_temp files -global build_log -build_log = pwd + "build.log" -if is_windows == False: - if os.path.exists(build_log): - os.remove(build_log) -else: - if os.path.exists("build.log"): - os.remove("build.log") -global perf_temp -perf_temp = pwd + "perf_temp" - -i = 0 -answer = [] -print_debug("Okey go go go!\n\n") -os.system(compiler + " --version >" + build_log) -version = open(build_log) -print_debug("Using test compiler: " + version.readline()) -version.close() - -if is_windows == False: - os.system(ref_compiler + " --version >" + build_log) -else: - os.system(ref_compiler + " 2>" + build_log + " 1>&2") - -version = open(build_log) -print_debug("Using reference compiler: " + version.readline()) -version.close() - - -# loop for all tests -while i < length-2: - # we read name of test - print_debug("%s" % lines[i]) - test = 
[lines[i][:-1],[],[],[],[],[]] - # read location of test - folder = lines[i+1] - folder = folder[:-1] - folder = os.path.normpath(options.path + os.sep + folder) - # check that test exists - if os.path.exists(folder) == False: - sys.stdout.write("Fatal error: Can't find test %s. Your path is: \"%s\".\n" % (lines[i][:-1], options.path)) - sys.stdout.write("Change current location to /examples or set path to /examples in --path.\n") - exit(0) - os.chdir(folder) - # read parameters of test - command = lines[i+2] - command = command[:-1] - if is_windows == False: - command = "./"+command + " >> " + perf_temp - else: - command = "x64\\Release\\"+command + " >> " + perf_temp - # parsing config parameters - next_line = lines[i+3] - if next_line[0] == "!": # we should take only one part of test output - R = next_line.split(' ') - c1 = int(R[1]) #c1 is a number of string which we want to use in test output - c2 = int(R[2]) #c2 is total number of strings in test output - i = i+1 - else: - c1 = 1 - c2 = 1 - next_line = lines[i+3] - if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(command, c1, c2, answer[len(answer)-1], False) - i = i+1 - else: #we run this test and append it's result to answer structure - run_test(command, c1, c2, test, True) - answer.append(test) - # preparing next loop iteration - os.chdir(pwd) - i+=4 - -# delete temp file -if os.path.exists(perf_temp): - os.remove(perf_temp) -#print collected answer -print_answer(answer) diff --git a/fail_db.txt b/fail_db.txt new file mode 100644 index 00000000..7adc3e41 --- /dev/null +++ b/fail_db.txt @@ -0,0 +1 @@ +% List of known fails diff --git a/llvm_patches/r183327-AVX2-GATHER.patch b/llvm_patches/3_3_r183327-AVX2-GATHER.patch similarity index 100% rename from llvm_patches/r183327-AVX2-GATHER.patch rename to llvm_patches/3_3_r183327-AVX2-GATHER.patch diff --git a/llvm_patches/r184575-x86-shift.patch b/llvm_patches/3_3_r184575-x86-shift.patch similarity index 100% rename from llvm_patches/r184575-x86-shift.patch rename to llvm_patches/3_3_r184575-x86-shift.patch diff --git a/examples/perf.ini b/perf.ini similarity index 84% rename from examples/perf.ini rename to perf.ini index d2a5c73e..d8c7fe71 100755 --- a/examples/perf.ini +++ b/perf.ini @@ -10,44 +10,48 @@ %**************************************************************************************************** AOBench aobench -ao 10 512 512 +10 512 512 #*** Deferred Shading deferred -deferred_shading data/pp1280x720.bin +data/pp1280x720.bin #*** Mandelbrot Set mandelbrot -mandelbrot + #*** Mandelbrot Set mandelbrot_tasks -mandelbrot_tasks + ^ #*** Perlin Noise Function noise -noise + #*** Binomial Options options -options + ! 1 2 #*** Black-Scholes Options options -options + ! 2 2 #*** Ray Tracer rt -rt sponza +sponza #*** 3D Stencil stencil -stencil + #*** Volume Rendering volume_rendering -volume camera.dat density_highres.vol +camera.dat density_highres.vol #*** +%Sort +%sort +% +%#*** diff --git a/perf.py b/perf.py new file mode 100755 index 00000000..d1d7654b --- /dev/null +++ b/perf.py @@ -0,0 +1,489 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+#
+#   * Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#
+#   * Neither the name of Intel Corporation nor the names of its
+#     contributors may be used to endorse or promote products derived from
+#     this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# // Author: Filippov Ilia
+
+def print_file(line):
+    if options.output != "":
+        output = open(options.output, 'w')
+        output.writelines(line)
+        output.close()
+
+def build_test(commands):
+    os.system(commands[4])
+    test = os.system(commands[1])
+    if options.ref:
+        ref = os.system(commands[3])
+    return (options.ref and ref) or test
+
+def execute_test(commands):
+    r = 0
+    common.remove_if_exists(perf_temp+"_test")
+    common.remove_if_exists(perf_temp+"_ref")
+    for k in range(int(options.number)):
+        r = r + os.system(commands[0])
+        if options.ref:
+            r = r + os.system(commands[2])
+    return r
+
+#gathers all test results and makes an item "test" of the answer structure
+def run_test(commands, c1, c2, test, test_ref, b_serial):
+    if build_test(commands) != 0:
+        error("Compilation of test %s failed\n" % test[0], 0)
+        return
+    if execute_test(commands) != 0:
+        error("Execution of test %s failed\n" % test[0], 0)
+        return
+    print_debug("TEST COMPILER:\n", s, perf_log)
+    analyse_test(c1, c2, test, b_serial, perf_temp+"_test")
+    if options.ref:
+        print_debug("REFERENCE COMPILER:\n", s, perf_log)
+        analyse_test(c1, c2, test_ref, b_serial, perf_temp+"_ref")
+
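analyse_test() below recovers two kinds of lines from the captured example output: "speedup" lines, split on commas, and "million cycles" lines, split on brackets. For reference, a sketch of the bracket parsing on a hypothetical output line (the sample text is inferred from the split logic, not quoted from an actual example):

    # Hypothetical example output line of the shape analyse_test() expects:
    line = "[aobench ispc+tasks]:\t[107.96] million cycles"
    line = line.replace("]", "[")
    parts = line.split("[")
    # parts == ['', 'aobench ispc+tasks', ':\t', '107.96', ' million cycles']
    assert "tasks" in parts[1]        # parts[1] names the variant: tasks/ispc/serial
    assert float(parts[3]) == 107.96  # parts[3] carries the cycle count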
+def analyse_test(c1, c2, test, b_serial, perf_temp_n):
+    tasks = [] #list of results with tasks, it will be test[2]
+    ispc = [] #list of results without tasks, it will be test[1]
+    absolute_tasks = [] #list of absolute results with tasks, it will be test[4]
+    absolute_ispc = [] #list of absolute results without tasks, it will be test[3]
+    serial = [] #list of serial times, it will be test[5]
+    j = 1
+    for line in open(perf_temp_n): # we take test output
+        if "speedup" in line: # we are interested only in lines with speedup
+            if j == c1: # we are interested only in lines with c1 numbers
+                line = line.expandtabs(0)
+                line = line.replace("("," ")
+                line = line.split(",")
+                for i in range(len(line)):
+                    subline = line[i].split(" ")
+                    number = float(subline[1][:-1])
+                    if "speedup from ISPC + tasks" in line[i]:
+                        tasks.append(number)
+                    else:
+                        ispc.append(number)
+                c1 = c1 + c2
+            j+=1
+        if "million cycles" in line:
+            if j == c1:
+                line = line.replace("]","[")
+                line = line.split("[")
+                number = float(line[3])
+                if "tasks" in line[1]:
+                    absolute_tasks.append(number)
+                else:
+                    if "ispc" in line[1]:
+                        absolute_ispc.append(number)
+                if "serial" in line[1]:
+                    serial.append(number)
+
+    if len(ispc) != 0:
+        if len(tasks) != 0:
+            print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n", s, perf_log)
+            for i in range(0,len(serial)):
+                print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" %
+                    (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i]), s, perf_log)
+        else:
+            print_debug("ISPC speedup / ISPC time / serial time\n", s, perf_log)
+            for i in range(0,len(serial)):
+                print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i]), s, perf_log)
+    else:
+        if len(tasks) != 0:
+            print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n", s, perf_log)
+            for i in range(0,len(serial)):
+                print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i]), s, perf_log)
+
+    test[1] = test[1] + ispc
+    test[2] = test[2] + tasks
+    test[3] = test[3] + absolute_ispc
+    test[4] = test[4] + absolute_tasks
+    if b_serial == True:
+        #if we concatenate outputs we should use only the first serial answer.
+        test[5] = test[5] + serial
+
+def cpu_get():
+    p = open("/proc/stat", 'r')
+    cpu = p.readline()
+    p.close()
+    cpu = cpu.split(" ")
+    cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4]))
+    cpu_all = cpu_usage + int(cpu[5])
+    return [cpu_usage, cpu_all]
+
+#returns cpu usage in percent
+def cpu_check():
+    if is_windows == False:
+        if is_mac == False:
+            cpu1 = cpu_get()
+            time.sleep(1)
+            cpu2 = cpu_get()
+            cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100
+        else:
+            os.system("sysctl -n vm.loadavg > cpu_temp")
+            c = open("cpu_temp", 'r')
+            c_line = c.readline()
+            c.close()
+            os.remove("cpu_temp")
+            R = c_line.split(' ')
+            cpu_percent = float(R[1]) * 3
+    else:
+        os.system("wmic cpu get loadpercentage /value > cpu_temp")
+        c = open("cpu_temp", 'r')
+        c_lines = c.readlines()
+        c.close()
+        os.remove("cpu_temp")
+        t = "0"
+        for i in c_lines[2]:
+            if i.isdigit():
+                t = t + i
+        cpu_percent = int(t)
+    return cpu_percent
+
+#returns the geometric mean of a list
+def geomean(par):
+    temp = 1
+    l = len(par)
+    for i in range(l):
+        temp = temp * par[i]
+    temp = temp ** (1.0/l)
+    return round(temp, 2)
+
+#takes an answer struct and prints it.
+#answer struct: list answer contains lists test +#test[0] - name of test +#test[1] - list of results without tasks +#test[2] - list of results with tasks +#test[3] - list of absolute results without tasks +#test[4] - list of absolute results with tasks +#test[5] - list of absolute time without ISPC (serial) +#test[1..4] may be empty +def print_answer(answer): + filelist = [] + print_debug("--------------------------------------------------------------------------\n", s, perf_log) + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + + "ISPC time: ISPC + tasks time: serial:\n", s, perf_log) + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + max_t = [0,0,0,0,0] + diff_t = [0,0,0,0,0] + geomean_t = [0,0,0,0,0] + list_of_max = [[],[],[],[],[]] + list_of_compare = [[],[],[],[],[],[]] + for i in range(len(answer)): + list_of_compare[0].append(answer[i][0]) + for t in range(1,6): + if len(answer[i][t]) == 0: + max_t[t-1] = "n/a" + diff_t[t-1] = "n/a" + list_of_compare[t].append(0); + else: + if t < 3: + mm = max(answer[i][t]) + else: + mm = min(answer[i][t]) + list_of_compare[t].append(mm) + max_t[t-1] = '%.2f' % mm + list_of_max[t-1].append(mm) + diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) + print_debug("%s:\n" % answer[i][0], s, perf_log) + print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4]), s, perf_log) + print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4]), s, perf_log) + for t in range(0,5): + if max_t[t] == "n/a": + max_t[t] = "" + if diff_t[t] == "n/a": + diff_t[t] = "" + filelist.append(answer[i][0] + "," + + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + + max_t[4] + "," + diff_t[4] + "\n") + for i in range(0,5): + geomean_t[i] = geomean(list_of_max[i]) + print_debug("---------------------------------------------------------------------------------\n", s, perf_log) + print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) + filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") + print_file(filelist) + return list_of_compare + + +def compare(A, B): + print_debug("\n\n_____________________PERFORMANCE REPORT____________________________\n", False, "") + print_debug("test name: ISPC time: ISPC time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[3][i] == 0: + p1 = 0 + else: + p1 = 100 - 100 * A[3][i]/B[3][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[3][i], B[3][i], p1), False, "") + if p1 < -1: + print_debug(" <-", False, "") + if p1 > 1: + print_debug(" <+", False, "") + print_debug("\n", False, "") + print_debug("\n", False, "") + + print_debug("test name: TASKS time: TASKS time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[4][i] == 0: + p2 = 0 + else: + p2 = 100 - 100 * A[4][i]/B[4][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], p2), False, "") + if p2 < -1: + print_debug(" <-", False, "") + if p2 > 1: + print_debug(" <+", False, "") + print_debug("\n", False, "") + if "performance.log" in options.in_file: + print_debug("\n\n_________________Watch performance.log for 
details________________\n", False, "") + else: + print_debug("\n\n__________________________________________________________________\n", False, "") + + + +def perf(options1, args): + global options + options = options1 + global s + s = options.silent + + # save current OS + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + global is_mac + is_mac = (platform.system() == 'Darwin') + + # save current path + pwd = os.getcwd() + pwd = pwd + os.sep + pwd1 = pwd + if is_windows: + pwd1 = "..\\..\\" + + # check if cpu usage is low now + cpu_percent = cpu_check() + if cpu_percent > 20: + error("CPU Usage is very high.\nClose other applications.\n", 2) + + global ispc_test + global ispc_ref + global ref_compiler + global refc_compiler + # check that required compilers exist + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + ispc_test_exists = False + ispc_ref_exists = False + ref_compiler_exists = False + if is_windows == False: + ispc_test = "ispc" + ref_compiler = "g++" + refc_compiler = "gcc" + if options.compiler != "": + if options.compiler == "clang" or options.compiler == "clang++": + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler == "icc" or options.compiler == "icpc": + ref_compiler = "icpc" + refc_compiler = "icc" + else: + ispc_test = "ispc.exe" + ref_compiler = "cl.exe" + ispc_ref = options.ref + if options.ref != "": + options.ref = True + for counter in PATH_dir: + if os.path.exists(counter + os.sep + ispc_test): + ispc_test_exists = True + if os.path.exists(counter + os.sep + ref_compiler): + ref_compiler_exists = True + if os.path.exists(counter + os.sep + ispc_ref): + ispc_ref_exists = True + if not ispc_test_exists: + error("ISPC compiler not found.\nAdded path to ispc compiler to your PATH variable.\n", 1) + if not ref_compiler_exists: + error("C/C++ compiler %s not found.\nAdded path to %s compiler to your PATH variable.\n" % (ref_compiler, ref_compiler), 1) + if options.ref: + if not ispc_ref_exists: + error("ISPC reference compiler not found.\nAdded path to ispc reference compiler to your PATH variable.\n", 1) + + # checks that config file exists + path_config = os.path.normpath(options.config) + if os.path.exists(path_config) == False: + error("config file not found: %s.\nSet path to your config file in --config.\n" % options.config, 1) + sys.exit() + + # read lines from config file except comments + f = open(path_config, 'r') + f_lines = f.readlines() + f.close() + lines =[] + for i in range(len(f_lines)): + if f_lines[i][0] != "%": + lines.append(f_lines[i]) + length = len(lines) + + # prepare build.log, perf_temp and perf.log files + global perf_log + if options.in_file: + perf_log = pwd + options.in_file + common.remove_if_exists(perf_log) + else: + perf_log = "" + global build_log + build_log = pwd + os.sep + "logs" + os.sep + "perf_build.log" + common.remove_if_exists(build_log) + if os.path.exists(pwd + os.sep + "logs") == False: + os.makedirs(pwd + os.sep + "logs") + + global perf_temp + perf_temp = pwd + "perf_temp" + # end of preparations + + print_debug("Okey go go go!\n\n", s, perf_log) + + #print compilers versions + common.print_version(ispc_test, ispc_ref, ref_compiler, False, perf_log, is_windows) + + # begin + i = 0 + answer = [] + answer_ref = [] + + # loop for all tests + while i < length-2: + # we read name of test + print_debug("%s" % lines[i], s, perf_log) + test = [lines[i][:-1],[],[],[],[],[]] + test_ref = [lines[i][:-1],[],[],[],[],[]] + # read location of test + 
folder = lines[i+1] + folder = folder[:-1] + folder = os.path.normpath(options.path + os.sep + "examples" + os.sep + folder) + # check that test exists + if os.path.exists(folder) == False: + error("Can't find test %s. Your path is: \"%s\".\nChange current location to ISPC_HOME or set path to ISPC_HOME in --path.\n" % + (lines[i][:-1], options.path), 1) + os.chdir(folder) + # read parameters of test + command = lines[i+2] + command = command[:-1] + if is_windows == False: + ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" + ex_command = "./test " + command + " >> " + perf_temp + "_test" + bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+" >> "+build_log+" 2>> "+build_log + bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+" >> "+build_log+" 2>> "+build_log + re_command = "make clean >> "+build_log + else: + ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" + ex_command = "x64\\Release\\test.exe " + command + " >> " + perf_temp + "_test" + bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /t:rebuild >> " + build_log + bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /t:rebuild >> " + build_log + re_command = "msbuild /t:clean >> " + build_log + commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] + # parsing config parameters + next_line = lines[i+3] + if next_line[0] == "!": # we should take only one part of test output + R = next_line.split(' ') + c1 = int(R[1]) #c1 is a number of string which we want to use in test output + c2 = int(R[2]) #c2 is total number of strings in test output + i = i+1 + else: + c1 = 1 + c2 = 1 + next_line = lines[i+3] + if next_line[0] == "^": #we should concatenate result of this test with previous one + run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False) + i = i+1 + else: #we run this test and append it's result to answer structure + run_test(commands, c1, c2, test, test_ref, True) + answer.append(test) + answer_ref.append(test_ref) + + # preparing next loop iteration + os.chdir(pwd1) + i+=4 + + # delete temp file + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + + #print collected answer + print_debug("\n\nTEST COMPILER:\n", s, perf_log) + A = print_answer(answer) + if options.ref != "": + print_debug("\n\nREFERENCE COMPILER:\n", s, perf_log) + B = print_answer(answer_ref) + # print perf report + compare(A,B) + + + +###Main### +from optparse import OptionParser +import sys +import os +import operator +import time +import glob +import string +import platform +# our functions +import common +print_debug = common.print_debug +error = common.error + +if __name__ == "__main__": + # parsing options + parser = OptionParser() + parser.add_option('-n', '--number', dest='number', + help='number of repeats', default="3") + parser.add_option('-c', '--config', dest='config', + help='config file of tests', default="./perf.ini") + parser.add_option('-p', '--path', dest='path', + help='path to test_system directory', default=".") + parser.add_option('-s', '--silent', dest='silent', + help='silent mode, only table output', default=False, action="store_true") + parser.add_option('-o', '--output', dest='output', + help='output file for script reading', default="") + parser.add_option('--compiler', dest='compiler', + help='C/C++ compiler', 
default="") + parser.add_option('-r', '--ref', dest='ref', + help='set reference compiler for compare', default="") + parser.add_option('-f', '--file', dest='in_file', + help='file to save perf output', default="") + (options, args) = parser.parse_args() + perf(options, args) diff --git a/run_tests.py b/run_tests.py index 9729930f..2471b6cb 100755 --- a/run_tests.py +++ b/run_tests.py @@ -1,165 +1,37 @@ #!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # test-running driver for ispc - -from optparse import OptionParser -import multiprocessing -from ctypes import c_int -import os -import sys -import glob -import re -import signal -import random -import string -import subprocess -import shlex -import platform -import tempfile -import os.path -import time - -# disable fancy error/warning printing with ANSI colors, so grepping for error -# messages doesn't get confused -os.environ["TERM"] = "dumb" - -# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard -# git history has a workaround for that issue. 
- -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) - -parser = OptionParser() -parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", - default=False, action="store_true") -parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", - default=None) -parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", - default="") -parser.add_option('-t', '--target', dest='target', - help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', - default="sse4") -parser.add_option('-a', '--arch', dest='arch', - help='Set architecture (arm, x86, x86-64)', - default="x86-64") -parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary to use to run tests", - default=None) -parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', - default=False, action="store_true") -parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', - default="1024", type="int") -parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', - default=False, action="store_true") -parser.add_option('--wrap-exe', dest='wrapexe', - help='Executable to wrap test runs with (e.g. "valgrind")', - default="") -parser.add_option('--time', dest='time', help='Enable time output', - default=False, action="store_true") -parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', - default=False, action="store_true") - -(options, args) = parser.parse_args() - -if options.target == 'neon': - options.arch = 'arm' - -# use relative path to not depend on host directory, which may possibly -# have white spaces and unicode characters. 
-if not is_windows: - ispc_exe = "./ispc" -else: - ispc_exe = ".\\Release\\ispc.exe" - -# checks the required ispc compiler otherwise prints an error message -if not os.path.exists(ispc_exe): - sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe) - sys.exit() - -ispc_exe += " " + options.ispc_flags - -if __name__ == '__main__': - sys.stdout.write("ispc compiler: %s\n" % ispc_exe) - -is_generic_target = (options.target.find("generic-") != -1 and - options.target != "generic-1") -if is_generic_target and options.include_file == None: - if options.target == "generic-4": - sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n") - options.include_file = "examples/intrinsics/sse4.h" - elif options.target == "generic-8": - sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n") - sys.exit(1) - elif options.target == "generic-16": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n") - options.include_file = "examples/intrinsics/generic-16.h" - elif options.target == "generic-32": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-32.h\n") - options.include_file = "examples/intrinsics/generic-32.h" - elif options.target == "generic-64": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-64.h\n") - options.include_file = "examples/intrinsics/generic-64.h" - -if options.compiler_exe == None: - if is_windows: - options.compiler_exe = "cl.exe" - else: - options.compiler_exe = "g++" - -# checks the required compiler otherwise prints an error message -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False - -for counter in PATH_dir: - if os.path.exists(counter + os.sep + options.compiler_exe): - compiler_exists = True - break - -if not compiler_exists: - sys.stderr.write("Fatal error: missing the required compiler: %s \n" % - options.compiler_exe) - sys.exit() - -ispc_root = "." - -# if no specific test files are specified, run all of the tests in tests/, -# failing_tests/, and tests_errors/ -if len(args) == 0: - files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") -else: - if is_windows: - argfiles = [ ] - for f in args: - # we have to glob ourselves if this is being run under a DOS - # shell, as it passes wildcard as is. - argfiles += glob.glob(f) - else: - argfiles = args - - files = [ ] - for f in argfiles: - if os.path.splitext(string.lower(f))[1] != ".ispc": - sys.stdout.write("Ignoring file %s, which doesn't have an .ispc extension.\n" % f) - else: - files += [ f ] - -# max_test_length is used to issue exact number of whitespace characters when -# updating status. Otherwise update causes new lines standard 80 char terminal -# on both Linux and Windows. -max_test_length = 0 -for f in files: - max_test_length = max(max_test_length, len(f)) - -# randomly shuffle the tests if asked to do so -if (options.random): - random.seed() - random.shuffle(files) - -# counter -total_tests = 0 - - # utility routine to print an update on the number of tests that have been # finished. Should be called with the lock held.. 
 def update_progress(fn, total_tests_arg, counter, max_test_length_arg):
@@ -176,7 +48,7 @@ def run_command(cmd):
     if options.verbose:
-        sys.stdout.write("Running: %s\n" % cmd)
+        print_debug("Running: %s\n" % cmd, s, run_tests_log)
 
     # Here's a bit tricky part. To pass a command for execution we should
     # break down the line in to arguments. shlex class is designed exactly
@@ -204,9 +76,9 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
         (return_code, output) = run_command(cmd)
         compile_failed = (return_code != 0)
         if compile_failed:
-            sys.stdout.write("Compilation of test %s failed \n" % filename)
+            print_debug("Compilation of test %s failed \n" % filename, s, run_tests_log)
             if output != "":
-                sys.stdout.write("%s" % output.encode("utf-8"))
+                print_debug("%s" % output.encode("utf-8"), s, run_tests_log)
             return (1, 0)
 
     (return_code, output) = run_command(run_cmd)
@@ -215,11 +87,11 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
     surprise = ((expect_failure and not run_failed) or
                 (not expect_failure and run_failed))
     if surprise == True:
-        sys.stderr.write("Test %s %s (return code %d) \n" % \
+        print_debug("Test %s %s (return code %d) \n" % \
            (filename, "unexpectedly passed" if expect_failure else "failed",
-            return_code))
+            return_code), s, run_tests_log)
     if output != "":
-        sys.stdout.write("%s\n" % output.encode("utf-8"))
+        print_debug("%s\n" % output.encode("utf-8"), s, run_tests_log)
     if surprise == True:
         return (0, 1)
     else:
@@ -298,11 +170,11 @@ def run_test(testname):
         file.close()
 
         if re.search(firstline, output) == None:
-            sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \
-                (firstline, testname, output))
+            print_debug("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \
+                (firstline, testname, output), s, run_tests_log)
             return (1, 0)
         elif got_error == False:
-            sys.stderr.write("Unexpectedly no errors issued from test %s\n" % testname)
+            print_debug("Unexpectedly no errors issued from test %s\n" % testname, s, run_tests_log)
             return (1, 0)
         else:
             return (0, 0)
@@ -328,8 +200,7 @@ def run_test(testname):
                 break
         file.close()
         if match == -1:
-            sys.stderr.write("Fatal error: unable to find function signature " + \
-                "in test %s\n" % testname)
+            error("unable to find function signature in test %s\n" % testname, 0)
            return (1, 0)
         else:
             global is_generic_target
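The hunk below threads a glob_var list into each worker process. The reason is that Windows starts multiprocessing workers by spawning a fresh interpreter, so module globals set in the parent after import are not inherited the way they are under fork; passing them explicitly through args is the portable fix. A minimal, self-contained sketch of the pattern (names purely illustrative, not from the patch):

    import multiprocessing

    def worker(q, glob_var):
        # unpack state that a fork would have inherited implicitly
        is_windows, log_name = glob_var
        q.put((is_windows, log_name))

    if __name__ == "__main__":
        q = multiprocessing.Queue()
        glob_var = [False, "run_tests_log.txt"]
        p = multiprocessing.Process(target=worker, args=(q, glob_var))
        p.start()
        print(q.get())  # state arrives even under the spawn start method
        p.join()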
@@ -404,7 +275,21 @@ def run_test(testname):
 # pull tests to run from the given queue and run them. Multiple copies of
 # this function will be running in parallel across all of the CPU cores of
-# the system.
-def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex):
+# the system.
+def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex, glob_var):
+    # This is needed on Windows because Windows doesn't copy globals from the parent process when multiprocessing
+    global is_windows
+    is_windows = glob_var[0]
+    global options
+    options = glob_var[1]
+    global s
+    s = glob_var[2]
+    global ispc_exe
+    ispc_exe = glob_var[3]
+    global is_generic_target
+    is_generic_target = glob_var[4]
+    global run_tests_log
+    run_tests_log = glob_var[5]
+
     if is_windows:
         tmpdir = "tmp%d" % os.getpid()
         os.mkdir(tmpdir)
@@ -454,7 +339,256 @@ def sigint(signum, frame):
         t.terminate()
     sys.exit(1)
 
-if __name__ == '__main__':
+
+def file_check(compfails, runfails):
+    errors = len(compfails) + len(runfails)
+    new_compfails = []
+    new_runfails = []
+    new_passes_compfails = []
+    new_passes_runfails = []
+# Open file fail_db.txt
+    f = open(test_states, 'r')
+    f_lines = f.readlines()
+    f.close()
+# Detect OS
+    if platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system():
+        OS = "Windows"
+    else:
+        if platform.system() == 'Darwin':
+            OS = "Mac"
+        else:
+            OS = "Linux"
+# Detect opt_set
+    if options.no_opt == True:
+        opt = "-O0"
+    else:
+        opt = "-O2"
+# Detect LLVM version
+    temp1 = common.detect_version(ispc_exe + " --version")
+    llvm_version = temp1[-10:-2]
+# Detect compiler version
+    if is_windows == False:
+        temp1 = common.detect_version(options.compiler_exe + " --version")
+        temp2 = temp1.split(" ")
+        compiler_version = temp2[0] + temp2[2][0:4]
+    else:
+        compiler_version = "cl"
+    new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n"
+
+    new_compfails = compfails[:]
+    new_runfails = runfails[:]
+    new_f_lines = f_lines[:]
+    for j in range(0, len(f_lines)):
+        if (((" "+options.arch+" ") in f_lines[j]) and
+            ((" "+options.target+" ") in f_lines[j]) and
+            ((" "+OS+" ") in f_lines[j]) and
+            ((" "+llvm_version+" ") in f_lines[j]) and
+            ((" "+compiler_version+" ") in f_lines[j]) and
+            ((" "+opt+" ") in f_lines[j])):
+            if (" compfail " in f_lines[j]):
+                f = 0
+                for i in range(0, len(compfails)):
+                    if compfails[i] in f_lines[j]:
+                        new_compfails.remove(compfails[i])
+                    else:
+                        f = f + 1
+                if f == len(compfails):
+                    temp3 = f_lines[j].split(" ")
+                    new_passes_compfails.append(temp3[0])
+                    if options.update == "FP":
+                        new_f_lines.remove(f_lines[j])
+            if (" runfail " in f_lines[j]):
+                f = 0
+                for i in range(0, len(runfails)):
+                    if runfails[i] in f_lines[j]:
+                        new_runfails.remove(runfails[i])
+                    else:
+                        f = f + 1
+                if f == len(runfails):
+                    temp3 = f_lines[j].split(" ")
+                    new_passes_runfails.append(temp3[0])
+                    if options.update == "FP":
+                        new_f_lines.remove(f_lines[j])
+    if len(new_runfails) != 0:
+        print_debug("NEW RUNFAILS:\n", s, run_tests_log)
+        for i in range (0,len(new_runfails)):
+            new_f_lines.append(new_runfails[i] + " runfail " + new_line)
+            print_debug("\t" + new_runfails[i] + "\n", s, run_tests_log)
+    if len(new_compfails) != 0:
+        print_debug("NEW COMPFAILS:\n", s, run_tests_log)
+        for i in range (0,len(new_compfails)):
+            new_f_lines.append(new_compfails[i] + " compfail " + new_line)
+            print_debug("\t" + new_compfails[i] + "\n", s, run_tests_log)
+    if len(new_passes_runfails) != 0:
+        print_debug("NEW PASSES after RUNFAILS:\n", s, run_tests_log)
+        for i in range (0,len(new_passes_runfails)):
+            print_debug("\t" + new_passes_runfails[i] + "\n", s, run_tests_log)
+ if len(new_passes_compfails) != 0: + print_debug("NEW PASSES after COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_compfails)): + print_debug("\t" + new_passes_compfails[i] + "\n", s, run_tests_log) + + if options.update != "": + output = open(test_states, 'w') + output.writelines(new_f_lines) + output.close() + return [new_runfails, new_compfails, new_passes_runfails, new_passes_compfails, new_line, errors] + +def verify(): + # Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() + check = [["g++", "clang", "cl"],["-O0", "-O2"],["x86","x86-64"], + ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], + ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1.1-i32x8", "avx1.1-i32x16", + "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", + "generic-16", "generic-32", "generic-64"]] + for i in range (0,len(f_lines)): + if f_lines[i][0] == "%": + continue + for j in range(0,len(check)): + temp = 0 + for t in range(0,len(check[j])): + if " " + check[j][t] + " " in f_lines[i]: + temp = temp + 1 + if temp != 1: + print_debug("error in line " + str(i) + "\n", False, run_tests_log) + break + + +def run_tests(options1, args, print_version): + global options + options = options1 + global s + s = options.silent + + # prepare run_tests_log and test_states files + global run_tests_log + if options.in_file: + run_tests_log = os.getcwd() + os.sep + options.in_file + if print_version == 1: + common.remove_if_exists(run_tests_log) + else: + run_tests_log = "" + global test_states + test_states = "fail_db.txt" + if options.verify: + verify() + return 0 + + # disable fancy error/warning printing with ANSI colors, so grepping for error + # messages doesn't get confused + os.environ["TERM"] = "dumb" + + # This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard + # git history has a workaround for that issue. + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + + if options.target == 'neon': + options.arch = 'arm' + + # use relative path to not depend on host directory, which may possibly + # have white spaces and unicode characters. 
+ global ispc_exe + if not is_windows: + ispc_exe = "./ispc" + else: + ispc_exe = ".\\Release\\ispc.exe" + + # checks the required ispc compiler otherwise prints an error message + if not os.path.exists(ispc_exe): + error("missing ispc compiler: %s\n" % ispc_exe, 1) + ispc_exe += " " + options.ispc_flags + print_debug("ispc compiler: %s\n" % ispc_exe, s, run_tests_log) + + global is_generic_target + is_generic_target = (options.target.find("generic-") != -1 and + options.target != "generic-1" and options.target != "generic-x1") + if is_generic_target and options.include_file == None: + if options.target == "generic-4" or options.target == "generic-x4": + error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2) + options.include_file = "examples/intrinsics/sse4.h" + options.target = "generic-4" + elif options.target == "generic-8" or options.target == "generic-x8": + error("No generics #include specified and no default available for \"generic-8\" target.\n", 1) + options.target = "generic-8" + elif options.target == "generic-16" or options.target == "generic-x16": + error("No generics #include specified; using examples/intrinsics/generic-16.h\n", 2) + options.include_file = "examples/intrinsics/generic-16.h" + options.target = "generic-16" + elif options.target == "generic-32" or options.target == "generic-x32": + error("No generics #include specified; using examples/intrinsics/generic-32.h\n", 2) + options.include_file = "examples/intrinsics/generic-32.h" + options.target = "generic-32" + elif options.target == "generic-64" or options.target == "generic-x64": + error("No generics #include specified; using examples/intrinsics/generic-64.h\n", 2) + options.include_file = "examples/intrinsics/generic-64.h" + options.target = "generic-64" + + if options.compiler_exe == None: + if is_windows: + options.compiler_exe = "cl.exe" + else: + options.compiler_exe = "g++" + + # checks the required compiler otherwise prints an error message + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + compiler_exists = False + + for counter in PATH_dir: + if os.path.exists(counter + os.sep + options.compiler_exe): + compiler_exists = True + break + + if not compiler_exists: + error("missing the required compiler: %s \n" % options.compiler_exe, 1) + + # print compilers versions + if print_version > 0: + common.print_version(ispc_exe, "", options.compiler_exe, False, run_tests_log, is_windows) + + ispc_root = "." + + # if no specific test files are specified, run all of the tests in tests/, + # failing_tests/, and tests_errors/ + if len(args) == 0: + files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ + glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ + glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") + else: + if is_windows: + argfiles = [ ] + for f in args: + # we have to glob ourselves if this is being run under a DOS + # shell, as it passes wildcard as is. + argfiles += glob.glob(f) + else: + argfiles = args + + files = [ ] + for f in argfiles: + if os.path.splitext(string.lower(f))[1] != ".ispc": + error("Ignoring file %s, which doesn't have an .ispc extension.\n" % f, 2) + else: + files += [ f ] + + # max_test_length is used to issue exact number of whitespace characters when + # updating status. Otherwise update causes new lines standard 80 char terminal + # on both Linux and Windows. 
+ max_test_length = 0 + for f in files: + max_test_length = max(max_test_length, len(f)) + + # randomly shuffle the tests if asked to do so + if (options.random): + random.seed() + random.shuffle(files) + + # counter total_tests = len(files) compile_error_files = [ ] @@ -463,7 +597,7 @@ if __name__ == '__main__': nthreads = min(multiprocessing.cpu_count(), options.num_jobs) nthreads = min(nthreads, len(files)) - sys.stdout.write("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests)) + print_debug("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests), s, run_tests_log) # put each of the test filenames into a queue q = multiprocessing.Queue() @@ -483,8 +617,10 @@ if __name__ == '__main__': start_time = time.time() # launch jobs to run tests + glob_var = [is_windows, options, s, ispc_exe, is_generic_target, run_tests_log] for x in range(nthreads): - t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, max_test_length, finished_tests_counter, finished_tests_counter_lock)) + t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, + max_test_length, finished_tests_counter, finished_tests_counter_lock, glob_var)) task_threads.append(t) t.start() @@ -493,35 +629,97 @@ if __name__ == '__main__': for t in task_threads: t.join() if options.non_interactive == False: - sys.stdout.write("\n") + print_debug("\n", s, run_tests_log) elapsed_time = time.time() - start_time while not qret.empty(): - (c, r, s) = qret.get() + (c, r, skip) = qret.get() compile_error_files += c run_error_files += r - skip_files += s + skip_files += skip if options.non_interactive: - sys.stdout.write(" Done %d / %d\n" % (finished_tests_counter.value, total_tests)) + print_debug(" Done %d / %d\n" % (finished_tests_counter.value, total_tests), s, run_tests_log) if len(skip_files) > 0: skip_files.sort() - sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests)) + print_debug("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests), s, run_tests_log) for f in skip_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(compile_error_files) > 0: compile_error_files.sort() - sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests)) + print_debug("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests), s, run_tests_log) for f in compile_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(run_error_files) > 0: run_error_files.sort() - sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests)) + print_debug("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests), s, run_tests_log) for f in run_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) + + R = file_check(compile_error_files, run_error_files) if options.time: - sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) + print_debug("Elapsed time: %d s\n" % elapsed_time, s, run_tests_log) - sys.exit(len(compile_error_files) + len(run_error_files)) + return R + + +from optparse import OptionParser +import multiprocessing +from ctypes import c_int +import os +import sys +import glob +import re +import signal +import random +import string +import subprocess +import shlex +import platform +import tempfile +import os.path +import time +# our functions +import common +print_debug = common.print_debug +error = 
common.error + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", + default=False, action="store_true") + parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", + default=None) + parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", + default="") + parser.add_option('-t', '--target', dest='target', + help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', + default="sse4") + parser.add_option('-a', '--arch', dest='arch', + help='Set architecture (arm, x86, x86-64)', + default="x86-64") + parser.add_option("-c", "--compiler", dest="compiler_exe", help="C/C++ compiler binary to use to run tests", + default=None) + parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', + default=False, action="store_true") + parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', + default="1024", type="int") + parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', + default=False, action="store_true") + parser.add_option('--wrap-exe', dest='wrapexe', + help='Executable to wrap test runs with (e.g. "valgrind")', + default="") + parser.add_option('--time', dest='time', help='Enable time output', + default=False, action="store_true") + parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', + default=False, action="store_true") + parser.add_option('-u', "--update", dest='update', help='Update file with fails (F of FP)', default="") + parser.add_option('-s', "--silent", dest='silent', help='enable silent mode without any output', default=False, + action = "store_true") + parser.add_option("--file", dest='in_file', help='file to save run_tests output', default="") + parser.add_option("--verify", dest='verify', help='verify the file fail_db.txt', default=False, action="store_true") + (options, args) = parser.parse_args() + L = run_tests(options, args, 1) + exit(0) From f45f6cb32a390d834e53037751365cd1932929e3 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 17 Sep 2013 23:36:16 +0400 Subject: [PATCH 030/159] Test, documentation and vim support for double precision constants --- contrib/ispc.vim | 5 +++++ docs/ispc.rst | 11 ++++++++++- tests/double-consts.ispc | 23 +++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 tests/double-consts.ispc diff --git a/contrib/ispc.vim b/contrib/ispc.vim index cc8493f0..4d870dcd 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -19,6 +19,11 @@ syn keyword ispcRepeat cdo cfor cwhile syn keyword ispcBuiltin programCount programIndex syn keyword ispcType export uniform varying int8 int16 int32 int64 +"double precision floating point number, with dot, optional exponent +syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" +"double precision floating point number, without dot, with exponent +syn match cFloat display contained "\d\+d[-+]\=\d\+\>" + " Default highlighting command -nargs=+ HiLink hi def link HiLink ispcStatement Statement diff --git a/docs/ispc.rst b/docs/ispc.rst index ff07f6d8..224faaa9 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,6 
+270,14 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``,
and ``in``. Any program that happens to have a variable or function with
one of these names must be modified to rename that symbol.

+Updating ISPC Programs For Changes In ISPC 1.4.5
+------------------------------------------------
+
+This release adds support for double precision floating point constants.
+A double precision floating point constant is a floating point number with
+a ``d`` suffix and an optional exponent part. Here are some examples: 3.14d,
+31.4d-1, 1.d, 1.0d, 1d-2. Note that a floating point number without a suffix
+is treated as a single precision constant.

Getting Started with ISPC
=========================
@@ -1349,7 +1357,8 @@ but are likely to be supported in future releases:

* Bitfield members of ``struct`` types
* Variable numbers of arguments to functions
* Literal floating-point constants (even without a ``f`` suffix) are
-  currently treated as being ``float`` type, not ``double``
+  currently treated as being ``float`` type, not ``double``. To get a double
+  precision floating point constant, use the ``d`` suffix.
* The ``volatile`` qualifier
* The ``register`` storage class for variables. (Will be ignored).

diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc
new file mode 100644
index 00000000..3259156a
--- /dev/null
+++ b/tests/double-consts.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    // Test parsing of double constants.
+    double d1 = 1.0d40;
+    double d2 = 1.d40;
+    double d3 = 1d40;
+    double d4 = 10000000000000000000000000000000000000000.d;
+    double d5 = 10000000000000000000000000000000000000000.0d;
+
+    // All the constants should be equal and if it's evaluated as "float",
+    // then sqrt will evaluate to +inf.
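+    // (For reference: 1e40 does not fit in single precision, since FLT_MAX
+    // is about 3.4e38, so a "float" evaluation of these constants overflows
+    // to +inf, and sqrt(+inf) is +inf, which is not < 2e20. In double
+    // precision, sqrt(1d40) == 1e20, which passes the comparison.)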
+ if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && + ((float)sqrt(d1)) < 2e20) { + RET[programIndex] = a; + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} From 1c527ae34cf7c257f8deaf0261af447b238cab56 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 18 Sep 2013 11:48:24 +0400 Subject: [PATCH 031/159] Adding tests and vim support for double constant of the form .1d41 --- contrib/ispc.vim | 2 ++ tests/double-consts.ispc | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/contrib/ispc.vim b/contrib/ispc.vim index 4d870dcd..f3cb413b 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -21,6 +21,8 @@ syn keyword ispcType export uniform varying int8 int16 int32 int64 "double precision floating point number, with dot, optional exponent syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" +"double precision floating point number, starting with dot, optional exponent +syn match cFloat display contained ".\d*d[-+]\=\d*\>" "double precision floating point number, without dot, with exponent syn match cFloat display contained "\d\+d[-+]\=\d\+\>" diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc index 3259156a..4096aa1c 100644 --- a/tests/double-consts.ispc +++ b/tests/double-consts.ispc @@ -7,12 +7,13 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { double d1 = 1.0d40; double d2 = 1.d40; double d3 = 1d40; - double d4 = 10000000000000000000000000000000000000000.d; - double d5 = 10000000000000000000000000000000000000000.0d; + double d4 = .1d41; + double d5 = 10000000000000000000000000000000000000000.d; + double d6 = 10000000000000000000000000000000000000000.0d; // All the constants should be equal and if it's evaluated as "float", // then sqrt will evaluate to +inf. 
-    if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 &&
+    if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6
        ((float)sqrt(d1)) < 2e20) {
        RET[programIndex] = a;
    }
From bb8f7d4e3f2a226a8f4b7b7ae2de6fce7d609791 Mon Sep 17 00:00:00 2001
From: Ilia Filippov
Date: Thu, 19 Sep 2013 14:37:26 +0400
Subject: [PATCH 032/159] removing LLVM 3.1 and 3.2 from default testing

---
 alloy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alloy.py b/alloy.py
index 67f534ca..06025324 100755
--- a/alloy.py
+++ b/alloy.py
@@ -367,7 +367,7 @@ def validation_run(only, only_targets, reference_branch, notify, update):
     if len(archs) == 0:
         archs = ["x86", "x86-64"]
     if len(LLVM) == 0:
-        LLVM = ["3.1", "3.2", "3.3", "head"]
+        LLVM = ["3.3", "head"]
     gen_archs = ["x86-64"]
     need_LLVM = check_LLVM(LLVM)
     for i in range(0,len(need_LLVM)):
From 6a21218c13aa14666d11150c265f542afd79818e Mon Sep 17 00:00:00 2001
From: evghenii
Date: Thu, 19 Sep 2013 13:45:31 +0300
Subject: [PATCH 033/159] fix warning and add KNC 1

---
 examples/intrinsics/knc-i1x16.h | 4 ++--
 examples/intrinsics/knc-i1x8.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h
index b7d3a7f1..c535e61a 100644
--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -45,13 +45,13 @@
 #define roundf(x) (floorf(x + .5f))
 #define round(x) (floor(x + .5))
 #else
-#define FORCEINLINE __attribute__((always_inline))
+#define FORCEINLINE __forceinline
 #define PRE_ALIGN(x)
 #define POST_ALIGN(x) __attribute__ ((aligned(x)))
 #endif

-#if 0
 #define KNC 1
+#if 0
 extern "C"
 {
 int printf(const unsigned char *, ...);
diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h
index de9bddcc..573d232c 100644
--- a/examples/intrinsics/knc-i1x8.h
+++ b/examples/intrinsics/knc-i1x8.h
@@ -50,13 +50,13 @@
 #define roundf(x) (floorf(x + .5f))
 #define round(x) (floor(x + .5))
 #else
-#define FORCEINLINE __attribute__((always_inline))
+#define FORCEINLINE __forceinline
 #define PRE_ALIGN(x)
 #define POST_ALIGN(x) __attribute__ ((aligned(x)))
 #endif

-#if 0
 #define KNC 1
+#if 0
 extern "C"
 {
 int printf(const unsigned char *, ...);
From 43245bbc118c1b415c9c538c98555fc110ad1f3c Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Wed, 18 Sep 2013 14:24:46 +0400
Subject: [PATCH 034/159] Adding check for OS AVX support to auto-dispatch code

---
 builtins/dispatch.ll | 81 +++++++++++++++++++++++++++-----------------
 1 file changed, 49 insertions(+), 32 deletions(-)

diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll
index f1d5a969..ba216df7 100644
--- a/builtins/dispatch.ll
+++ b/builtins/dispatch.ll
@@ -1,4 +1,4 @@
-;; Copyright (c) 2011, Intel Corporation
+;; Copyright (c) 2011-2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
@@ -41,15 +41,13 @@
 @__system_best_isa = internal global i32 -1

-declare void @abort() noreturn
-
 ;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
 ;; following code... Specifically, __get_system_isa should return a value
 ;; corresponding to one of the Target::ISA enumerant values that gives the
 ;; most capable ISA that the current system can run.
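 ;;
 ;; (For reference, a sketch of the CPUID/XCR0 bits tested below; see the
 ;; Intel SDM for the authoritative definitions:
 ;;    CPUID.1:ECX bit 19 (1 << 19 == 524288)    -> SSE4.1
 ;;    CPUID.1:ECX bit 28 (1 << 28 == 268435456) -> AVX
 ;;    CPUID.1:ECX bits 29 and 30                -> F16C and RDRAND
 ;;    CPUID.1:EDX bit 26 (1 << 26 == 67108864)  -> SSE2
 ;;    CPUID.7:EBX bit 5                         -> AVX2
 ;; The xgetbv result is compared against the mask 6 because XCR0 bit 1
 ;; (XMM state) and bit 2 (YMM state) must both be enabled by the OS before
 ;; the AVX registers are safe to use.)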
;; -;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum -;; backwards compatibility for anyone building ispc with LLVM 3.0 +;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum +;; backwards compatibility for anyone building ispc with LLVM 3.1 ;; ;; #include ;; #include @@ -60,7 +58,7 @@ declare void @abort() noreturn ;; : "0" (infoType)); ;; } ;; -;; /* Save %ebx in case it's the PIC register */ +;; // Save %ebx in case it's the PIC register. ;; static void __cpuid_count(int info[4], int level, int count) { ;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t" ;; "cpuid\n\t" @@ -69,13 +67,23 @@ declare void @abort() noreturn ;; : "0" (level), "2" (count)); ;; } ;; +;; static int __os_has_avx_support() { +;; // Check xgetbv; this uses a .byte sequence instead of the instruction +;; // directly because older assemblers do not include support for xgetbv and +;; // there is no easy way to conditionally compile based on the assembler used. +;; int rEAX, rEDX; +;; __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); +;; return (rEAX & 6) == 6; +;; } +;; ;; int32_t __get_system_isa() { ;; int info[4]; ;; __cpuid(info, 1); ;; -;; /* NOTE: the values returned below must be the same as the -;; corresponding enumerant values in Target::ISA. */ -;; if ((info[2] & (1 << 28)) != 0) { +;; // NOTE: the values returned below must be the same as the +;; // corresponding enumerant values in Target::ISA. +;; if ((info[2] & (1 << 28)) != 0 && +;; __os_has_avx_support()) { ;; if ((info[2] & (1 << 29)) != 0 && // F16C ;; (info[2] & (1 << 30)) != 0) { // RDRAND ;; // So far, so good. AVX2? @@ -98,47 +106,56 @@ declare void @abort() noreturn ;; abort(); ;; } -define i32 @__get_system_isa() nounwind uwtable ssp { +define i32 @__get_system_isa() nounwind uwtable { entry: %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2 %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3 %and = and i32 %asmresult5.i, 268435456 %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.else13, label %if.then + br i1 %cmp, label %if.else14, label %land.lhs.true -if.then: ; preds = %entry - %1 = and i32 %asmresult5.i, 1610612736 - %2 = icmp eq i32 %1, 1610612736 - br i1 %2, label %if.then7, label %return +land.lhs.true: ; preds = %entry + %1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind + %asmresult.i25 = extractvalue { i32, i32 } %1, 0 + %and.i = and i32 %asmresult.i25, 6 + %cmp.i = icmp eq i32 %and.i, 6 + br i1 %cmp.i, label %if.then, label %if.else14 -if.then7: ; preds = %if.then - %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind - %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1 - %and10 = lshr i32 %asmresult4.i28, 5 - %4 = and i32 %and10, 1 - %5 = add i32 %4, 3 +if.then: ; preds = %land.lhs.true + %2 = and i32 %asmresult5.i, 1610612736 + %3 = icmp eq i32 %2, 1610612736 + br i1 %3, label %if.then8, label %return + +if.then8: ; preds = %if.then + %4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind + %asmresult4.i30 = 
extractvalue { i32, i32, i32, i32 } %4, 1 + %and11 = lshr i32 %asmresult4.i30, 5 + %5 = and i32 %and11, 1 + %6 = add i32 %5, 3 br label %return -if.else13: ; preds = %entry - %and15 = and i32 %asmresult5.i, 524288 - %cmp16 = icmp eq i32 %and15, 0 - br i1 %cmp16, label %if.else18, label %return +if.else14: ; preds = %land.lhs.true, %entry + %and16 = and i32 %asmresult5.i, 524288 + %cmp17 = icmp eq i32 %and16, 0 + br i1 %cmp17, label %if.else19, label %return -if.else18: ; preds = %if.else13 - %and20 = and i32 %asmresult6.i, 67108864 - %cmp21 = icmp eq i32 %and20, 0 - br i1 %cmp21, label %if.else23, label %return +if.else19: ; preds = %if.else14 + %and21 = and i32 %asmresult6.i, 67108864 + %cmp22 = icmp eq i32 %and21, 0 + br i1 %cmp22, label %if.else24, label %return -if.else23: ; preds = %if.else18 +if.else24: ; preds = %if.else19 tail call void @abort() noreturn nounwind unreachable -return: ; preds = %if.else18, %if.else13, %if.then7, %if.then - %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ] +return: ; preds = %if.else19, %if.else14, %if.then8, %if.then + %retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ] ret i32 %retval.0 } +declare void @abort() noreturn nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This function is called by each of the dispatch functions we generate; ;; it sets @__system_best_isa if it is unset. From dbef4fd7d7d270e350f8af26f76846ba24deb8a0 Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 14:52:22 +0300 Subject: [PATCH 035/159] fixed notation --- examples/intrinsics/knc-i1x8.h | 17 ++++++----------- examples/intrinsics/knc-i1x8unsafe_fast.h | 2 +- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 573d232c..c17b7238 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -38,11 +38,6 @@ #include #include -#if 0 -#define __ZMM32BIT__ -#endif - - #ifdef _MSC_VER #define FORCEINLINE __forceinline #define PRE_ALIGN(x) /*__declspec(align(x))*/ @@ -110,7 +105,7 @@ struct vec8 { /****************/ -#ifndef __ZMM32BIT__ +#ifndef __ZMM64BIT__ struct PRE_ALIGN(32) __vec8_i32 : public vec8 { __vec8_i32() { } FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, @@ -135,7 +130,7 @@ struct PRE_ALIGN(32) __vec8_i32 : public vec8 { data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); } } POST_ALIGN(32); -#else /* __ZMM32BIT__ */ +#else /* __ZMM64BIT__ */ struct PRE_ALIGN(32) __vec8_i32 { __m512i v; @@ -150,9 +145,9 @@ struct PRE_ALIGN(32) __vec8_i32 FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } } POST_ALIGN(32); -#endif /* __ZMM32BIT__ */ +#endif /* __ZMM64BIT__ */ -#ifndef __ZMM32BIT__ /* __ZMM32BIT__ */ +#ifndef __ZMM64BIT__ /* __ZMM64BIT__ */ PRE_ALIGN(32) struct __vec8_f : public vec8 { __vec8_f() { } FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, @@ -177,7 +172,7 @@ PRE_ALIGN(32) struct __vec8_f : public vec8 { data[7] = val.s[7]; } } POST_ALIGN(32); -#else /* __ZMM32BIT__ */ +#else /* __ZMM64BIT__ */ PRE_ALIGN(32) struct __vec8_f { __m512 v; @@ -192,7 +187,7 @@ PRE_ALIGN(32) struct __vec8_f FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } } POST_ALIGN(32); 
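// (For orientation: with __ZMM64BIT__ defined, an 8-wide vector is held
// directly in a 512-bit zmm register with the upper eight lanes unused;
// otherwise it is a plain 8-element array that is expanded into a zmm
// value on each use. A minimal sketch of that expansion, mirroring the
// conversion operator above; "widen8" is a hypothetical helper name:
//
//   __m512 widen8(const float d[8]) {
//       return _mm512_set_16to16_ps(0,0,0,0, 0,0,0,0,
//                                   d[7],d[6],d[5],d[4],
//                                   d[3],d[2],d[1],d[0]);
//   }
// )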
-#endif /* __ZMM32BIT__ */ +#endif /* __ZMM64BIT__ */ struct PRE_ALIGN(64) __vec8_d { diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h index ce66ea11..2e00a567 100644 --- a/examples/intrinsics/knc-i1x8unsafe_fast.h +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -1,4 +1,4 @@ -#define __ZMM32BIT__ +#define __ZMM64BIT__ #include "knc-i1x8.h" /* the following tests fails because vec8_i32 and vec8_float are 512 and not 256 bit in size. From 0c274212c2104a4547018fd3be31f33e153b82d3 Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 16:07:22 +0300 Subject: [PATCH 036/159] performance tuning for knc-i1x8.h. this gives goed enough performance for double only. float performance is terrible --- examples/intrinsics/knc-i1x8.h | 167 +++++++++++++-------------------- 1 file changed, 64 insertions(+), 103 deletions(-) diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index c17b7238..d7696117 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -73,9 +73,9 @@ typedef int64_t __vec1_i64; struct __vec8_i1 { __vec8_i1() { } - __vec8_i1(const __mmask16 &vv) : v(vv) { } + __vec8_i1(const __mmask8 &vv) : v(vv) { } __vec8_i1(bool v0, bool v1, bool v2, bool v3, - bool v4, bool v5, bool v6, bool v7) { + bool v4, bool v5, bool v6, bool v7) { v = ((v0 & 1) | ((v1 & 1) << 1) | ((v2 & 1) << 2) | @@ -87,7 +87,7 @@ struct __vec8_i1 { } __mmask8 v; - FORCEINLINE operator __mmask8() const { return v; } + FORCEINLINE operator __mmask8() const { return v; }//0xFF & v; } }; @@ -105,89 +105,66 @@ struct vec8 { /****************/ -#ifndef __ZMM64BIT__ -struct PRE_ALIGN(32) __vec8_i32 : public vec8 { +struct PRE_ALIGN(32) __vec8_i32 +{ +#ifdef __ZMM64BIT__ + __m512i _data; + FORCEINLINE __vec8_i32(const __m512i &in) : _data(in) {} + FORCEINLINE operator __m512i() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef int32_t _v8si __attribute__((vector_size(32))); + _v8si _data; + FORCEINLINE __vec8_i32(const __m512i &in) + { + _mm512_mask_extpackstorelo_epi32((__m512i*)&_data, 0xFF, in, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512i() const + { + return _mm512_extloadunpacklo_epi32(_mm512_setzero_epi32(), (uint8_t*)&_data, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + } +#endif /* __ZMM64BIT__ */ + __vec8_i32() { } FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, - int32_t v4, int32_t v5, int32_t v6, int32_t v7) - : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } - FORCEINLINE __vec8_i32(__m512i v) + int32_t v4, int32_t v5, int32_t v6, int32_t v7) { - union { __m512i v; int32_t s[8]; } val = {v}; - data[0] = val.s[0]; - data[1] = val.s[1]; - data[2] = val.s[2]; - data[3] = val.s[3]; - data[4] = val.s[4]; - data[5] = val.s[5]; - data[6] = val.s[6]; - data[7] = val.s[7]; + const __m512i v = _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_i32(v); } - FORCEINLINE operator __m512i() const - { - return _mm512_set_16to16_pi( - 0,0,0,0, 0,0,0,0, - data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); - } -} POST_ALIGN(32); -#else /* __ZMM64BIT__ */ -struct PRE_ALIGN(32) __vec8_i32 -{ - __m512i v; - FORCEINLINE operator __m512i() const { return v; } - FORCEINLINE __vec8_i32() : v(_mm512_undefined_epi32()) {} - FORCEINLINE __vec8_i32(const __m512i &in) : v(in) {} - FORCEINLINE __vec8_i32(const __vec8_i32 &o) : v(o.v) {} - FORCEINLINE __vec8_i32& operator =(const __vec8_i32 &o) { v=o.v; return *this; } - FORCEINLINE 
__vec8_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, - int32_t v04, int32_t v05, int32_t v06, int32_t v07) : - v ( _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, v07, v06, v05, v04, v03, v02, v01, v00) ) {} - FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } - FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } -} POST_ALIGN(32); -#endif /* __ZMM64BIT__ */ -#ifndef __ZMM64BIT__ /* __ZMM64BIT__ */ -PRE_ALIGN(32) struct __vec8_f : public vec8 { - __vec8_f() { } - FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, - float v4, float v5, float v6, float v7) - : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } - FORCEINLINE operator __m512() const - { - return _mm512_set_16to16_ps( - 0,0,0,0,0,0,0,0, - data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); - } - FORCEINLINE __vec8_f(__m512 v) - { - union { __m512 v; float s[8]; } val = {v}; - data[0] = val.s[0]; - data[1] = val.s[1]; - data[2] = val.s[2]; - data[3] = val.s[3]; - data[4] = val.s[4]; - data[5] = val.s[5]; - data[6] = val.s[6]; - data[7] = val.s[7]; - } + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } } POST_ALIGN(32); -#else /* __ZMM64BIT__ */ + PRE_ALIGN(32) struct __vec8_f { - __m512 v; - FORCEINLINE operator __m512() const { return v; } - FORCEINLINE __vec8_f() : v(_mm512_undefined_ps()) { } - FORCEINLINE __vec8_f(const __m512 &in) : v(in) {} - FORCEINLINE __vec8_f(const __vec8_f &o) : v(o.v) {} - FORCEINLINE __vec8_f& operator =(const __vec8_f &o) { v=o.v; return *this; } - FORCEINLINE __vec8_f(float v00, float v01, float v02, float v03, - float v04, float v05, float v06, float v07) : - v ( _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, v07, v06, v05, v04, v03, v02, v01, v00) ) {} - FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } - FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } -} POST_ALIGN(32); +#ifdef __ZMM64BIT__ + __m512 _data; + FORCEINLINE __vec8_f(const __m512 &in) : _data(in) {} + FORCEINLINE operator __m512() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef float _v8sf __attribute__((vector_size(32))); + _v8sf _data; + FORCEINLINE __vec8_f(const __m512 &in) + { + _mm512_mask_extpackstorelo_ps((__m512*)&_data, 0xFF, in, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512() const + { + return _mm512_extloadunpacklo_ps(_mm512_setzero_ps(), (uint8_t*)&_data, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + } #endif /* __ZMM64BIT__ */ + FORCEINLINE __vec8_f() { } + FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + const __m512 v = _mm512_set_16to16_ps(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_f(v); + } + + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(32); struct PRE_ALIGN(64) __vec8_d { @@ -438,8 +415,8 @@ INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint64_t __movmsk(__vec8_i1 mask) { - return (uint64_t)mask.v; +static FORCEINLINE __vec8_i1 __movmsk(__vec8_i1 mask) { + return mask.v; } static FORCEINLINE bool __any(__vec8_i1 mask) { @@ -455,52 +432,36 @@ static FORCEINLINE bool __none(__vec8_i1 mask) { } static FORCEINLINE __vec8_i1 
__equal_i1(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = (a.v & b.v) | (~a.v & ~b.v); - return r; + return (a.v & b.v) | (~a.v & ~b.v); } static FORCEINLINE __vec8_i1 __and(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v & b.v; - return r; + return a.v & b.v; } static FORCEINLINE __vec8_i1 __xor(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v ^ b.v; - return r; + return a.v ^ b.v; } static FORCEINLINE __vec8_i1 __or(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v | b.v; - return r; + return a.v | b.v; } static FORCEINLINE __vec8_i1 __not(__vec8_i1 v) { - __vec8_i1 r; - r.v = ~v.v; - return r; + return ~v; } static FORCEINLINE __vec8_i1 __and_not1(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = ~a.v & b.v; - return r; + return ~a.v & b.v; } static FORCEINLINE __vec8_i1 __and_not2(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v & ~b.v; - return r; + return a.v & ~b.v; } static FORCEINLINE __vec8_i1 __select(__vec8_i1 mask, __vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = (a.v & mask.v) | (b.v & ~mask.v); - return r; + return (a.v & mask.v) | (b.v & ~mask.v); } static FORCEINLINE __vec8_i1 __select(bool cond, __vec8_i1 a, __vec8_i1 b) { From b2678b43388914e4eb94a9cd5845bfea16ae0e3e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Sep 2013 17:27:58 +0400 Subject: [PATCH 037/159] Typo fix is tests/double-consts.ispc --- tests/double-consts.ispc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc index 4096aa1c..5f9a66d5 100644 --- a/tests/double-consts.ispc +++ b/tests/double-consts.ispc @@ -13,7 +13,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { // All the constants should be equal and if it's evaluated as "float", // then sqrt will evaluate to +inf. - if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 + if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 && ((float)sqrt(d1)) < 2e20) { RET[programIndex] = a; } From 0ed89e93fa309796867c0e8729c16dac0c27bbb8 Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 16:34:06 +0300 Subject: [PATCH 038/159] added fails info --- examples/intrinsics/knc-i1x8unsafe_fast.h | 103 +++++++++++++--------- 1 file changed, 60 insertions(+), 43 deletions(-) diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h index 2e00a567..05be27bd 100644 --- a/examples/intrinsics/knc-i1x8unsafe_fast.h +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -1,61 +1,78 @@ #define __ZMM64BIT__ #include "knc-i1x8.h" -/* the following tests fails because vec8_i32 and vec8_float are 512 and not 256 bit in size. - * not sure how it is possible to fix this, any suggestions? +/* the following tests fails because on KNC native vec8_i32 and vec8_float are 512 and not 256 bit in size. 
+ *
+ * Using test compiler: Intel(r) SPMD Program Compiler (ispc), 1.4.5dev (build commit d68dbbc7bce74803 @ 20130919, LLVM 3.3)
+ * Using C/C++ compiler: icpc (ICC) 14.0.0 20130728
+ *
+ */
+
+/* knc-i1x8unsafe_fast.h fails:
+ * ----------------------------
+1 / 1206 tests FAILED compilation:
+    ./tests/ptr-assign-lhs-math-1.ispc
33 / 1206 tests FAILED execution:
+    ./tests/array-gather-simple.ispc
+    ./tests/array-gather-vary.ispc
+    ./tests/array-multidim-gather-scatter.ispc
+    ./tests/array-scatter-vary.ispc
+    ./tests/atomics-5.ispc
+    ./tests/atomics-swap.ispc
+    ./tests/cfor-array-gather-vary.ispc
+    ./tests/cfor-gs-improve-varying-1.ispc
+    ./tests/cfor-struct-gather-2.ispc
+    ./tests/cfor-struct-gather-3.ispc
+    ./tests/cfor-struct-gather.ispc
+    ./tests/gather-struct-vector.ispc
+    ./tests/global-array-4.ispc
+    ./tests/gs-improve-varying-1.ispc
+    ./tests/half-1.ispc
+    ./tests/half-3.ispc
+    ./tests/half.ispc
+    ./tests/launch-3.ispc
+    ./tests/launch-4.ispc
+    ./tests/masked-scatter-vector.ispc
+    ./tests/masked-struct-scatter-varying.ispc
+    ./tests/new-delete-6.ispc
+    ./tests/ptr-24.ispc
+    ./tests/ptr-25.ispc
+    ./tests/short-vec-15.ispc
+    ./tests/struct-gather-2.ispc
+    ./tests/struct-gather-3.ispc
+    ./tests/struct-gather.ispc
+    ./tests/struct-ref-lvalue.ispc
+    ./tests/struct-test-118.ispc
+    ./tests/struct-vary-index-expr.ispc
+    ./tests/typedef-2.ispc
+    ./tests/vector-varying-scatter.ispc
 */

+/* knc-i1x8.h fails:
+ * ----------------------------
+1 / 1206 tests FAILED compilation:
+    ./tests/ptr-assign-lhs-math-1.ispc
3 / 1206 tests FAILED execution:
+    ./tests/half-1.ispc
+    ./tests/half-3.ispc
+    ./tests/half.ispc
 */

+/* knc-i1x16.h fails:
+ * ----------------------------
+1 / 1206 tests FAILED compilation:
+    ./tests/ptr-assign-lhs-math-1.ispc
+4 / 1206 tests FAILED execution:
     ./tests/half-1.ispc
     ./tests/half-3.ispc
     ./tests/half.ispc
+    ./tests/test-141.ispc
 */

+/* generic-16.h fails: (from which knc-i1x8.h & knc-i1x16.h are derived)
+ * ----------------------------
+1 / 1206 tests FAILED compilation:
+    ./tests/ptr-assign-lhs-math-1.ispc
6 / 1206 tests FAILED execution:
     ./tests/func-overload-max.ispc
     ./tests/half-1.ispc
From 491c58aef374a1de7987ba8d5919a641a65cb853 Mon Sep 17 00:00:00 2001
From: Ilia
Filippov
Date: Thu, 19 Sep 2013 17:47:10 +0400
Subject: [PATCH 039/159] change head to trunk

---
 alloy.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/alloy.py b/alloy.py
index 06025324..119874b8 100755
--- a/alloy.py
+++ b/alloy.py
@@ -81,7 +81,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v
     llvm_home = os.environ["LLVM_HOME"]
     os.chdir(llvm_home)
     FOLDER_NAME=version_LLVM
-    if version_LLVM == "head":
+    if version_LLVM == "trunk":
         SVN_PATH="trunk"
     if version_LLVM == "3.3":
         SVN_PATH="tags/RELEASE_33/final"
@@ -334,7 +334,7 @@ def validation_run(only, only_targets, reference_branch, notify, update):
             archs.append("x86-64")
         if "native" in only:
             sde_targets_t = []
-        for i in ["3.1", "3.2", "3.3", "head"]:
+        for i in ["3.1", "3.2", "3.3", "trunk"]:
             if i in only:
                 LLVM.append(i)
         if "current" in only:
@@ -367,7 +367,7 @@ def validation_run(only, only_targets, reference_branch, notify, update):
     if len(archs) == 0:
         archs = ["x86", "x86-64"]
     if len(LLVM) == 0:
-        LLVM = ["3.3", "head"]
+        LLVM = ["3.3", "trunk"]
     gen_archs = ["x86-64"]
     need_LLVM = check_LLVM(LLVM)
     for i in range(0,len(need_LLVM)):
@@ -562,9 +562,9 @@ parser = OptionParser()
 parser.add_option('-b', '--build-llvm', dest='build_llvm',
     help='ask to build LLVM', default=False, action="store_true")
 parser.add_option('--version', dest='version',
-    help='version of llvm to build', default="head")
+    help='version of llvm to build: 3.1 3.2 3.3 trunk', default="trunk")
 parser.add_option('--revision', dest='revision',
-    help='revision of llvm to build', default="")
+    help='revision of llvm to build in format r172870', default="")
 parser.add_option('--debug', dest='debug',
     help='debug build of LLVM?', default=False, action="store_true")
 parser.add_option('--folder', dest='folder',
@@ -592,7 +592,7 @@ parser.add_option('--notify', dest='notify',
 parser.add_option('--only', dest='only',
     help='set types of tests. Possible values:\n' +
     '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' +
-    'build (only build with different LLVM), 3.1, 3.2, 3.3, head, native (do not use SDE), current (do not rebuild ISPC).\n' +
+    'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).\n' +
     'Example: --only="3.2 -O0 stability 3.3"', default="")
 parser.add_option('--update-errors', dest='update',
     help='rewrite fail_db.txt file according to received results (F or FP)', default="")
From 5cabf0bef06af579571046cae63dcd82768c1220 Mon Sep 17 00:00:00 2001
From: evghenii
Date: Fri, 20 Sep 2013 14:13:40 +0300
Subject: [PATCH 040/159] adding int64 support from knc.h, phase 1.
bugs: __lshr & __ashr fail idiv.ispc test, __equal_i64 & __equal_i64_and_mask fails reduce_equal_8.ispc test --- examples/intrinsics/knc-i1x16.h | 290 ++++++++++++++++++++++++++++---- 1 file changed, 259 insertions(+), 31 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index c535e61a..628a38b8 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -208,7 +208,7 @@ struct PRE_ALIGN(128) __vec16_d } POST_ALIGN(128); #endif /* evghenii::d */ -#if 1 /* evghenii::i64 */ +#if 0 /* evghenii::i64 */ PRE_ALIGN(128) struct __vec16_i64 : public vec16 { __vec16_i64() { } __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, @@ -219,34 +219,66 @@ PRE_ALIGN(128) struct __vec16_i64 : public vec16 { v8, v9, v10, v11, v12, v13, v14, v15) { } } POST_ALIGN(128); #else /* evghenii::i64 */ -struct PRE_ALIGN(64) __vec16_i64 { - FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} - FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} - FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} - FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return *this; } - FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, - int64_t v04, int64_t v05, int64_t v06, int64_t v07, - int64_t v08, int64_t v09, int64_t v10, int64_t v11, - int64_t v12, int64_t v13, int64_t v14, int64_t v15) { - __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); - __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - } +struct PRE_ALIGN(128) __vec16_i64 +{ + union { + __m512i v1; __m512i v_hi; + }; + union + { + __m512i v2; __m512i v_lo; -} POST_ALIGN(64); + }; + FORCEINLINE __vec16_i64() : v1(_mm512_undefined_epi32()), v2(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __m512i _v1, const __m512i _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + v2 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + v1 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const int64_t& operator[](const int i) const { return ((int64_t*)this)[i]; } + FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; } + FORCEINLINE __vec16_i64 cvt2hilo() const + { + __m512i _hi, _lo; + _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + _lo = 
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + return __vec16_i64(_hi, _lo); + } + FORCEINLINE __vec16_i64 cvt2zmm() const + { + __m512i _v1, _v2; + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_hi); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_lo); + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_hi); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_lo); + return __vec16_i64(_v1, _v2); + } +} POST_ALIGN(128); #endif /* evghenii::i64 */ PRE_ALIGN(16) struct __vec16_i8 : public vec16 { @@ -959,30 +991,162 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { /////////////////////////////////////////////////////////////////////////// // int64 +// evghenii::int64 +#if 0 BINARY_OP(__vec16_i64, __add, +) BINARY_OP(__vec16_i64, __sub, -) BINARY_OP(__vec16_i64, __mul, *) +#else +static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_add_epi64(a.v1, b.v1), _mm512_add_epi64(a.v2,b.v2)); +} +static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) { +// return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i64 ret; + __mmask16 borrow = 0; + ret.v_lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); + ret.v_hi = _mm512_sbb_epi32 (a.v_hi, borrow, b.v_hi, &borrow); + return ret.cvt2zmm(); +} + +static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b) +{ + const __vec16_i64 b = _b.cvt2hilo(); + return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), + _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), + _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); +} + +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); +} +#endif + +#if 0 BINARY_OP(__vec16_i64, __or, |) BINARY_OP(__vec16_i64, __and, &) BINARY_OP(__vec16_i64, __xor, ^) BINARY_OP(__vec16_i64, __shl, <<) +#else +static FORCEINLINE __vec16_i64 __or(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_or_epi64(a.v1, b.v1), _mm512_or_epi64(a.v2, b.v2)); +} +static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); +} + +static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); +} + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); + __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +#endif + +#if 0 BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) +#else +static FORCEINLINE 
__vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); +} +static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); +} +#endif +#if 0 BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) +#else +static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); +} +static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); +} +#endif + +#if 1 BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) +#else /* evghenii::fails idiv.ispc */ +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); +#if 0 + __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); +#else + __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); +#endif + __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(hi,lo).cvt2zmm(); +} + +#endif + +#if 1 BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) +#else /* evghenii::fails idiv.ispc */ +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(hi,lo).cvt2zmm(); +} +#endif SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) +#if 1 CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... 
*/ +static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i64 &_b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); +} +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __vec16_i64 &_b, + __vec16_i1 mask) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); + return _mm512_kand(full_match, (__mmask16)mask); +} + +static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { + return __not(__equal_i64(a,b)); +} +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, + __vec16_i1 mask) { + return __and(__not(__equal_i64(a,b)), mask); +} +#endif + + CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) @@ -992,15 +1156,84 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) +#if 0 SELECT(__vec16_i64) +#else +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, + __vec16_i64 a, __vec16_i64 b) { + __vec16_i64 ret; + ret.v_hi = _mm512_mask_mov_epi64(b.v_hi, mask, a.v_hi); + ret.v_lo = _mm512_mask_mov_epi64(b.v_lo, mask >> 8, a.v_lo); + return ret; +} +#endif + INSERT_EXTRACT(__vec16_i64, int64_t) +#if 0 SMEAR(__vec16_i64, i64, int64_t) SETZERO(__vec16_i64, i64) UNDEF(__vec16_i64, i64) BROADCAST(__vec16_i64, i64, int64_t) +#else +template RetVecType __smear_i64(const int64_t &l); +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } + +template RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } + +template RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } + +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) { + int64_t val = __extract_element(v, index & 0xf); + return __smear_i64<__vec16_i64>(val); +} +#endif ROTATE(__vec16_i64, i64, int64_t) SHUFFLES(__vec16_i64, i64, int64_t) +#if 0 LOAD_STORE(__vec16_i64, int64_t) +#else +template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) +{ + __vec16_i32 v1; + __vec16_i32 v2; + v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return __vec16_i64(v2,v1); +} + +template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +{ + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); +} + +template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } + +template 
static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) +{ + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +} + +template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) +{ + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_store_epi64(p, v2); + _mm512_store_epi64(((uint8_t*)p)+64, v1); +} + +template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +#endif #if 0 /* evghenii::float */ @@ -1062,7 +1295,6 @@ static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { return _mm512_sub_ps(a, b); } -#if 1 /* evghenii::this two fails assert-3.ispc test */ static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { return _mm512_mul_ps(a, b); } @@ -1070,10 +1302,6 @@ static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a, b); } -#else -BINARY_OP(__vec16_f, __mul, *) -BINARY_OP(__vec16_f, __div, /) -#endif static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { From ddecdeb8349e1d3db7d6c4ef949c9fb86734609d Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 20 Sep 2013 14:55:15 +0300 Subject: [PATCH 041/159] move remaining int64 from knc.h some of fails to pass tests, grep for evghenii::fails to find out which functions fail and on what tests --- examples/intrinsics/knc-i1x16.h | 170 +++++++++++++++++++++++++++++--- 1 file changed, 157 insertions(+), 13 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 628a38b8..1f5a6056 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1120,7 +1120,6 @@ SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) #if 1 CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) #else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... */ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i64 &_b) { const __vec16_i64 a = _a.cvt2hilo(); @@ -1128,6 +1127,14 @@ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); } +static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { + return __not(__equal_i64(a,b)); +} +#endif + +#if 1 +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... 
*/ static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __vec16_i64 &_b, __vec16_i1 mask) { const __vec16_i64 a = _a.cvt2hilo(); @@ -1136,10 +1143,6 @@ static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); return _mm512_kand(full_match, (__mmask16)mask); } - -static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { - return __not(__equal_i64(a,b)); -} static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, __vec16_i1 mask) { return __and(__not(__equal_i64(a,b)), mask); @@ -1147,6 +1150,7 @@ static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, con #endif + CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) @@ -1843,7 +1847,14 @@ static FORCEINLINE TO FUNC(TO, FROM val) { \ } // sign extension conversions +#if 1 CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +#else /* evghenii::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... and others */ +static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); +} +#endif CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) @@ -1868,15 +1879,23 @@ CAST_SEXT_I1(__vec16_i32) #else static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) { - __vec16_i32 ret = _mm512_setzero_epi32(); - __vec16_i32 one = _mm512_set1_epi32(-1); - return _mm512_mask_mov_epi32(ret, val, one); + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); } #endif CAST_SEXT_I1(__vec16_i64) // zero extension +#if 0 CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +#else +static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_setzero_epi32(), val.v).cvt2zmm(); +} + +#endif CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) @@ -2714,8 +2733,34 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); return ret; } -#endif +#if 0 /* evghenii::fails on gather-int8-2 & gather-int8-4 */ +static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + __vec16_i1 still_to_do = mask; + __vec16_i32 tmp; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } + __vec16_i8 ret; + 
_mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#else GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +#endif +#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) @@ -2729,8 +2774,35 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32 base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); } -#endif +#if 0 /* evghenii::fails on gather-int32-2 & gather-int32-4 */ +static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_i32 ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +#else GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +#endif +#endif /****************/ #if 0 GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) @@ -2741,8 +2813,35 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32 base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); } -#endif +#if 0 /* evghenii::fails on gather-float-2 gather-float-4 & soa-14 */ +static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. 
+ // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_f ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, + _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +#else GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +#endif +#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) @@ -2824,6 +2923,7 @@ SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64 /*****************/ #if 0 SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) #else static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { @@ -2831,8 +2931,28 @@ static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, _MM_DOWNCONV_EPI32_NONE, scale, _MM_HINT_NONE); } +static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} #endif -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) /*****************/ #if 0 SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) @@ -2844,8 +2964,32 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal _MM_DOWNCONV_PS_NONE, scale, _MM_HINT_NONE); } -#endif +#if 0 /* evghenii::fails on soa-10 & soa-13 , it is very similar to __scatter_base_offsets64_it32, but that passes tests, why ?!? 
*/ +static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} +#else SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +#endif +#endif /*****************/ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) From 87cecddabb69f0a5794c6d6c325c8ccd329165c9 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 20 Sep 2013 18:57:20 +0400 Subject: [PATCH 042/159] adding sort to performance checking --- examples/sort/sort.cpp | 16 +++++++++------- perf.ini | 8 ++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 4f402c75..f5e4264a 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -86,7 +86,8 @@ int main (int argc, char *argv[]) tISPC1 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1); @@ -103,10 +104,11 @@ int main (int argc, char *argv[]) tISPC2 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } - - printf("[sort ispc+tasks]:\t[%.3f] million cycles\n", tISPC2); + + printf("[sort ispc + tasks]:\t[%.3f] million cycles\n", tISPC2); srand (0); @@ -120,13 +122,13 @@ int main (int argc, char *argv[]) tSerial += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial); - printf("\t\t\t\t(%.2fx speedup from ISPC serial)\n", tSerial/tISPC1); - printf("\t\t\t\t(%.2fx speedup from ISPC with tasks)\n", tSerial/tISPC2); + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2); delete code; delete order; diff --git a/perf.ini b/perf.ini index d8c7fe71..249c25f4 100755 --- a/perf.ini +++ b/perf.ini @@ -51,7 +51,7 @@ Volume Rendering volume_rendering camera.dat density_highres.vol #*** -%Sort -%sort -% -%#*** +Sort +sort +1000000 1 +#*** From 9e0e9dbecc484fdbc6fd16a3fca283df71572f65 Mon Sep 17 00:00:00 2001 From: Preston Gurd Date: Fri, 20 Sep 2013 14:42:46 -0400 Subject: [PATCH 043/159] - Add Silvermont (--cpu=slm) option for llvm 3.4+. - Change default Sandybridge isa name to avx1-i32x8 from avx-i32x8, to conform with replacement of avx-i32x8 by avx1-i32x8 everywhere else. - Add "target-cpu" attribute, when using AttrBuilder, to correct a problem whereby llvm would switch from the command line cpu setting to the native (auto-detected) cpu setting on second and subsequent functions. e.g. if I wanted to build for Silvermont on a Sandy Bridge machine, ispc/llvm would correctly use Silvermont and turn on the Silvermont scheduler. 
For the second and subsequent functions, it would auto-detect Sandy Bridge, but still run the Silvermont scheduler. --- ispc.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index 82f0518b..ea7bfcd7 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -126,7 +126,7 @@ lGetSystemISA() { return "avx1.1-i32x8"; } // Regular AVX - return "avx-i32x8"; + return "avx1-i32x8"; } else if ((info[2] & (1 << 19)) != 0) return "sse4-i32x4"; @@ -149,8 +149,11 @@ static const char *supportedCPUs[] = { #endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2" + , "core-avx-i", "core-avx2", "slm" #endif // LLVM 3.2+ +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) + , "slm" +#endif // LLVM 3.4+ }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : @@ -196,9 +199,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx-i32x8"; + isa = "avx1-i32x8"; else if (!strcmp(cpu, "corei7") || - !strcmp(cpu, "penryn")) + !strcmp(cpu, "penryn") || + !strcmp(cpu, "slm")) isa = "sse4-i32x4"; else isa = "sse2-i32x4"; @@ -660,6 +664,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; + attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( llvm::AttributeSet::get( From 4b26b8b4309ffb3295db16815620d2ab751c61c7 Mon Sep 17 00:00:00 2001 From: Preston Gurd Date: Fri, 20 Sep 2013 16:44:01 -0400 Subject: [PATCH 044/159] Remove redundant "slm". --- ispc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ispc.cpp b/ispc.cpp index ea7bfcd7..bec7baf7 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -149,7 +149,7 @@ static const char *supportedCPUs[] = { #endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2", "slm" + , "core-avx-i", "core-avx2" #endif // LLVM 3.2+ #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) , "slm" From 019043f55ee13865fe6f672fcce544028ff63e2f Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 23 Sep 2013 09:55:55 +0300 Subject: [PATCH 045/159] patched half2float & float2half to pass the tests. Now only test-141 fails, but it seems to be a problem with the test itself rather than with knc-i1x16.h.
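As an illustration of the new conversion path (a sketch only, not part of the change itself; Float16Compressor is the class added below): uint16_t h = Float16Compressor::compress(0.333f); float y = Float16Compressor::decompress(h); // y matches 0.333f to within half-precision rounding, and any finite half value should survive the decompress/compress round trip unchanged.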
--- examples/intrinsics/knc-i1x16.h | 138 +++++++++++++++++++++++++++----- 1 file changed, 117 insertions(+), 21 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 1f5a6056..2ee6d2f5 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1478,23 +1478,101 @@ static FORCEINLINE float __floatbits(int v) { return u.f; } +/* source : + * http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion */ +class Float16Compressor +{ + union Bits + { + float f; + int32_t si; + uint32_t ui; + }; + + static int const shift = 13; + static int const shiftSign = 16; + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32 + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + public: + + static uint16_t compress(float value) + { + Bits v, s; + v.f = value; + uint32_t sign = v.si & signN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + return v.ui | sign; + } + + static float decompress(uint16_t value) + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } +}; + static FORCEINLINE float __half_to_float_uniform(int16_t h) { - static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift +#if 0 + static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift - int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits - uint32_t exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // exponent adjust - // handle exponent special cases - if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust - else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize - } + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN?
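+ // (the (127 - 15) << 23 above rebiases the exponent from half bias 15 to float bias 127; Inf/NaN must saturate the float exponent field to 255, hence the extra (128 - 16) << 23 below: 31 + 112 + 112 = 255)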
+ o += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) { // Zero/Denormal? + o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } - o |= ((int32_t)(h & 0x8000)) << 16; // sign bit - return __floatbits(o); + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +#else + return Float16Compressor::decompress(h); +#endif } @@ -1507,6 +1585,7 @@ static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { static FORCEINLINE int16_t __float_to_half_uniform(float f) { +#if 0 uint32_t sign_mask = 0x80000000u; int32_t o; @@ -1531,6 +1610,9 @@ static FORCEINLINE int16_t __float_to_half_uniform(float f) { o = fint2 >> 13; // Take the bits! return (o | (sign >> 16)); +#else + return Float16Compressor::compress(f); +#endif } @@ -2075,9 +2157,8 @@ CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) // float/double conversions -#if 1 +#if 0 CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) -CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) #else static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); @@ -2085,11 +2166,16 @@ static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { return _mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA); } +#endif + +#if 0 +CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +#else static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { __vec16_d ret; - ret.v2 = _mm512_cvtpslo_pd(val.v); + ret.v1 = _mm512_cvtpslo_pd(val.v); __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); - ret.v1 = _mm512_cvtpslo_pd(other8); + ret.v2 = _mm512_cvtpslo_pd(other8); return ret; } #endif @@ -2325,14 +2411,24 @@ static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __ve // svml /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } -static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_sinf(__vec16_f v) { return _mm512_sin_ps(v); } +static FORCEINLINE __vec16_f __svml_asinf(__vec16_f v) { return _mm512_asin_ps(v); } static FORCEINLINE __vec16_f __svml_cosf(__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_tanf(__vec16_f v) { return _mm512_tan_ps(v); } +static FORCEINLINE __vec16_f __svml_atanf(__vec16_f v) { return _mm512_atan_ps(v); } +static FORCEINLINE __vec16_f __svml_atan2f(__vec16_f a, __vec16_f b) { return _mm512_atan2_ps(a,b); } +static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } static FORCEINLINE __vec16_f __svml_powf(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } -static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_sind(__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_asind(__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } static FORCEINLINE __vec16_d __svml_cosd(__vec16_d v) { return 
__vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_tand(__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atand(__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atan2d(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_atan2_pd(a.v1,b.v1), _mm512_atan2_pd(a.v2,b.v2)); } +static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } static FORCEINLINE __vec16_d __svml_powd(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } /////////////////////////////////////////////////////////////////////////// From 5a9b3b3abb592d19fbe298467bcb631b25c8bd76 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 20 Sep 2013 19:03:58 +0400 Subject: [PATCH 046/159] adding patch for LLVM 3.3 which increases performance after regression --- .../3_3_r172868-vmovups-vinsertf128.patch | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 llvm_patches/3_3_r172868-vmovups-vinsertf128.patch diff --git a/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch new file mode 100644 index 00000000..36bb5572 --- /dev/null +++ b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch @@ -0,0 +1,102 @@ +This patch needs to be applied to LLVM 3.3 to fix a performance regression introduced by revision r172868. +The regression is due to increased register pressure after that revision, which causes spills in the case of multiple loads. +The regression is fixed in 3.4, but the 3.4 changes are not back-portable, +so we roll back r172868 to avoid the regression with 3.3. + +Index: test/CodeGen/X86/sandybridge-loads.ll +=================================================================== +--- test/CodeGen/X86/sandybridge-loads.ll (revision 191082) ++++ test/CodeGen/X86/sandybridge-loads.ll (working copy) +@@ -1,24 +1,5 @@ + ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s + +-;CHECK: wideloads +-;CHECK: vmovaps +-;CHECK: vinsertf128 +-;CHECK: vmovaps +-;CHECK-NOT: vinsertf128 +-;CHECK: ret +- +-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +- %v0 = load <8 x float>* %a, align 16 ; <---- unaligned! +- %v1 = load <8 x float>* %b, align 32 ; <---- aligned! +- %m0 = fcmp olt <8 x float> %v1, %v0 +- %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
+- %m1 = fcmp olt <8 x float> %v2, %v0 +- %mand = and <8 x i1> %m1, %m0 +- %r = zext <8 x i1> %mand to <8 x i32> +- store <8 x i32> %r, <8 x i32>* undef, align 32 +- ret void +-} +- + ; CHECK: widestores + ; loads: + ; CHECK: vmovaps +Index: test/CodeGen/X86/v8i1-masks.ll +=================================================================== +--- test/CodeGen/X86/v8i1-masks.ll (revision 172868) ++++ test/CodeGen/X86/v8i1-masks.ll (revision 172866) +@@ -1,7 +1,7 @@ + ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s + + ;CHECK: and_masks +-;CHECK: vmovaps ++;CHECK: vmovups + ;CHECK: vcmpltp + ;CHECK: vcmpltp + ;CHECK: vandps +Index: lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- lib/Target/X86/X86ISelLowering.cpp (revision 191077) ++++ lib/Target/X86/X86ISelLowering.cpp (working copy) +@@ -16756,42 +16756,9 @@ + EVT MemVT = Ld->getMemoryVT(); + DebugLoc dl = Ld->getDebugLoc(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +- unsigned RegSz = RegVT.getSizeInBits(); + +- // On Sandybridge unaligned 256bit loads are inefficient. + ISD::LoadExtType Ext = Ld->getExtensionType(); +- unsigned Alignment = Ld->getAlignment(); +- bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; +- if (RegVT.is256BitVector() && !Subtarget->hasInt256() && +- !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { +- unsigned NumElems = RegVT.getVectorNumElements(); +- if (NumElems < 2) +- return SDValue(); + +- SDValue Ptr = Ld->getBasePtr(); +- SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); +- +- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), +- NumElems/2); +- SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- Alignment); +- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); +- SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- std::min(16U, Alignment)); +- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, +- Load1.getValue(1), +- Load2.getValue(1)); +- +- SDValue NewVec = DAG.getUNDEF(RegVT); +- NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); +- NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); +- return DCI.CombineTo(N, NewVec, TF, true); +- } +- + // If this is a vector EXT Load then attempt to optimize it using a + // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the + // expansion is still better than scalar code. 
+@@ -16805,6 +16772,7 @@ + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); ++ unsigned RegSz = RegVT.getSizeInBits(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + From af5da885a56b32798f4c6dc94ccbbe60bc40b28e Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 20 Sep 2013 17:28:07 +0400 Subject: [PATCH 047/159] small corrections of test system --- .gitignore | 6 ++ alloy.py | 223 ++++++++++++++++++++++++++++++--------------------- check_env.py | 16 ++-- common.py | 23 +++--- perf.py | 12 +-- run_tests.py | 6 +- 6 files changed, 169 insertions(+), 117 deletions(-) diff --git a/.gitignore b/.gitignore index 88fb0197..429199bb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,14 +3,20 @@ depend ispc ispc_test +ispc_ref objs docs/doxygen docs/*.html tests*/*cpp tests*/*run +logs/ +notify_log.log +alloy_results_* examples/*/*.png examples/*/*.ppm examples/*/objs/* +examples/*/ref +examples/*/test *.swp diff --git a/alloy.py b/alloy.py index 119874b8..31399a37 100755 --- a/alloy.py +++ b/alloy.py @@ -101,8 +101,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v LLVM_BUILD="build-" + folder LLVM_BIN="bin-" + folder if os.path.exists(LLVM_BIN) and not force: - print_debug("You have folder " + LLVM_BIN + ". If you want to rebuild use --force\n", False, "") - exit(0) + error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1) LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" LLVM_BIN_selfbuild = LLVM_BIN + "_temp" common.remove_if_exists(LLVM_SRC) @@ -188,26 +187,45 @@ def check_targets(): AVX = False; AVX11 = False; AVX2 = False; - cpu = open("/proc/cpuinfo") - f_lines = cpu.readlines() - cpu.close() - # check what native targets do we have - for i in range(0,len(f_lines)): - if SSE2 == False and "sse2" in f_lines[i]: + if current_OS == "Linux": + cpu = open("/proc/cpuinfo") + f_lines = cpu.readlines() + cpu.close() + # check what native targets do we have + for i in range(0,len(f_lines)): + if SSE2 == False and "sse2" in f_lines[i]: + SSE2 = True; + answer = answer + ["sse2-i32x4", "sse2-i32x8"] + if SSE4 == False and "sse4_1" in f_lines[i]: + SSE4 = True; + answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + if AVX == False and "avx" in f_lines[i]: + AVX = True; + answer = answer + ["avx1-i32x8", "avx1-i32x16"] + if AVX11 == False and "rdrand" in f_lines[i]: + AVX11 = True; + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + if AVX2 == False and "avx2" in f_lines[i]: + AVX2 = True; + answer = answer + ["avx2-i32x8", "avx2-i32x16"] + if current_OS == "MacOS": + f_lines = take_lines("sysctl machdep.cpu.features", "first") + if "SSE2" in f_lines: SSE2 = True; answer = answer + ["sse2-i32x4", "sse2-i32x8"] - if SSE4 == False and "sse4_1" in f_lines[i]: + if "SSE4.1" in f_lines: SSE4 = True; answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] - if AVX == False and "avx" in f_lines[i]: + if "AVX1.0" in f_lines: AVX = True; answer = answer + ["avx1-i32x8", "avx1-i32x16"] - if AVX11 == False and "rdrand" in f_lines[i]: + if "RDRAND" in f_lines: AVX11 = True; answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] - if AVX2 == False and "avx2" in f_lines[i]: + if "AVX2.0" in f_lines: AVX2 = True; answer = answer + ["avx2-i32x8", "avx2-i32x16"] + answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] # now check what 
targets we have with the help of SDE sde_exists = "" @@ -224,17 +242,14 @@ def check_targets(): "Please refer to http://www.intel.com/software/sde for SDE download information.", 2) return [answer, answer_sde] # here we have SDE - os.system(sde_exists + " -help > " + temp_alloy_file) - cpu = open(temp_alloy_file) - f_lines = cpu.readlines() - cpu.close() + f_lines = take_lines(sde_exists + " -help", "all") for i in range(0,len(f_lines)): if SSE4 == False and "wsm" in f_lines[i]: answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] if AVX == False and "snb" in f_lines[i]: answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"]] if AVX11 == False and "ivb" in f_lines[i]: - answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["ivb", "avx1.1-i32x16"]] + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] if AVX2 == False and "hsw" in f_lines[i]: answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] return [answer, answer_sde] @@ -271,14 +286,11 @@ def execute_stability(stability, R, print_version): def run_special_tests(): i = 5 -def validation_run(only, only_targets, reference_branch, notify, update): - current_path = os.getcwd() +def validation_run(only, only_targets, reference_branch, number, notify, update): os.chdir(os.environ["ISPC_HOME"]) os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": - if os.environ.get("SMTP_ISPC") == None: - error("you have no SMTP_ISPC in your environment for option notify", 1) - common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt") + common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "notify_log.log") smtp_server = os.environ["SMTP_ISPC"] msg = MIMEMultipart() msg['Subject'] = 'ISPC test system results' @@ -437,7 +449,7 @@ def validation_run(only, only_targets, reference_branch, notify, update): print_debug("\n\nPerformance validation run\n\n", False, "") performance = options_for_drivers() # performance constant options - performance.number = 5 + performance.number = number performance.config = "./perf.ini" performance.path = "./" performance.silent = True @@ -450,16 +462,13 @@ def validation_run(only, only_targets, reference_branch, notify, update): if len(need_LLVM) != 0: build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) # prepare reference point. build both test and reference compilers - os.system("git branch > " + temp_alloy_file) - br = open(temp_alloy_file) - temp4 = br.readlines() - br.close() + temp4 = take_lines("git branch", "all") for line in temp4: if "*" in line: current_branch = line[2:-1] stashing = True sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n") - if "No local changes" in detect_version("git stash"): + if "No local changes" in take_lines("git stash", "first"): stashing = False #try_do_LLVM("stash current branch ", "git stash", True) try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) @@ -478,11 +487,9 @@ def validation_run(only, only_targets, reference_branch, notify, update): attach_mail_file(msg, performance.in_file, "performance.log") attach_mail_file(msg, "." 
+ os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log") - print_debug("Logs are in alloy_results_[date]", False, "") - # sending e-mail with results if options.notify != "": - fp = open(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", 'rb') + fp = open(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", 'rb') f_lines = fp.readlines() fp.close() line = "" @@ -495,46 +502,56 @@ def validation_run(only, only_targets, reference_branch, notify, update): s = smtplib.SMTP(smtp_server) s.sendmail('ISPC_test_system', options.notify, msg.as_string()) s.quit() -# exit of validation routine - common.remove_if_exists(temp_alloy_file) - os.chdir(current_path) def Main(): + global current_OS if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: + current_OS = "Windows" error("Windows isn't supported now", 1) - if (options.build_llvm == False and - options.validation_run == False and - options.llvm_home == "" and - options.ispc_home == "" and - options.sde_home == ""): + else: + if (platform.system() == 'Darwin'): + current_OS = "MacOS" + else: + current_OS = "Linux" + + if (options.build_llvm == False and options.validation_run == False): parser.print_help() exit(0) - global f_date - f_date = "logs" - common.remove_if_exists(f_date) - os.makedirs(f_date) - global temp_alloy_file - temp_alloy_file = os.getcwd() + os.sep + f_date + os.sep + "temp_detect_version" - global alloy_build - alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" - common.remove_if_exists(alloy_build) - global stability_log - stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" - common.remove_if_exists(stability_log) + setting_paths(options.llvm_home, options.ispc_home, options.sde_home) if os.environ.get("LLVM_HOME") == None: error("you have no LLVM_HOME", 1) if os.environ.get("ISPC_HOME") == None: error("you have no ISPC_HOME", 1) - if options.build_llvm: - build_LLVM(options.version, options.revision, options.folder, options.tarball, + if options.notify != "": + if os.environ.get("SMTP_ISPC") == None: + error("you have no SMTP_ISPC in your environment for option notify", 1) + + global f_date + f_date = "logs" + common.remove_if_exists(f_date) + os.makedirs(f_date) + global alloy_build + alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" + global stability_log + stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" + current_path = os.getcwd() + try: + if options.build_llvm: + build_LLVM(options.version, options.revision, options.folder, options.tarball, options.debug, options.selfbuild, False, options.force) - if options.validation_run: - validation_run(options.only, options.only_targets, options.branch, options.notify, options.update) - os.rename(f_date, "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y')) + if options.validation_run: + validation_run(options.only, options.only_targets, options.branch, + options.number_for_performance, options.notify, options.update) + finally: + os.chdir(current_path) + date_name = "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y') + os.rename(f_date, date_name) + print_debug("Logs are in " + date_name + "\n", False, "") ###Main### from optparse import OptionParser +from optparse import OptionGroup import sys import os import operator @@ -554,47 +571,73 @@ import run_tests import perf import common error = common.error -detect_version = common.detect_version +take_lines = common.take_lines print_debug = common.print_debug # parsing 
options -parser = OptionParser() -# options for activity "build LLVM" +class MyParser(OptionParser): + def format_epilog(self, formatter): + return self.epilog +examples = ("Examples:\n" + +"Load and build LLVM from trunk\n\talloy.py -b\n" + +"Load and build LLVM 3.3. Rewrite LLVM folders\n\talloy.py -b --version=3.3 --force\n" + +"Untar files llvm.tgz clang.tgz, build LLVM from them in folder bin-from_tar\n\talloy.py -b --tarball='llvm.tgz clang.tgz' --folder=from_tar\n" + +"Load LLVM from trunk, revision r172870. Build it. Do selfbuild\n\talloy.py -b --revision=r172870 --selfbuild\n" + +"Validation run with LLVM 3.3, trunk; x86, x86-64; -O2;\nall supported targets; performance\n\talloy.py -r\n" + +"Validation run with all avx targets and sse4-i8x16 without performance\n\talloy.py -r --only=stability --only-targets='avx sse4-i8x16'\n" + +"Validation run with avx2-i32x8, all sse4 and sse2 targets\nand all targets with i32x16\n\talloy.py -r --only-targets='avx2-i32x8 sse4 i32x16 sse2'\n" + +"Stability validation run with LLVM 3.2, 3.3; -O0; x86,\nupdate fail_db.txt with passes and fails\n\talloy.py -r --only='3.2 -O0 stability 3.3 x86' --update-errors=FP\n" + +"Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" + +"Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" + +"Validation run. Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") +parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") -parser.add_option('--version', dest='version', - help='version of llvm to build: 3.1 3.2 3.3 trunk', default="trunk") -parser.add_option('--revision', dest='revision', - help='revision of llvm to build in format r172870', default="") -parser.add_option('--debug', dest='debug', - help='debug build of LLVM?', default=False, action="store_true") -parser.add_option('--folder', dest='folder', - help='folder to build LLVM in', default="") -parser.add_option('--tarball', dest='tarball', - help='"llvm_tarball clang_tarball"', default="") -parser.add_option('--selfbuild', dest='selfbuild', - help='make selfbuild of LLVM and clang', default=False, action="store_true") -parser.add_option('--force', dest='force', - help='rebuild LLVM', default=False, action='store_true') -# options for activity "setup PATHS" -parser.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") -parser.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") -parser.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") -# options for activity "validation run" parser.add_option('-r', '--run', dest='validation_run', help='ask for validation run', default=False, action="store_true") -parser.add_option('--compare-with', dest='branch', - help='set performance reference point', default="master") -parser.add_option('--only-targets', dest='only_targets', - help='set list of targets to test. 
Possible values - all subnames of targets.\n' + - 'Example: --only-targets="avx2-i32x8 sse4 i32x16 sse2"', default="") -parser.add_option('--notify', dest='notify', - help='sent results to email', default="") -parser.add_option('--only', dest='only', +# options for activity "build LLVM" +llvm_group = OptionGroup(parser, "Options for building LLVM", + "These options must be used with -b option.") +llvm_group.add_option('--version', dest='version', + help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk") +llvm_group.add_option('--revision', dest='revision', + help='revision of llvm to build in format r172870', default="") +llvm_group.add_option('--debug', dest='debug', + help='debug build of LLVM?', default=False, action="store_true") +llvm_group.add_option('--folder', dest='folder', + help='folder to build LLVM in', default="") +llvm_group.add_option('--tarball', dest='tarball', + help='"llvm_tarball clang_tarball"', default="") +llvm_group.add_option('--selfbuild', dest='selfbuild', + help='make selfbuild of LLVM and clang', default=False, action="store_true") +llvm_group.add_option('--force', dest='force', + help='rebuild LLVM', default=False, action='store_true') +parser.add_option_group(llvm_group) +# options for activity "validation run" +run_group = OptionGroup(parser, "Options for validation run", + "These options must be used with -r option.") +run_group.add_option('--compare-with', dest='branch', + help='set performance reference point. Default: master', default="master") +run_group.add_option('--number', dest='number_for_performance', + help='number of performance runs for each test. Default: 5', default=5) +run_group.add_option('--notify', dest='notify', + help='email to send results to', default="") +run_group.add_option('--update-errors', dest='update', + help='rewrite fail_db.txt file according to received results (F or FP)', default="") +run_group.add_option('--only-targets', dest='only_targets', + help='set list of targets to test. Possible values - all subnames of targets.', + default="") +run_group.add_option('--only', dest='only', help='set types of tests.
Possible values:\n' + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + - 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).\n' + - 'Example: --only="3.2 -O0 stability 3.3"', default="") -parser.add_option('--update-errors', dest='update', - help='rewrite fail_db.txt file according to received results (F or FP)', default="") + 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', + default="") +parser.add_option_group(run_group) +# options for activity "setup PATHS" +setup_group = OptionGroup(parser, "Options for setup", + "These options must be used with -r or -b to set up environment variables") +setup_group.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") +setup_group.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") +setup_group.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") +parser.add_option_group(setup_group) (options, args) = parser.parse_args() Main() diff --git a/check_env.py b/check_env.py index 98deb235..8c90d895 100755 --- a/check_env.py +++ b/check_env.py @@ -39,7 +39,7 @@ import os import string print_debug = common.print_debug error = common.error -detect_version = common.detect_version +take_lines = common.take_lines exists = [False, False, False, False, False, False, False, False] names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"] @@ -54,26 +54,26 @@ print_debug("=== in PATH: ===\n", False, "") print_debug("Tools:\n", False, "") for i in range(0,3): if exists[i]: - print_debug(detect_version(names[i] + " --version"), False, "") + print_debug(take_lines(names[i] + " --version", "first"), False, "") else: error("you don't have " + names[i], 0) if exists[0] and exists[1] and exists[2]: if common.check_tools(2): - print_debug("versions are ok\n", False, "") + print_debug("Tools' versions are ok\n", False, "") print_debug("\nSDE:\n", False, "") if exists[3]: - print_debug(detect_version(names[3] + " --version"), False, "") + print_debug(take_lines(names[3] + " --version", "first"), False, "") else: error("you don't have " + names[3], 2) print_debug("\nISPC:\n", False, "") if exists[4]: - print_debug(detect_version(names[4] + " --version"), False, "") + print_debug(take_lines(names[4] + " --version", "first"), False, "") else: error("you don't have " + names[4], 2) print_debug("\nC/C++ compilers:\n", False, "") for i in range(5,8): if exists[i]: - print_debug(detect_version(names[i] + " --version"), False, "") + print_debug(take_lines(names[i] + " --version", "first"), False,
"") + take_lines(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version", "first"), False, "") else: error("you don't have any SDE in your ISPC_HOME", 2) diff --git a/common.py b/common.py index dd8fb388..19d09e4d 100644 --- a/common.py +++ b/common.py @@ -50,21 +50,24 @@ def remove_if_exists(filename): os.remove(filename) # detect version which is printed after command -def detect_version(command): +def take_lines(command, which): os.system(command + " > " + "temp_detect_version") version = open("temp_detect_version") - answer = version.readline() + if which == "first": + answer = version.readline() + if which == "all": + answer = version.readlines() version.close() remove_if_exists("temp_detect_version") return answer # print versions of compilers def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows): - print_debug("\nUsing test compiler: " + detect_version(ispc_test + " --version"), s, perf_log) + print_debug("\nUsing test compiler: " + take_lines(ispc_test + " --version", "first"), s, perf_log) if ispc_ref != "": - print_debug("Using ref compiler: " + detect_version(ispc_ref + " --version"), s, perf_log) + print_debug("Using ref compiler: " + take_lines(ispc_ref + " --version", "first"), s, perf_log) if is_windows == False: - temp1 = detect_version(ref_compiler + " --version") + temp1 = take_lines(ref_compiler + " --version", "first") else: os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" ) version = open("temp_detect_version") @@ -80,7 +83,7 @@ def print_debug(line, silent, filename): sys.stdout.write(line) sys.stdout.flush() if os.environ.get("ISPC_HOME") != None: - write_to_file(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", line) + write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) if filename != "": write_to_file(filename, line) @@ -102,9 +105,9 @@ def check_tools(m): input_tools=[[[1,4],"m4 --version", "bad m4 version"], [[2,4],"bison --version", "bad bison version"], [[2,5], "flex --version", "bad flex version"]] - + ret = 1 for t in range(0,len(input_tools)): - t1 = ((detect_version(input_tools[t][1]))[:-1].split(" ")) + t1 = ((take_lines(input_tools[t][1], "first"))[:-1].split(" ")) for i in range(0,len(t1)): t11 = t1[i].split(".") f = True @@ -116,5 +119,5 @@ def check_tools(m): if j < len(input_tools[t][0]): if int(t11[j]) 1: print_debug(" <+", False, "") + if p1 > 1: + print_debug(" <-", False, "") print_debug("\n", False, "") print_debug("\n", False, "") @@ -261,11 +261,11 @@ def compare(A, B): p2 = 0 else: p2 = 100 - 100 * A[4][i]/B[4][i] - print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], p2), False, "") + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], abs(p2)), False, "") if p2 < -1: - print_debug(" <-", False, "") - if p2 > 1: print_debug(" <+", False, "") + if p2 > 1: + print_debug(" <-", False, "") print_debug("\n", False, "") if "performance.log" in options.in_file: print_debug("\n\n_________________Watch performance.log for details________________\n", False, "") diff --git a/run_tests.py b/run_tests.py index 2471b6cb..914f22a7 100755 --- a/run_tests.py +++ b/run_tests.py @@ -364,11 +364,11 @@ def file_check(compfails, runfails): else: opt = "-O2" # Detect LLVM version - temp1 = common.detect_version(ispc_exe + " --version") + temp1 = common.take_lines(ispc_exe + " --version", "first") llvm_version = temp1[-10:-2] -#Detect compiler version +# Detect compiler version if is_windows == False: - temp1 = 
common.detect_version(options.compiler_exe + " --version") + temp1 = common.take_lines(options.compiler_exe + " --version", "first") temp2 = temp1.split(" ") compiler_version = temp2[0] + temp2[2][0:4] else: From 1c858c34f795c1b2fb29d9c07ae5c448dab287a0 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 24 Sep 2013 17:37:39 +0400 Subject: [PATCH 048/159] correction of test system --- alloy.py | 53 +++++++++++++++++++++++++++++----------------- examples/common.mk | 2 +- perf.py | 10 ++++----- run_tests.py | 12 +++++++---- 4 files changed, 47 insertions(+), 30 deletions(-) diff --git a/alloy.py b/alloy.py index 31399a37..7ae972b4 100755 --- a/alloy.py +++ b/alloy.py @@ -70,7 +70,7 @@ def try_do_LLVM(text, command, from_validation): error("can't " + text, 1) print_debug("DONE.\n", from_validation, alloy_build) -def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force): +def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force, make): print_debug("Building LLVM. Version: " + version_LLVM + ". ", from_validation, alloy_build) if revision != "": print_debug("Revision: " + revision + ".\n", from_validation, alloy_build) @@ -100,7 +100,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v LLVM_SRC="llvm-" + folder LLVM_BUILD="build-" + folder LLVM_BIN="bin-" + folder - if os.path.exists(LLVM_BIN) and not force: + if os.path.exists(LLVM_BIN + os.sep + "bin") and not force: error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1) LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" LLVM_BIN_selfbuild = LLVM_BIN + "_temp" @@ -110,7 +110,6 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v if selfbuild: common.remove_if_exists(LLVM_BUILD_selfbuild) common.remove_if_exists(LLVM_BIN_selfbuild) - MAKE = "gmake" print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " + llvm_home + "\n", from_validation, alloy_build) # load llvm @@ -156,9 +155,9 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v LLVM_BIN_selfbuild + " --enable-optimized", from_validation) try_do_LLVM("build release version for selfbuild ", - MAKE + " -j32", from_validation) + make, from_validation) try_do_LLVM("install release version for selfbuild ", - MAKE + " install", + "make install", from_validation) os.chdir("../") selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang" @@ -175,8 +174,8 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler, from_validation) # building llvm - try_do_LLVM("build LLVM ", MAKE + " -j32", from_validation) - try_do_LLVM("install LLVM ", MAKE + " install", from_validation) + try_do_LLVM("build LLVM ", make, from_validation) + try_do_LLVM("install LLVM ", "make install", from_validation) os.chdir(current_path) def check_targets(): @@ -254,13 +253,13 @@ def check_targets(): answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] return [answer, answer_sde] -def build_ispc(version_LLVM): +def build_ispc(version_LLVM, make): current_path = os.getcwd() os.chdir(os.environ["ISPC_HOME"]) p_temp = os.getenv("PATH") os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] - os.system("make clean >> " + alloy_build) - try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", 
"make -j32", True) + try_do_LLVM("clean ISPC for building", "make clean", True) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", make, True) os.environ["PATH"] = p_temp os.chdir(current_path) @@ -286,7 +285,7 @@ def execute_stability(stability, R, print_version): def run_special_tests(): i = 5 -def validation_run(only, only_targets, reference_branch, number, notify, update): +def validation_run(only, only_targets, reference_branch, number, notify, update, make): os.chdir(os.environ["ISPC_HOME"]) os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": @@ -327,7 +326,6 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) stability.no_opt = False stability.wrapexe = "" # prepare parameters of run - common.check_tools(1) [targets_t, sde_targets_t] = check_targets() rebuild = True opts = [] @@ -352,6 +350,8 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) if "current" in only: LLVM = [" "] rebuild = False + else: + common.check_tools(1) if only_targets != "": only_targets_t = only_targets.split(" ") for i in only_targets_t: @@ -383,7 +383,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) gen_archs = ["x86-64"] need_LLVM = check_LLVM(LLVM) for i in range(0,len(need_LLVM)): - build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) # begin validation run for stabitily common.remove_if_exists(stability.in_file) R = [[[],[]],[[],[]],[[],[]],[[],[]]] @@ -391,7 +391,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) for i in range(0,len(LLVM)): print_version = 2 if rebuild: - build_ispc(LLVM[i]) + build_ispc(LLVM[i], make) for j in range(0,len(targets)): stability.target = targets[j] stability.wrapexe = "" @@ -447,6 +447,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) # *** *** *** if ((("performance" in only) == True) or ("stability" in only) == False): print_debug("\n\nPerformance validation run\n\n", False, "") + common.check_tools(1) performance = options_for_drivers() # performance constant options performance.number = number @@ -460,8 +461,9 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) # prepare LLVM 3.3 as newest LLVM need_LLVM = check_LLVM(["3.3"]) if len(need_LLVM) != 0: - build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) # prepare reference point. build both test and reference compilers + try_do_LLVM("apply git", "git branch", True) temp4 = take_lines("git branch", "all") for line in temp4: if "*" in line: @@ -473,14 +475,14 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) #try_do_LLVM("stash current branch ", "git stash", True) try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) sys.stdout.write(".\n") - build_ispc("3.3") + build_ispc("3.3", make) sys.stdout.write(".\n") os.rename("ispc", "ispc_ref") try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) if stashing: try_do_LLVM("return current branch ", "git stash pop", True) sys.stdout.write("You can interrupt script now.\n") - build_ispc("3.3") + build_ispc("3.3", make) # begin validation run for performance. 
output is inserted into perf() perf.perf(performance, []) if options.notify != "": @@ -526,6 +528,12 @@ def Main(): if options.notify != "": if os.environ.get("SMTP_ISPC") == None: error("you have no SMTP_ISPC in your environment for option notify", 1) + if options.only != "": + test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native " + test_only = options.only.split(" ") + for iterator in test_only: + if not (" " + iterator + " " in test_only_r): + error("unknown option for only: " + iterator, 1) global f_date f_date = "logs" @@ -536,16 +544,19 @@ global stability_log stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" current_path = os.getcwd() + make = "make -j" + options.speed try: if options.build_llvm: build_LLVM(options.version, options.revision, options.folder, options.tarball, - options.debug, options.selfbuild, False, options.force) + options.debug, options.selfbuild, False, options.force, make) if options.validation_run: validation_run(options.only, options.only_targets, options.branch, - options.number_for_performance, options.notify, options.update) + options.number_for_performance, options.notify, options.update, make) finally: os.chdir(current_path) - date_name = "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y') + date_name = "alloy_results_" + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S') + if os.path.exists(date_name): + error("It's forbidden to run alloy twice within the same second, logs are in ./logs", 1) os.rename(f_date, date_name) print_debug("Logs are in " + date_name + "\n", False, "") @@ -594,6 +605,8 @@ parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") parser.add_option('-r', '--run', dest='validation_run', help='ask for validation run', default=False, action="store_true") +parser.add_option('-j', dest='speed', + help='set -j for make', default="8") # options for activity "build LLVM" llvm_group = OptionGroup(parser, "Options for building LLVM", "These options must be used with -b option.") diff --git a/examples/common.mk b/examples/common.mk index cdfc4c6a..95ec7ccb 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -44,7 +44,7 @@ dirs: objs/%.cpp objs/%.o objs/%.h: dirs clean: - /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 + /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test $(EXAMPLE): $(OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) diff --git a/perf.py b/perf.py index b33e1f25..576a5c7d 100755 --- a/perf.py +++ b/perf.py @@ -190,7 +190,7 @@ def print_answer(answer): filelist = [] print_debug("--------------------------------------------------------------------------\n", s, perf_log) print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + - "ISPC time: ISPC + tasks time: serial:\n", s, perf_log) + " ISPC time: ISPC + tasks time: serial:\n", s, perf_log) filelist.append("test name,ISPC speedup,diff," + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") max_t = [0,0,0,0,0] @@ -215,9 +215,9 @@ def print_answer(answer): list_of_max[t-1].append(mm) diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) print_debug("%s:\n" % answer[i][0], s, perf_log) - print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + print_debug("\t\tmax:\t%5s\t\t%10s\t|min:%10s\t%10s\t%10s\n" % (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4]), s, perf_log) - print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" %
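+ # diff_t holds the spread, max - min, across the repeated runs of each test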
print_debug("\t\tdiff:\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4]), s, perf_log) for t in range(0,5): if max_t[t] == "n/a": @@ -231,7 +231,7 @@ def print_answer(answer): for i in range(0,5): geomean_t[i] = geomean(list_of_max[i]) print_debug("---------------------------------------------------------------------------------\n", s, perf_log) - print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + print_debug("Geomean:\t\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") @@ -474,7 +474,7 @@ if __name__ == "__main__": parser.add_option('-c', '--config', dest='config', help='config file of tests', default="./perf.ini") parser.add_option('-p', '--path', dest='path', - help='path to test_system directory', default=".") + help='path to ispc root', default=".") parser.add_option('-s', '--silent', dest='silent', help='silent mode, only table output', default=False, action="store_true") parser.add_option('-o', '--output', dest='output', diff --git a/run_tests.py b/run_tests.py index 914f22a7..abc9b656 100755 --- a/run_tests.py +++ b/run_tests.py @@ -369,8 +369,12 @@ def file_check(compfails, runfails): # Detect compiler version if is_windows == False: temp1 = common.take_lines(options.compiler_exe + " --version", "first") - temp2 = temp1.split(" ") - compiler_version = temp2[0] + temp2[2][0:4] + temp2 = re.search("[0-9]*\.[0-9]*\.[0-9]", temp1) + if temp2 == None: + temp3 = re.search("[0-9]*\.[0-9]*", temp1) + else: + temp3 = re.search("[0-9]*\.[0-9]*", temp2.group()) + compiler_version = options.compiler_exe + temp3.group() else: compiler_version = "cl" new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" @@ -464,7 +468,7 @@ def run_tests(options1, args, print_version): global s s = options.silent - # prepare run_tests_log and test_states files + # prepare run_tests_log and fail_db files global run_tests_log if options.in_file: run_tests_log = os.getcwd() + os.sep + options.in_file @@ -715,7 +719,7 @@ if __name__ == "__main__": default=False, action="store_true") parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', default=False, action="store_true") - parser.add_option('-u', "--update", dest='update', help='Update file with fails (F of FP)', default="") + parser.add_option('-u', "--update-errors", dest='update', help='Update file with fails (F of FP)', default="") parser.add_option('-s', "--silent", dest='silent', help='enable silent mode without any output', default=False, action = "store_true") parser.add_option("--file", dest='in_file', help='file to save run_tests output', default="") From dfc723bc1958f39d4526897fdfd5173a936c09f7 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 23 Sep 2013 21:35:33 +0400 Subject: [PATCH 049/159] Add fails with gcc 4.4 on Linux --- fail_db.txt | 326 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index 7adc3e41..23a6c8ca 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -1 +1,327 @@ % List of known fails +./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 
3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * 
+./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-equal-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/soa-27.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-gather-multi-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-mixed-unif-vary-indexing-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-mixed-unif-vary-indexing-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-mixed-unif-vary-indexing.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-scatter-unif-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-scatter-vary.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/cfor-array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/cfor-array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/cfor-array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/cfor-array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/cfor-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/cfor-unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/gather-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/gather-to-vload-neg-offset.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/global-array-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/masked-scatter-vector.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/nested-structs-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/pass-varying-lvalue-to-ref.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/scatter-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/scatter-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/scatter-mask-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/scatter-mask-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/short-vec-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/varying-struct-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/varying-struct-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/varying-struct-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/write-same-loc.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
+./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *

From 2a83cefd5b0d3f19f968e9f91702e073211375bb Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Thu, 26 Sep 2013 19:07:38 +0400
Subject: [PATCH 050/159] Add fails with gcc 4.7 on Linux

---
 fail_db.txt | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)

diff --git a/fail_db.txt b/fail_db.txt
index 23a6c8ca..9cc7a884 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -325,3 +325,178 @@
 ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 *
 ./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 *
 ./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 *
+./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/rotate.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 *

From 5855ae746021553cea0cb4c81c913a71e4fc71f9 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Fri, 27 Sep 2013 02:32:01 +0400
Subject: [PATCH 051/159] Add fails with gcc 4.7 on Mac

---
 fail_db.txt | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 146 insertions(+)

diff --git a/fail_db.txt b/fail_db.txt
index 9cc7a884..b8e58d8b 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -500,3 +500,149 @@
 ./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
 ./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
 ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 *
+./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/atomics-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 *
+./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 *

From 396aaae098abc2e7a5ed5a02c97254a9f292086e Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Fri, 27 Sep 2013 17:00:17 +0400
Subject: [PATCH 052/159] Add fails with VS2010 on Windows

---
 fail_db.txt | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 216 insertions(+)

diff --git a/fail_db.txt b/fail_db.txt
index b8e58d8b..a6608c12 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -646,3 +646,219 @@
 ./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 *
 ./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 *
 ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 *
+.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-2.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-load-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-store.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-2.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-load-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-store.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-2.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-load-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-store.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\atomics-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-11.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-5.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-6.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\funcptr-null-4.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\funcptr-null-5.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\funcptr-null-6.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-2.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-load-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-store.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\atomics-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-11.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-5.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-6.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-2.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-load-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-store.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-10.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-11.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-12.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-8.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-9.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-2.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-load-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-store.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-min-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\avg-down-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\avg-up-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-10.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-11.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-12.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-9.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-store.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-10.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-11.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-12.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-8.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-9.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\packed-store.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-min-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-max.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\uint64-min.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\avg-down-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\avg-up-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-10.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-11.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-12.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-9.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\atomics-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 *
+.\tests\funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\atomics-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-10.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-11.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-12.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-8.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-9.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\avg-down-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\avg-up-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-10.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-11.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-12.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-9.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 *
+.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * From da52ae844f95ef617ef81af0f0588395109d2994 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 18:06:28 +0400 Subject: [PATCH 053/159] Adding AVX2 fails on Windows --- fail_db.txt | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index a6608c12..59e0a7a6 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -862,3 +862,65 @@ .\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl 
-O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * From 8e71dbd6c12b0fde77ed58c21e4083c84227114e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 18:12:12 +0400 Subject: [PATCH 054/159] Adding comments to fail_db.txt --- fail_db.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fail_db.txt b/fail_db.txt index 59e0a7a6..eb3c0fe9 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -1,4 +1,11 @@ -% List of known fails +% List of known fails. +% The list is unordered and contains information about commonly used platforms / configurations. +% Our goal is to maintain this list for Linux, MacOS and Windows with reasonably new compilers. +% Note that it's important which C++ compiler was used. For example, gcc 4.4 is known to produce +% considerably more fails with generic targets than gcc 4.7 or later. +% Using old compilers (gcc 4.4 is considered to be relatively old) may cause LLVM bugs.
+% To avoid them you can use LLVM selfbuild. +% ./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 * ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * ./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * From 8a39af8f7204640fa802f6eb07403526523d1ea3 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 23:27:05 +0400 Subject: [PATCH 055/159] Release 1.5.0 --- docs/ReleaseNotes.txt | 60 +++++++++++++++++++++++++++++++++++++++++++ docs/news.rst | 8 ++++++ doxygen.cfg | 2 +- ispc.h | 2 +- 4 files changed, 70 insertions(+), 2 deletions(-) diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt index 007f283e..a8575ea0 100644 --- a/docs/ReleaseNotes.txt +++ b/docs/ReleaseNotes.txt @@ -1,3 +1,63 @@ +=== v1.5.0 === (27 September 2013) + +A major new version of ISPC with several new targets and important bug fixes. +Here's a list of the most important changes if you are using pre-built +binaries (which are based on a patched version of LLVM 3.3): + +* The naming of targets was changed to explicitly include the data type width and + the number of threads in the gang. For example, avx2-i32x8 is an avx2 target, + which uses 32 bit types as a base and has 8 threads in a gang. The old naming + scheme is still supported, but deprecated. + +* New SSE4 targets for calculations based on 8 bit and 16 bit data types: + sse4-i8x16 and sse4-i16x8. + +* New AVX1 target for calculations based on 64 bit data types: avx1-i64x4. + +* SVML support was extended and improved. + +* Behavior of the -g switch was changed to not affect the optimization level. + +* ISPC debug infrastructure was redesigned. See --help-dev for more info and + enjoy the capabilities of the new --debug-phase= and --off-phase= + switches. + +* Fixed an auto-dispatch bug that caused AVX code execution when the OS doesn't + support AVX (but the hardware does). + +* Fixed a bug that discarded the uniform/varying keyword in typedefs. + +* Several performance regressions were fixed. + +If you are building ISPC yourself, then the following changes are also available +to you: + +* --cpu=slm for targeting Intel Atom codename Silvermont (if LLVM 3.4 is used). + +* ARM NEON targets are available (if enabled in the build system). + +* --debug-ir= is available to generate debug information based on LLVM + IR (if LLVM 3.4 is used). In the debugger you'll see LLVM IR instead of source + code. + +* A redesigned and improved test and configuration management system is + available to facilitate the process of building LLVM and testing the ISPC + compiler. + +Standard library changes/fixes: + +* The __pause() function was removed from the standard library. + +* Fixed reduce_[min|max]_[float|double] intrinsics, which were producing + incorrect code under some conditions. + +Language changes: + +* By default a floating point constant without a suffix is a single precision + constant (32 bit). A new suffix "d" was introduced to allow double precision + constants (64 bit). Please refer to tests/double-consts.ispc for syntax + examples. + === v1.4.4 === (19 July 2013) A minor version update with several stability fixes requested by the customers. diff --git a/docs/news.rst b/docs/news.rst index c1c35de3..7d78a662 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -2,6 +2,14 @@ ispc News ========= +ispc 1.5.0 is Released +---------------------- + +A major update of ``ispc`` has been released with several new targets available +and a bunch of performance and stability fixes.
The released binaries are built +with a patched version of LLVM 3.3. Please refer to the Release Notes for the complete +set of changes. + ispc 1.4.4 is Released ---------------------- diff --git a/doxygen.cfg b/doxygen.cfg index 480d9331..ab4eec20 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.4.5dev +PROJECT_NUMBER = 1.5.0 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index 4804832f..4b7ae732 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.4.5dev" +#define ISPC_VERSION "1.5.0" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) #error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" From 3b4cc9080046983932ea461345344deccd0ad33e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sat, 28 Sep 2013 01:32:00 +0400 Subject: [PATCH 056/159] Changing ISPC to 1.5.dev --- doxygen.cfg | 2 +- ispc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doxygen.cfg b/doxygen.cfg index ab4eec20..a0ad3176 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.5.0 +PROJECT_NUMBER = 1.5.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index 4b7ae732..82cb9050 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.5.0" +#define ISPC_VERSION "1.5.1dev" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) #error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" From 758efebb3cc166e46169931490fbb42c5f9ffd65 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 30 Sep 2013 17:54:59 +0400 Subject: [PATCH 057/159] Add missing testing support for avx1-i64x4 target --- alloy.py | 6 +++--- ispc.cpp | 4 ++-- run_tests.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/alloy.py b/alloy.py index 7ae972b4..3f05f4fd 100755 --- a/alloy.py +++ b/alloy.py @@ -200,7 +200,7 @@ def check_targets(): answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] if AVX == False and "avx" in f_lines[i]: AVX = True; - answer = answer + ["avx1-i32x8", "avx1-i32x16"] + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if AVX11 == False and "rdrand" in f_lines[i]: AVX11 = True; answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] @@ -217,7 +217,7 @@ def check_targets(): answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] if "AVX1.0" in f_lines: AVX = True; - answer = answer + ["avx1-i32x8", "avx1-i32x16"] + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if "RDRAND" in f_lines: AVX11 = True; answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] @@ -246,7 +246,7 @@ def check_targets(): if SSE4 == False and "wsm" in f_lines[i]: answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] if AVX == False and "snb" in f_lines[i]: - answer_sde = answer_sde + [["-snb",
"avx1-i32x8"], ["-snb", "avx1-i32x16"]] + answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] if AVX11 == False and "ivb" in f_lines[i]: answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] if AVX2 == False and "hsw" in f_lines[i]: diff --git a/ispc.cpp b/ispc.cpp index bec7baf7..56b0a25f 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -714,11 +714,11 @@ Target::SupportedTargets() { #endif "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " - "avx1-i32x8, avx1-i32x16, " + "avx1-i32x8, avx1-i32x16, avx1-i64x4, " "avx1.1-i32x8, avx1.1-i32x16, " "avx2-i32x8, avx2-i32x16, " "generic-x1, generic-x4, generic-x8, generic-x16, " - "generic-x32, generic-x64"; + "generic-x32, generic-x64"; } diff --git a/run_tests.py b/run_tests.py index 64d3462a..4146576c 100755 --- a/run_tests.py +++ b/run_tests.py @@ -449,7 +449,7 @@ def verify(): check = [["g++", "clang", "cl"],["-O0", "-O2"],["x86","x86-64"], ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", - "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1.1-i32x8", "avx1.1-i32x16", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", "generic-16", "generic-32", "generic-64"]] for i in range (0,len(f_lines)): From 7942bdb728f8fc9b6cc560303cf6193ed5aba647 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 30 Sep 2013 18:09:59 +0400 Subject: [PATCH 058/159] Typo fix and copyright update --- docs/ispc.rst | 4 ++-- docs/template-news.txt | 2 +- docs/template-perf.txt | 2 +- docs/template.txt | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 224faaa9..eac9b24e 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,8 +270,8 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and ``in``. Any program that happens to have a variable or function with one of these names must be modified to rename that symbol. -Updating ISPC Programs For Changes In ISPC 1.4.5 ----------------------------------------------- +Updating ISPC Programs For Changes In ISPC 1.5.0 +------------------------------------------------ This release adds support for double precision floating point constants. Double precision floating point constants are floating point number with diff --git a/docs/template-news.txt b/docs/template-news.txt index 9a41fbdb..d5eebdd1 100644 --- a/docs/template-news.txt +++ b/docs/template-news.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/docs/template-perf.txt b/docs/template-perf.txt index 4932e332..9537a836 100644 --- a/docs/template-perf.txt +++ b/docs/template-perf.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/docs/template.txt b/docs/template.txt index 8cb4f5ab..b9041f19 100644 --- a/docs/template.txt +++ b/docs/template.txt @@ -57,7 +57,7 @@ %(body)s
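For readers decoding the fail_db.txt records added throughout these patches, each entry is a fixed sequence of columns; the field meanings below are inferred from the verify() whitelists in run_tests.py (PATCH 057 above), and the trailing "*" column is reproduced as-is from the existing entries:

    .\tests\packed-load-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 *

reads as: test file, failure kind (runfail = the compiled test fails when executed, compfail = the test fails to compile), architecture (x86 or x86-64), ISPC target, OS, LLVM version, host C++ compiler (g++, clang, or cl), and optimization level (-O0 or -O2).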
- From 49cefc2e972bb3d742f74f855cd40b09b57f029b Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 30 Sep 2013 19:20:18 +0400 Subject: [PATCH 059/159] Updating fail_db for new target --- fail_db.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index eb3c0fe9..31db9961 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -931,3 +931,21 @@ .\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * From 2d6f7a7c93bcbe89c2ec55e99a995d309c2d85b5 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 1 Oct 2013 17:37:34 +0400 Subject: [PATCH 060/159] Support i686 architecture recognition as x86 and enable 32 bit x86 platforms --- examples/common.mk | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/common.mk b/examples/common.mk index 95ec7ccb..330a2453 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -12,15 +12,22 @@ LIBS=-lm $(TASK_LIB) -lstdc++ ISPC=ispc -O2 $(ISPC_FLAGS) ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) -ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) +ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ $(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o) ISPC_TARGETS=$(ISPC_IA_TARGETS) - ISPC_FLAGS += --arch=x86-64 - CXXFLAGS += -m64 - CCFLAGS += -m64 + ARCH_BIT:=$(shell getconf LONG_BIT) + ifeq ($(ARCH_BIT),32) + ISPC_FLAGS += --arch=x86 + CXXFLAGS += -m32 + CCFLAGS += -m32 + else + ISPC_FLAGS += --arch=x86-64 + CXXFLAGS += -m64 + CCFLAGS += -m64 + endif else ifeq ($(ARCH),arm) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) ISPC_TARGETS=$(ISPC_ARM_TARGETS) From b2cf0209b153c072f5e531e23203a68e05d47d87 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 1 Oct 2013 18:01:29 +0400 Subject: [PATCH 061/159] pipe correction and some other small 
changes in test system --- alloy.py | 4 ++++ common.py | 3 ++- run_tests.py | 15 ++++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/alloy.py b/alloy.py index 7ae972b4..6b55f85b 100755 --- a/alloy.py +++ b/alloy.py @@ -353,8 +353,12 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, else: common.check_tools(1) if only_targets != "": + only_targets += " " + only_targets = only_targets.replace("generic "," generic-4 generic-16 ") only_targets_t = only_targets.split(" ") for i in only_targets_t: + if i == "": + continue err = True for j in range(0,len(targets_t)): if i in targets_t[j]: diff --git a/common.py b/common.py index 19d09e4d..be3e9526 100644 --- a/common.py +++ b/common.py @@ -83,7 +83,8 @@ def print_debug(line, silent, filename): sys.stdout.write(line) sys.stdout.flush() if os.environ.get("ISPC_HOME") != None: - write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) + if os.path.exists(os.environ.get("ISPC_HOME")): + write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) if filename != "": write_to_file(filename, line) diff --git a/run_tests.py b/run_tests.py index abc9b656..7b2f5f29 100755 --- a/run_tests.py +++ b/run_tests.py @@ -332,8 +332,6 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test skip_files += [ filename ] -task_threads = [] - def sigint(signum, frame): for t in task_threads: t.terminate() @@ -423,6 +421,8 @@ def file_check(compfails, runfails): for i in range (0,len(new_compfails)): new_f_lines.append(new_compfails[i] + " compfail " + new_line) print_debug("\t" + new_compfails[i] + "\n", s, run_tests_log) + if len(new_runfails) == 0 and len(new_compfails) == 0: + print_debug("No new fails\n", s, run_tests_log) if len(new_passes_runfails) != 0: print_debug("NEW PASSES after RUNFAILS:\n", s, run_tests_log) for i in range (0,len(new_passes_runfails)): @@ -561,7 +561,6 @@ def run_tests(options1, args, print_version): # failing_tests/, and tests_errors/ if len(args) == 0: files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") else: if is_windows: @@ -622,12 +621,12 @@ def run_tests(options1, args, print_version): start_time = time.time() # launch jobs to run tests glob_var = [is_windows, options, s, ispc_exe, is_generic_target, run_tests_log] + global task_threads + task_threads = [0] * nthreads for x in range(nthreads): - t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, + task_threads[x] = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, max_test_length, finished_tests_counter, finished_tests_counter_lock, glob_var)) - task_threads.append(t) - t.start() - + task_threads[x].start() # wait for them to all finish and then return the number that failed # (i.e. 
return 0 if all is ok) for t in task_threads: @@ -660,6 +659,8 @@ def run_tests(options1, args, print_version): print_debug("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests), s, run_tests_log) for f in run_error_files: print_debug("\t%s\n" % f, s, run_tests_log) + if len(compile_error_files) == 0 and len(run_error_files) == 0: + print_debug("No fails\n", s, run_tests_log) R = file_check(compile_error_files, run_error_files) From c7b4164122f7a9cf45a1a2ea30c90064650258dd Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 1 Oct 2013 18:40:26 +0400 Subject: [PATCH 062/159] Redefining ISPC should not discard ISPC_FLAGS --- examples/common.mk | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/common.mk b/examples/common.mk index 330a2453..367d3eb3 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -9,7 +9,8 @@ CC=gcc CCFLAGS=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ -ISPC=ispc -O2 $(ISPC_FLAGS) +ISPC=ispc +ISPC_FLAGS=-O2 ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) @@ -68,10 +69,10 @@ objs/%.o: ../%.cpp dirs objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc - $(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h + $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp $(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@ @@ -80,7 +81,7 @@ $(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp $(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@ @@ -89,7 +90,7 @@ $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-1 + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1 $(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) From dc8895352af94d7042e9e7658035c3c9d35ba8b7 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 1 Oct 2013 11:53:56 -0400 Subject: [PATCH 063/159] Adding missing typecasts and guarding i64 __mul with compiler version check --- examples/intrinsics/knc-i1x16.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 2ee6d2f5..ae9c4130 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1021,9 +1021,13 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } +#if __ICC_VERSION == 1400 static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } +#else +BINARY_OP(__vec16_i64, __mul, *) +#endif #endif #if 0 @@ 
-2164,7 +2168,7 @@ static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); - return _mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA); + return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); } #endif @@ -2174,7 +2178,7 @@ CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { __vec16_d ret; ret.v1 = _mm512_cvtpslo_pd(val.v); - __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); + __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC)); ret.v2 = _mm512_cvtpslo_pd(other8); return ret; } From d45c5767d8ff81e01f56418095be3d115eb0507c Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 1 Oct 2013 12:17:57 -0400 Subject: [PATCH 064/159] Due diligence tweaks. --- examples/intrinsics/knc-i1x16.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ae9c4130..730141ec 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1021,12 +1021,22 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } -#if __ICC_VERSION == 1400 +#if __ICC_VERSION >= 1400 static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } #else -BINARY_OP(__vec16_i64, __mul, *) +static FORCEINLINE __vec16_i64 __mul(const __vec16_i64 &a, const __vec16_i64 &b) +{ + __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); + __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry = 0; + __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); + __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); + return __vec16_i64(lo, hi); +} #endif #endif From ac79f3f34555a97da837834076c8adc89e6c50a4 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 1 Oct 2013 12:31:33 -0400 Subject: [PATCH 065/159] format change --- examples/intrinsics/knc-i1x16.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 730141ec..84a1f7aa 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1026,8 +1026,10 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } #else -static FORCEINLINE __vec16_i64 __mul(const __vec16_i64 &a, const __vec16_i64 &b) +static FORCEINLINE __vec16_i64 __mul(const __vec16_i64 &_a, const __vec16_i64 &_b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); @@ -1035,7 +1037,7 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i64 &a, const __vec16_i64 &b) __mmask16 carry = 0; __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); - return 
__vec16_i64(lo, hi); + return __vec16_i64(hi,lo).cvt2zmm(); } #endif #endif From 32c77be2f3537b24890e1334b1a7d2579c58d2c1 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 4 Oct 2013 11:42:52 +0300 Subject: [PATCH 066/159] cleaned mask & int32, only test141 fails --- examples/intrinsics/knc-i1x16.h | 656 +++++++++----------------------- 1 file changed, 190 insertions(+), 466 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ae9c4130..aae4be57 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -64,69 +64,48 @@ extern "C" } #endif -typedef float __vec1_f; -typedef double __vec1_d; -typedef int8_t __vec1_i8; +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; typedef int16_t __vec1_i16; typedef int32_t __vec1_i32; typedef int64_t __vec1_i64; -struct __vec16_i1 { - __vec16_i1() { } - __vec16_i1(const __mmask16 &vv) : v(vv) { } - __vec16_i1(bool v0, bool v1, bool v2, bool v3, - bool v4, bool v5, bool v6, bool v7, - bool v8, bool v9, bool v10, bool v11, - bool v12, bool v13, bool v14, bool v15) { - v = ((v0 & 1) | - ((v1 & 1) << 1) | - ((v2 & 1) << 2) | - ((v3 & 1) << 3) | - ((v4 & 1) << 4) | - ((v5 & 1) << 5) | - ((v6 & 1) << 6) | - ((v7 & 1) << 7) | - ((v8 & 1) << 8) | - ((v9 & 1) << 9) | - ((v10 & 1) << 10) | - ((v11 & 1) << 11) | - ((v12 & 1) << 12) | - ((v13 & 1) << 13) | - ((v14 & 1) << 14) | - ((v15 & 1) << 15)); - } - - __mmask16 v; - FORCEINLINE operator __mmask16() const { return v; } +/************ mask **************/ + +struct __vec16_i1 +{ + __mmask16 v; + + FORCEINLINE __vec16_i1() { } + FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { } + FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + + FORCEINLINE operator __mmask16() const { return v; } }; +/************ vector **************/ -template -struct vec16 { - vec16() { } - vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { - data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; - data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; - data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; - data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; - } - T data[16]; - FORCEINLINE const T& operator[](const int i) const { return data[i]; } - FORCEINLINE T& operator[](const int i) { return data[i]; } -}; - -#if 0 /* evghenii:i32 */ -struct PRE_ALIGN(64) __vec16_i32 : public vec16 { - __vec16_i32() { } - __vec16_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, - int32_t v4, int32_t v5, int32_t v6, int32_t v7, - int32_t v8, int32_t v9, int32_t v10, int32_t v11, - int32_t v12, int32_t v13, int32_t v14, int32_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } -} POST_ALIGN(64); -#else /* evghenii:i32 */ struct PRE_ALIGN(64) __vec16_i32 { __m512i v; @@ -144,81 +123,43 @@ struct PRE_ALIGN(64) __vec16_i32 FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } FORCEINLINE int32_t& operator[](const int i) { return 
((int32_t*)this)[i]; } } POST_ALIGN(64); -#endif /* evghenii:i32 */ -#if 0 /* evghenii::f */ -PRE_ALIGN(64) struct __vec16_f : public vec16 { - __vec16_f() { } - __vec16_f(float v0, float v1, float v2, float v3, - float v4, float v5, float v6, float v7, - float v8, float v9, float v10, float v11, - float v12, float v13, float v14, float v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } - -} POST_ALIGN(64); -#else /* evghenii::f */ PRE_ALIGN(64) struct __vec16_f { - __m512 v; - FORCEINLINE operator __m512() const { return v; } - FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } - FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} - FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} - FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } - FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, - float v04, float v05, float v06, float v07, - float v08, float v09, float v10, float v11, - float v12, float v13, float v14, float v15) : - v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} - FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } - FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) : + v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } } POST_ALIGN(64); -#endif /* evghenii::f */ -#if 0 /* evghenii::d */ -PRE_ALIGN(128) struct __vec16_d : public vec16 { - __vec16_d() { } - __vec16_d(double v0, double v1, double v2, double v3, - double v4, double v5, double v6, double v7, - double v8, double v9, double v10, double v11, - double v12, double v13, double v14, double v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } - -} POST_ALIGN(128); -#else /* evghenii::d */ struct PRE_ALIGN(128) __vec16_d { - __m512d v1; - __m512d v2; - FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} - FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} - FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} - FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } - FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, - double v04, double v05, double v06, double v07, - double v08, double v09, double v10, double v11, - double v12, double v13, double v14, double v15) { - v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); - v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); - } - FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } - FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } + __m512d v1; + __m512d v2; + FORCEINLINE __vec16_d() : 
v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double v10, double v11, + double v12, double v13, double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } } POST_ALIGN(128); -#endif /* evghenii::d */ -#if 0 /* evghenii::i64 */ -PRE_ALIGN(128) struct __vec16_i64 : public vec16 { - __vec16_i64() { } - __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, - int64_t v4, int64_t v5, int64_t v6, int64_t v7, - int64_t v8, int64_t v9, int64_t v10, int64_t v11, - int64_t v12, int64_t v13, int64_t v14, int64_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } -} POST_ALIGN(128); -#else /* evghenii::i64 */ struct PRE_ALIGN(128) __vec16_i64 { union { @@ -279,7 +220,24 @@ struct PRE_ALIGN(128) __vec16_i64 return __vec16_i64(_v1, _v2); } } POST_ALIGN(128); -#endif /* evghenii::i64 */ + +/************ scalar **************/ + +template +struct vec16 +{ + FORCEINLINE vec16() { } + FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; + data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; + } + T data[16]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; PRE_ALIGN(16) struct __vec16_i8 : public vec16 { __vec16_i8() { } @@ -510,104 +468,54 @@ INSERT_EXTRACT(__vec1_f, float) INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// -// mask ops +// mask +/////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { - return _mm512_kmov(mask); +static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { return _mm512_kmov (mask); } +static FORCEINLINE bool __any (__vec16_i1 mask) { return !_mm512_kortestz(mask, mask); } +static FORCEINLINE bool __all (__vec16_i1 mask) { return _mm512_kortestc(mask, mask); } +static FORCEINLINE bool __none (__vec16_i1 mask) { return _mm512_kortestz(mask, mask); } +static FORCEINLINE __vec16_i1 __not (__vec16_i1 mask) { return _mm512_knot (mask); } + +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kxnor (a,b); } +static FORCEINLINE __vec16_i1 __and (__vec16_i1 a, __vec16_i1 b) { return _mm512_kand (a,b); } +static FORCEINLINE __vec16_i1 __xor (__vec16_i1 a, __vec16_i1 b) { return _mm512_kxor (a,b); } +static FORCEINLINE __vec16_i1 __or (__vec16_i1 a, __vec16_i1 b) { return _mm512_kor (a,b); } +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandn (a,b); } +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return 
_mm512_kandnr(a,b); } + +static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { return __or(__and(a, mask), __and_not2(b, mask)); } +static FORCEINLINE __vec16_i1 __select( bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ? a : b; } + +static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? true : false; } +static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, bool val) +{ + if (val == false) vec->v &= ~(1 << index); + else vec->v |= (1 << index); } -static FORCEINLINE bool __any(__vec16_i1 mask) { - return !_mm512_kortestz(mask, mask); +template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) +{ + return *p; } -static FORCEINLINE bool __all(__vec16_i1 mask) { - return _mm512_kortestc(mask, mask); -} - -static FORCEINLINE bool __none(__vec16_i1 mask) { - return _mm512_kortestz(mask, mask); -} - -static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxnor(a,b); -} -static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kand(a, b); -} - -static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxor(a, b); -} - -static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kor(a, b); -} - -static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { - return _mm512_knot(a); -} - -static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandn(a, b); -} - -static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandnr(a, b); -} - -static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, - __vec16_i1 b) { -// return ((a & mask) | (b & ~mask)); - return __or(__and(a, mask), __and_not2(b, mask)); -} - -static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { - return cond ? a : b; -} - - -static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { - return (vec.v & (1 << index)) ? 
true : false; -} - -static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, - bool val) { - if (val == false) - vec->v &= ~(1 << index); - else - vec->v |= (1 << index); -} - -template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) { - uint16_t *ptr = (uint16_t *)p; - __vec16_i1 r; - r.v = *ptr; - return r; -} - -template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) { - uint16_t *ptr = (uint16_t *)p; - *ptr = v.v; +template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) +{ + *p = v; } template RetVecType __smear_i1(int i); -template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { - return i?0xFFFF:0x0; -} +template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } template RetVecType __setzero_i1(); -template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { - return 0; -} +template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } template __vec16_i1 __undef_i1(); -template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { - return __vec16_i1(); -} - +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); } /////////////////////////////////////////////////////////////////////////// // int8 +/////////////////////////////////////////////////////////////////////////// BINARY_OP(__vec16_i8, __add, +) BINARY_OP(__vec16_i8, __sub, -) @@ -653,6 +561,7 @@ LOAD_STORE(__vec16_i8, int8_t) /////////////////////////////////////////////////////////////////////////// // int16 +/////////////////////////////////////////////////////////////////////////// BINARY_OP(__vec16_i16, __add, +) BINARY_OP(__vec16_i16, __sub, -) @@ -696,232 +605,57 @@ ROTATE(__vec16_i16, i16, int16_t) SHUFFLES(__vec16_i16, i16, int16_t) LOAD_STORE(__vec16_i16, int16_t) -#if 0 /* evghenii::int32 */ -/////////////////////////////////////////////////////////////////////////// -// int32 - -BINARY_OP(__vec16_i32, __add, +) -BINARY_OP(__vec16_i32, __sub, -) -BINARY_OP(__vec16_i32, __mul, *) - -BINARY_OP(__vec16_i32, __or, |) -BINARY_OP(__vec16_i32, __and, &) -BINARY_OP(__vec16_i32, __xor, ^) -BINARY_OP(__vec16_i32, __shl, <<) - -BINARY_OP_CAST(__vec16_i32, uint32_t, __udiv, /) -BINARY_OP_CAST(__vec16_i32, int32_t, __sdiv, /) - -BINARY_OP_CAST(__vec16_i32, uint32_t, __urem, %) -BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %) -BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>) -BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>) - -SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>) -SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>) -SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<) - -CMP_OP(__vec16_i32, i32, int32_t, __equal, ==) -CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=) -CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=) -CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <) -CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >) -CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >) - -SELECT(__vec16_i32) -INSERT_EXTRACT(__vec16_i32, int32_t) -SMEAR(__vec16_i32, i32, int32_t) -SETZERO(__vec16_i32, i32) -UNDEF(__vec16_i32, i32) -BROADCAST(__vec16_i32, i32, int32_t) -ROTATE(__vec16_i32, i32, int32_t) -SHUFFLES(__vec16_i32, i32, int32_t) -LOAD_STORE(__vec16_i32, int32_t) - -#else /* 
evghenii::int32 */ /////////////////////////////////////////////////////////////////////////// // int32 /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { - return _mm512_add_epi32(a, b); -} +static FORCEINLINE __vec16_i32 __add (__vec16_i32 a, __vec16_i32 b) { return _mm512_add_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __sub (__vec16_i32 a, __vec16_i32 b) { return _mm512_sub_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __mul (__vec16_i32 a, __vec16_i32 b) { return _mm512_mullo_epi32(a,b); } +static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __or (__vec16_i32 a, __vec16_i32 b) { return _mm512_or_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __and (__vec16_i32 a, __vec16_i32 b) { return _mm512_and_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __xor (__vec16_i32 a, __vec16_i32 b) { return _mm512_xor_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, __vec16_i32 b) { return _mm512_sllv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srlv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srav_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, int32_t n) { return _mm512_slli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { return _mm512_srli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { return _mm512_srai_epi32 (a,n); } -static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sub_epi32(a, b); -} +static FORCEINLINE __vec16_i1 __equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpeq_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpneq_epi32_mask(a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epi32_mask (a,b); } -static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) { - return _mm512_mullo_epi32(a, b); -} +static FORCEINLINE __vec16_i1 __equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return 
_mm512_mask_cmpeq_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpneq_epi32_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epi32_mask (m,a,b); } -static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epu32(a, b); -} +static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, __vec16_i32 a, __vec16_i32 b) { return _mm512_mask_mov_epi32(b, mask, a); } +static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_i32 b) { return cond ? 
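/* per-lane blend model for the vector __select above: bit i of the 16-bit mask picks
   a[i] over b[i], roughly
     for (int i = 0; i < 16; ++i) r[i] = ((mask >> i) & 1) ? a[i] : b[i];
   _mm512_mask_mov_epi32(b, mask, a) performs that merge in a single instruction */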
a : b; } -static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epu32(a, b); -} - -static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) { - return _mm512_or_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) { - return _mm512_and_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __xor(__vec16_i32 a, __vec16_i32 b) { - return _mm512_xor_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sllv_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srlv_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srav_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) { - return _mm512_slli_epi32(a, n); -} - -static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { - return _mm512_srli_epi32(a, n); -} - -static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { - return _mm512_srai_epi32(a, n); -} - -static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { - return _mm512_cmpeq_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpneq_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epi32_mask(a, b); -} - -static FORCEINLINE 
__vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, - __vec16_i32 a, __vec16_i32 b) { - return _mm512_mask_mov_epi32(b.v, mask, a.v); -} - -static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) { - return cond ? a : b; -} - -static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { //uint32_t index) { - return ((int32_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t val) { - ((int32_t *)v)[index] = val; -} +static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; } +static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; } template RetVecType __smear_i32(int32_t i); -template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { - return _mm512_set1_epi32(i); -} +template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); @@ -929,66 +663,56 @@ static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); template RetVecType __setzero_i32(); -template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { - return _mm512_setzero_epi32(); -} +template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } template RetVecType __undef_i32(); -template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { - return __vec16_i32(); +template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } + +static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); } + +static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) +{ + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v); } -static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { - int32_t val = __extract_element(v, index & 0xf); - return _mm512_set1_epi32(val); +static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index) +{ + return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v); +} +static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __vec16_i32 index) +{ + const __vec16_i1 mask = __signed_less_than_i32(index, 
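/* shuffle2 lane rule, same as the generic SHUFFLE2 macro this replaces:
     ii = index[i] & 0x1F; r[i] = (ii < 16) ? v0[ii] : v1[ii - 16];
   the mask computed here selects which source vector each lane permutes from */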
__smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return ret; } -#if 0 /* evghenii::doesn't work */ -static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { - __vec16_i32 idx = __smear_i32<__vec16_i32>(index); - __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7)); - return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); -} -#else -ROTATE(__vec16_i32, i32, int32_t) -#endif - -static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { - return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); -} -SHUFFLE2(__vec16_i32, i32, int32_t) /* evghenii::to implement */ - -template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { +template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_epi32(p); + return __load<64>(p); #else - __vec16_i32 v; - v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - return v; + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; #endif } - -template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { +template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_epi32(p, v); + __store<64>(p,v); #else - _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } -#if 0 -template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { - return _mm512_load_epi32(p); -} -template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { - _mm512_store_epi32(p, v); -} -#endif -#endif /* evghenii::int32 */ - /////////////////////////////////////////////////////////////////////////// // int64 // evghenii::int64 From 57f019a6e02db5b90f9310b1f19114c0c93926ee Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 4 Oct 2013 13:39:15 +0300 Subject: [PATCH 067/159] cleaned int64 added fails info --- examples/intrinsics/knc-i1x16.h | 162 +++++++++++++------------------- 1 file changed, 67 insertions(+), 95 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index aae4be57..934d90b6 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -715,19 +715,18 @@ template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 /////////////////////////////////////////////////////////////////////////// // int64 -// evghenii::int64 +/////////////////////////////////////////////////////////////////////////// -#if 0 -BINARY_OP(__vec16_i64, __add, +) -BINARY_OP(__vec16_i64, __sub, -) -BINARY_OP(__vec16_i64, __mul, *) -#else -static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) { +static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) +{ return 
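/* __vec16_i64 layout sketch (inferred from the struct's two members): v1/v2 hold the
   vector as two 8 x i64 zmm halves, so every 64-bit op issues twice; cvt2hilo() and
   cvt2zmm() convert to/from a v_hi/v_lo 32-bit split for ops with no 64-bit intrinsic */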
__vec16_i64(_mm512_add_epi64(a.v1, b.v1), _mm512_add_epi64(a.v2,b.v2)); } -static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) { -// return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); +static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) +{ + // this intrinsic doesn't exist :S + // return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); + // use knc.h implementation const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i64 ret; @@ -745,34 +744,30 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } -#if __ICC_VERSION == 1400 -static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); -} +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) +{ +#if __ICC >= 1400 + return __vec16_i64(_mm512_mullox_epi64(_a.v1,_b.v1), _mm512_mullox_epi64(_a.v2,_b.v2)); #else -BINARY_OP(__vec16_i64, __mul, *) + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); + __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry = 0; + __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); + __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); + return __vec16_i64(hi,lo).cvt2zmm(); #endif -#endif - -#if 0 -BINARY_OP(__vec16_i64, __or, |) -BINARY_OP(__vec16_i64, __and, &) -BINARY_OP(__vec16_i64, __xor, ^) -BINARY_OP(__vec16_i64, __shl, <<) -#else -static FORCEINLINE __vec16_i64 __or(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_or_epi64(a.v1, b.v1), _mm512_or_epi64(a.v2, b.v2)); } -static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); -} +static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } -static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); -} - -static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) { +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); @@ -780,35 +775,16 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) { __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); return __vec16_i64(hi,lo).cvt2zmm(); } -#endif -#if 0 -BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) -BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) -#else -static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); -} -static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { - return 
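/* two-word arithmetic sketch behind __sub above: lo' = lo_a - lo_b with a borrow out,
   then hi' = hi_a - hi_b - borrow; _mm512_subsetb_epi32 produces the borrow mask that
   _mm512_sbb_epi32 consumes, exactly like scalar SUB/SBB */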
__vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); -} -#endif +static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); } -#if 0 -BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) -BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) -#else -static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); -} -static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); -} -#endif +static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } #if 1 BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) -#else /* evghenii::fails idiv.ispc */ +#else /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -829,7 +805,7 @@ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { #if 1 BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) -#else /* evghenii::fails idiv.ispc */ +#else /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -848,31 +824,30 @@ SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) #if 1 CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... */ -static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i64 &_b) { +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +#else /* knc::fails ./tests/reduce-equal-8.ispc , knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ +static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); } -static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { - return __not(__equal_i64(a,b)); -} -#endif - -#if 1 -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) -#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... 
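(the alternative below computes 64-bit equality as two chained 32-bit compares,
lo_match = cmpeq(lo) then mask_cmpeq(lo_match, hi), which is the right per-lane
predicate but is what trips the failures listed) 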
*/ -static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __vec16_i64 &_b, - __vec16_i1 mask) { +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(__vec16_i64 _a, __vec16_i64 _b, __vec16_i1 mask) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); return _mm512_kand(full_match, (__mmask16)mask); } -static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, - __vec16_i1 mask) { + +static FORCEINLINE __vec16_i1 __not_equal_i64(__vec16_i64 a, __vec16_i64 b) +{ + return __not(__equal_i64(a,b)); +} +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i64 b, __vec16_i1 mask) +{ return __and(__not(__equal_i64(a,b)), mask); } #endif @@ -888,46 +863,39 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) -#if 0 -SELECT(__vec16_i64) -#else -static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, - __vec16_i64 a, __vec16_i64 b) { +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) +{ __vec16_i64 ret; ret.v_hi = _mm512_mask_mov_epi64(b.v_hi, mask, a.v_hi); ret.v_lo = _mm512_mask_mov_epi64(b.v_lo, mask >> 8, a.v_lo); return ret; } -#endif INSERT_EXTRACT(__vec16_i64, int64_t) -#if 0 -SMEAR(__vec16_i64, i64, int64_t) -SETZERO(__vec16_i64, i64) -UNDEF(__vec16_i64, i64) -BROADCAST(__vec16_i64, i64, int64_t) -#else + template RetVecType __smear_i64(const int64_t &l); -template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } template RetVecType __setzero_i64(); -template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } template RetVecType __undef_i64(); -template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } -static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) { - int64_t val = __extract_element(v, index & 0xf); - return __smear_i64<__vec16_i64>(val); +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) +{ + int64_t val = __extract_element(v, index & 0xf); + return __smear_i64<__vec16_i64>(val); } -#endif -ROTATE(__vec16_i64, i64, int64_t) + +ROTATE (__vec16_i64, i64, int64_t) SHUFFLES(__vec16_i64, i64, int64_t) -#if 0 -LOAD_STORE(__vec16_i64, int64_t) -#else + template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return __load<128>(p); +#else __vec16_i32 v1; __vec16_i32 v2; v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); @@ -935,6 +903,7 @@ template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); v1 
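/* unaligned-load idiom: extloadunpacklo fills the vector with the bytes up to the
   next 64B boundary and extloadunpackhi completes it from the following cache line;
   the p+64 / p+128 offsets are in bytes, hence the uint8_t casts */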
= _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); return __vec16_i64(v2,v1); +#endif } template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) @@ -948,12 +917,16 @@ template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return __store<128>(p,v); +#else __m512i v1 = v.v2; __m512i v2 = v.v1; _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif } template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) @@ -965,7 +938,6 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) } template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } -#endif #if 0 /* evghenii::float */ From 8a6789ef61e006866ead9e0c5d0cfa1db39cd8c5 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 4 Oct 2013 14:11:09 +0300 Subject: [PATCH 068/159] cleaned float added fails info --- examples/intrinsics/knc-i1x16.h | 348 ++++++++++---------------------- 1 file changed, 107 insertions(+), 241 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 934d90b6..87f54dfa 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -940,217 +940,113 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } -#if 0 /* evghenii::float */ -/////////////////////////////////////////////////////////////////////////// -// float - -BINARY_OP(__vec16_f, __add, +) -BINARY_OP(__vec16_f, __sub, -) -BINARY_OP(__vec16_f, __mul, *) -BINARY_OP(__vec16_f, __div, /) - -CMP_OP(__vec16_f, float, float, __equal, ==) -CMP_OP(__vec16_f, float, float, __not_equal, !=) -CMP_OP(__vec16_f, float, float, __less_than, <) -CMP_OP(__vec16_f, float, float, __less_equal, <=) -CMP_OP(__vec16_f, float, float, __greater_than, >) -CMP_OP(__vec16_f, float, float, __greater_equal, >=) - -static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; - return ret; -} - -static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
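/* x != x holds only for NaN, so this sets lane i when either input is NaN; the native
   _mm512_cmpunord_ps_mask below encodes the same predicate in one instruction */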
(1 << i) : 0; - return ret; -} - -#if 0 - case Instruction::FRem: intrinsic = "__frem"; break; -#endif - -SELECT(__vec16_f) -INSERT_EXTRACT(__vec16_f, float) -SMEAR(__vec16_f, float, float) -SETZERO(__vec16_f, float) -UNDEF(__vec16_f, float) -BROADCAST(__vec16_f, float, float) -ROTATE(__vec16_f, float, float) -SHUFFLES(__vec16_f, float, float) -LOAD_STORE(__vec16_f, float) -#else /* evghenii::float */ - /////////////////////////////////////////////////////////////////////////// // float /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { - return _mm512_add_ps(a, b); -} +static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { return _mm512_add_ps(a,b); } +static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { return _mm512_sub_ps(a,b); } +static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { return _mm512_mul_ps(a,b); } +static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a,b); } -static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { - return _mm512_sub_ps(a, b); -} +static FORCEINLINE __vec16_i1 __equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpeq_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpneq_ps_mask(a,b); } +static FORCEINLINE __vec16_i1 __less_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmplt_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmple_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GE_OS); } -static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { - return _mm512_mul_ps(a, b); -} +static FORCEINLINE __vec16_i1 __equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpeq_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpneq_ps_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __less_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmplt_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmple_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GE_OS); } -static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { - return _mm512_div_ps(a, b); -} +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpord_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpunord_ps_mask(a,b); } +static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { return _mm512_mask_mov_ps(b, mask, a); } +static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b) { return cond ? 
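/* _CMP_GT_OS / _CMP_GE_OS are ordered predicates: a NaN operand compares false,
   whereas the cmpnle/cmpnlt forms they replace would compare true on NaN */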
a : b; } -static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpeq_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpneq_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { - return _mm512_cmplt_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmplt_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmple_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmple_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { -// return _mm512_cmpnle_ps_mask(a, b); - return _mm512_cmp_ps_mask(a, b,_CMP_GT_OS); -} - -static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { -// return _mm512_mask_cmpnle_ps_mask(m, a, b); - return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); -} - -static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { -// return _mm512_cmpnlt_ps_mask(a, b); - return _mm512_cmp_ps_mask(a, b,_CMP_GE_OS); -} - -static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { -// return _mm512_mask_cmpnlt_ps_mask(m, a, b); - return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); -} - -static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpord_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpunord_ps_mask(a, b); -} - -static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { - return _mm512_mask_mov_ps(b, mask, a); -} - -static FORCEINLINE __vec16_f __select(bool cond, __vec16_f a, __vec16_f b) { - return cond ? 
a : b; -} - -static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { - return v[index]; - // return ((float *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { - (*v)[index] = val; -// ((float *)v)[index] = val; -} +static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; } template RetVecType __smear_float(float f); -template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { - return _mm512_set_1to16_ps(f); -} +template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } template RetVecType __setzero_float(); -template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { - return _mm512_setzero_ps(); -} +template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } template RetVecType __undef_float(); -template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { - return __vec16_f(); -} +template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } -static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) { - float val = __extract_element(v, index & 0xf); - return _mm512_set1_ps(val); +static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v)); } -#if 1 -static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { - return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + const __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v)); +} +static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) +{ + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __vec16_i32 index) +{ + const __vec16_i32 v0 = _mm512_castps_si512(_v0); + const __vec16_i32 v1 = _mm512_castps_si512(_v1); + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return _mm512_castsi512_ps(ret); } -#endif -ROTATE(__vec16_f, float, float) -SHUFFLE2(__vec16_f, float, float) -template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { +template static FORCEINLINE __vec16_f __load(const __vec16_f *p) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_ps(p); + return __load<64>(p); #else - __vec16_f v; - v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - return v; + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = 
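/* the float permutes above reuse the i32 path via _mm512_castps_si512 ->
   permutevar -> _mm512_castsi512_ps; the casts only reinterpret bits, so the
   lane values survive the round trip unchanged */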
_mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; #endif } -template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { +template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_ps(p, v); + __store<64>(p,v); #else - _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } -#if 0 -template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { - _mm512_store_ps(p, v); -} -template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { +#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ +template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) +{ return _mm512_load_ps(p); } +/* this one doesn't fail but it is commented out for completenes, no aligned load/stores */ +template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) +{ + _mm512_store_ps(p, v); +} #endif -#endif /* evghenii::float */ +/******** math ******/ +/*** float ***/ static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); } @@ -1160,6 +1056,18 @@ static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { return _mm512_lo static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);} static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } +/*** double ***/ +static FORCEINLINE double __exp_uniform_double(double v) { return exp(v);} +static FORCEINLINE __vec16_d __exp_varying_double(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1),_mm512_exp_pd(v.v2)); } + +static FORCEINLINE double __log_uniform_double(double v) { return log(v);} +static FORCEINLINE __vec16_d __log_varying_double(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1),_mm512_log_pd(v.v2)); } + +static FORCEINLINE double __pow_uniform_double(double a, double b) { return pow(a,b);} +static FORCEINLINE __vec16_d __pow_varying_double(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1),_mm512_pow_pd(a.v2,b.v2)); } + +/******** bitcast ******/ + static FORCEINLINE int __intbits(float v) { union { float f; @@ -1178,8 +1086,11 @@ static FORCEINLINE float __floatbits(int v) { return u.f; } -/* source : - * http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion */ +/////////////////////////////////////////////////////////////////////////// +// half<->float : this one passes the tests +// source : +// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +/////////////////////////////////////////////////////////////////////////// class Float16Compressor { union Bits @@ -1252,81 +1163,36 @@ class Float16Compressor } }; -static FORCEINLINE float __half_to_float_uniform(int16_t h) { -#if 0 - static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift - - int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits - uint32_t exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust - - // handle exponent special cases - if (exp == shifted_exp) // Inf/NaN? 
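/* half->float sketch: widen the 10-bit mantissa by 13 bits and rebias the exponent
   from 15 to 127 (the (127 - 15) << 23 above); Inf/NaN and denormals are the special
   cases handled here and in Float16Compressor */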
- o += (128 - 16) << 23; // extra exp adjust - else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize - } - - o |= ((int32_t)(h & 0x8000)) << 16; // sign bit - return __floatbits(o); -#else +static FORCEINLINE float __half_to_float_uniform(int16_t h) +{ return Float16Compressor::decompress(h); -#endif +} +static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) +{ + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; } -static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { - __vec16_f ret; - for (int i = 0; i < 16; ++i) - ret[i] = __half_to_float_uniform(v[i]); - return ret; -} - - -static FORCEINLINE int16_t __float_to_half_uniform(float f) { -#if 0 - uint32_t sign_mask = 0x80000000u; - int32_t o; - - int32_t fint = __intbits(f); - int32_t sign = fint & sign_mask; - fint ^= sign; - - int32_t f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; - - // (De)normalized number or zero - // update fint unconditionally to save the blending; we don't need it - // anymore for the Inf/NaN case anyway. - const uint32_t round_mask = ~0xfffu; - const int32_t magic = 15 << 23; - const int32_t f16infty = 31 << 23; - - int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; - fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed - - if (fint < f32infty) - o = fint2 >> 13; // Take the bits! - - return (o | (sign >> 16)); -#else +static FORCEINLINE int16_t __float_to_half_uniform(float f) +{ return Float16Compressor::compress(f); -#endif } - - -static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) { - __vec16_i16 ret; - for (int i = 0; i < 16; ++i) - ret[i] = __float_to_half_uniform(v[i]); - return ret; +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) +{ + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; } #if 0 /* evghenii::double */ /////////////////////////////////////////////////////////////////////////// // double +/////////////////////////////////////////////////////////////////////////// BINARY_OP(__vec16_d, __add, +) BINARY_OP(__vec16_d, __sub, -) From 8b0fc558cb88a1675f903058a1695b70b60efefe Mon Sep 17 00:00:00 2001 From: evghenii Date: Sat, 5 Oct 2013 14:15:33 +0300 Subject: [PATCH 069/159] complete cleaning --- examples/intrinsics/knc-i1x16.h | 1322 ++++++++++--------------------- 1 file changed, 438 insertions(+), 884 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 87f54dfa..e712c969 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1,5 +1,5 @@ /** - Copyright (c) 2010-2012, Intel Corporation + Copyright (c) 2010-2013, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -265,6 +265,7 @@ static inline int32_t __extract_element(__vec16_i32, int); /////////////////////////////////////////////////////////////////////////// // macros... 
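/* the knc::macro tags added below record which generic scalar-loop macros are still
   instantiated after the native rewrites; e.g. BINARY_OP(__vec16_i64, __mul, *) stays
   live as the fallback when icc < 14.0.0 lacks _mm512_mullox_epi64 */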
+/* knc::macro::not used */ #define UNARY_OP(TYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE v) { \ TYPE ret; \ @@ -273,6 +274,7 @@ static FORCEINLINE TYPE NAME(TYPE v) { \ return ret; \ } +/* knc::macro::used */ #define BINARY_OP(TYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ TYPE ret; \ @@ -281,6 +283,7 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } +/* knc::macro::used */ #define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ TYPE ret; \ @@ -289,6 +292,7 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } +/* knc::macro::used */ #define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ TYPE ret; \ @@ -297,6 +301,7 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } +/* knc::macro::used */ #define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ __vec16_i1 ret; \ @@ -315,6 +320,7 @@ static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ return ret; \ } +/* knc::macro::used */ #define INSERT_EXTRACT(VTYPE, STYPE) \ static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ return ((STYPE *)&v)[index]; \ @@ -323,6 +329,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ ((STYPE *)v)[index] = val; \ } +/* knc::macro::used */ #define LOAD_STORE(VTYPE, STYPE) \ template \ static FORCEINLINE VTYPE __load(const VTYPE *p) { \ @@ -339,24 +346,7 @@ static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ ptr[i] = v[i]; \ } -#define LOADS(VTYPE, STYPE) \ -template \ -static FORCEINLINE VTYPE __load(const VTYPE *p) { \ - STYPE *ptr = (STYPE *)p; \ - VTYPE ret; \ - for (int i = 0; i < 16; ++i) \ - ret[i] = ptr[i]; \ - return ret; \ -} \ - -#define STORES(VTYPE, STYPE) \ -template \ -static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ - STYPE *ptr = (STYPE *)p; \ - for (int i = 0; i < 16; ++i) \ - ptr[i] = v[i]; \ -} - +/* knc::macro::used */ #define REDUCE_ADD(TYPE, VTYPE, NAME) \ static FORCEINLINE TYPE NAME(VTYPE v) { \ TYPE ret = v[0]; \ @@ -365,6 +355,7 @@ static FORCEINLINE TYPE NAME(VTYPE v) { \ return ret; \ } +/* knc::macro::used */ #define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(VTYPE v) { \ TYPE ret = v[0]; \ @@ -373,6 +364,7 @@ static FORCEINLINE TYPE NAME(VTYPE v) { \ return ret; \ } +/* knc::macro::used */ #define SELECT(TYPE) \ static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \ TYPE ret; \ @@ -384,6 +376,7 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \ return cond ? 
a : b; \ } +/* knc::macro::used */ #define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \ TYPE ret; \ @@ -392,6 +385,7 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \ return ret; \ } +/* knc::macro::used */ #define SMEAR(VTYPE, NAME, STYPE) \ template VTYPE __smear_##NAME(STYPE); \ template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ @@ -401,6 +395,7 @@ template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ return ret; \ } +/* knc::macro::used */ #define SETZERO(VTYPE, NAME) \ template VTYPE __setzero_##NAME(); \ template <> FORCEINLINE VTYPE __setzero_##NAME() { \ @@ -410,12 +405,14 @@ template <> FORCEINLINE VTYPE __setzero_##NAME() { \ return ret; \ } +/* knc::macro::used */ #define UNDEF(VTYPE, NAME) \ template VTYPE __undef_##NAME(); \ template <> FORCEINLINE VTYPE __undef_##NAME() { \ return VTYPE(); \ } +/* knc::macro::used */ #define BROADCAST(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ VTYPE ret; \ @@ -424,6 +421,7 @@ static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ return ret; \ } \ +/* knc::macro::used */ #define ROTATE(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ VTYPE ret; \ @@ -432,6 +430,7 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ return ret; \ } \ +/* knc::macro::used */ #define SHUFFLES(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ VTYPE ret; \ @@ -448,16 +447,6 @@ static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index return ret; \ } -#define SHUFFLE2(VTYPE, NAME, STYPE) \ -static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \ - VTYPE ret; \ - for (int i = 0; i < 16; ++i) { \ - int ii = __extract_element(index, i) & 0x1f; \ - ret[i] = (ii < 16) ? 
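/* SHUFFLE2 is redundant now: the SHUFFLES macro above already emits both
   __shuffle_##NAME and __shuffle2_##NAME with this same two-source lane rule */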
v0[ii] : v1[ii-16]; \ - } \ - return ret; \ -} - /////////////////////////////////////////////////////////////////////////// INSERT_EXTRACT(__vec1_i8, int8_t) @@ -724,9 +713,9 @@ static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) { - // this intrinsic doesn't exist :S - // return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); - // use knc.h implementation +#if __ICC >= 99999 /* compiler gate, icc >= 99999 will hopefully support _mm512_sub_epi64 */ + return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); +#else const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i64 ret; @@ -734,6 +723,7 @@ static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) ret.v_lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); ret.v_hi = _mm512_sbb_epi32 (a.v_hi, borrow, b.v_hi, &borrow); return ret.cvt2zmm(); +#endif } static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b) @@ -744,11 +734,15 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } +#if __ICC >= 1400 /* compiler gate, icc >= 14.0.0 support _mm512_mullox_epi64 */ +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) +{ + return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); +} +#else /* __ICC >= 1400 */ +#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { -#if __ICC >= 1400 - return __vec16_i64(_mm512_mullox_epi64(_a.v1,_b.v1), _mm512_mullox_epi64(_a.v2,_b.v2)); -#else const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); @@ -759,8 +753,11 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); return __vec16_i64(hi,lo).cvt2zmm(); -#endif } +#else +BINARY_OP(__vec16_i64, __mul, *) +#endif +#endif /* __ICC >= 1400 */ static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } @@ -782,9 +779,7 @@ static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __v static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } -#if 1 -BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) -#else /* knc::fails ./tests/idiv.ispc */ +#if 0 /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -800,12 +795,11 @@ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); return __vec16_i64(hi,lo).cvt2zmm(); } - +#else +BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) #endif -#if 1 -BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) -#else /* 
knc::fails ./tests/idiv.ispc */ +#if 0 /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -816,16 +810,15 @@ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); return __vec16_i64(hi,lo).cvt2zmm(); } +#else +BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) #endif SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) -#if 1 -CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) -#else /* knc::fails ./tests/reduce-equal-8.ispc , knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ +#if 0 /* knc::fails ./tests/reduce-equal-8.ispc , knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); @@ -850,6 +843,9 @@ static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i6 { return __and(__not(__equal_i64(a,b)), mask); } +#else +CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) #endif @@ -1037,7 +1033,7 @@ template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { return _mm512_load_ps(p); } -/* this one doesn't fail but it is commented out for completenes, no aligned load/stores */ +/* this one doesn't fail but it is commented out for completeness, no aligned load/stores */ template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { _mm512_store_ps(p, v); @@ -1189,303 +1185,110 @@ static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) } -#if 0 /* evghenii::double */ /////////////////////////////////////////////////////////////////////////// // double /////////////////////////////////////////////////////////////////////////// -BINARY_OP(__vec16_d, __add, +) -BINARY_OP(__vec16_d, __sub, -) -BINARY_OP(__vec16_d, __mul, *) -BINARY_OP(__vec16_d, __div, /) +#define VECOP(OP) __vec16_d(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { return VECOP(add_pd); } +static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { return VECOP(sub_pd); } +static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { return VECOP(mul_pd); } +static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { return VECOP(div_pd); } +#undef VECOP -CMP_OP(__vec16_d, double, double, __equal, ==) -CMP_OP(__vec16_d, double, double, __not_equal, !=) -CMP_OP(__vec16_d, double, double, __less_than, <) -CMP_OP(__vec16_d, double, double, __less_equal, <=) -CMP_OP(__vec16_d, double, double, __greater_than, >) -CMP_OP(__vec16_d, double, double, __greater_equal, >=) +#define CMPOP(OP) _mm512_kmovlhb(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmple_pd_mask); } +static 
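/* CMPOP sketch: each 8-wide pd compare fills bits 0..7 of a mask; assuming
   _mm512_kmovlhb(k1,k2) packs k2's low byte into k1's high byte, the combined
   result is lanes 0..7 from v1 plus lanes 8..15 from v2 in one 16-bit mask */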
FORCEINLINE __vec16_i1 __greater_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpnle_pd_mask); }
+static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { return CMPOP(cmpnlt_pd_mask); }
+static FORCEINLINE __vec16_i1 __ordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpord_pd_mask); }
+static FORCEINLINE __vec16_i1 __unordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpunord_pd_mask); }
+#undef CMPOP
-static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
-  __vec16_i1 ret;
-  ret.v = 0;
-  for (int i = 0; i < 16; ++i)
-    ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0;
-  return ret;
+#define CMPOPMASK(OP) _mm512_kmovlhb(_mm512_mask_##OP(m,a.v1,b.v1),_mm512_mask_##OP(_mm512_kswapb(m,m),a.v2,b.v2))
+static FORCEINLINE __vec16_i1 __equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpeq_pd_mask); }
+static FORCEINLINE __vec16_i1 __not_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpneq_pd_mask); }
+static FORCEINLINE __vec16_i1 __less_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmplt_pd_mask); }
+static FORCEINLINE __vec16_i1 __less_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmple_pd_mask); }
+static FORCEINLINE __vec16_i1 __greater_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnle_pd_mask); }
+static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnlt_pd_mask); }
+#undef CMPOPMASK
+
+
+static FORCEINLINE __vec16_d __select(__vec16_i1 m, __vec16_d a, __vec16_d b)
+{
+  return __vec16_d(_mm512_mask_mov_pd(b.v1, m, a.v1), _mm512_mask_mov_pd(b.v2, _mm512_kswapb(m, m), a.v2));
}
-
-static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
-  __vec16_i1 ret;
-  ret.v = 0;
-  for (int i = 0; i < 16; ++i)
-    ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; - return ret; -} - -#if 0 - case Instruction::FRem: intrinsic = "__frem"; break; -#endif - -SELECT(__vec16_d) -INSERT_EXTRACT(__vec16_d, double) -SMEAR(__vec16_d, double, double) -SETZERO(__vec16_d, double) -UNDEF(__vec16_d, double) -BROADCAST(__vec16_d, double, double) -ROTATE(__vec16_d, double, double) -SHUFFLES(__vec16_d, double, double) -LOAD_STORE(__vec16_d, double) -#else /* evghenii::double */ -/////////////////////////////////////////////////////////////////////////// -// double -/////////////////////////////////////////////////////////////////////////// - -static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_add_pd(a.v1, b.v1); - ret.v2 = _mm512_add_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_sub_pd(a.v1, b.v1); - ret.v2 = _mm512_sub_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_mul_pd(a.v1, b.v1); - ret.v2 = _mm512_mul_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_div_pd(a.v1, b.v1); - ret.v2 = _mm512_div_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); - __vec16_i1 tmp_m = m; - ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, 
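/* _mm512_kswapb(m,m) swaps the two byte halves of the mask, moving the lanes-8..15
   bits into the low byte where the masked compare on v2 expects them */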
b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) { - __vec16_d ret; - __vec16_i1 tmp_m = mask; - ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); - ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); - return ret; -} - - -static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) { +static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) +{ return cond ? 
a : b; }
-static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) {
-    return ((double *)&v)[index];
-}
-
-static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) {
-    ((double *)v)[index] = val;
-}
+static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; }
+static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; }

 template <class RetVecType> RetVecType __smear_double(double d);
-template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) {
-    __vec16_d ret;
-    ret.v1 = _mm512_set1_pd(d);
-    ret.v2 = _mm512_set1_pd(d);
-    return ret;
-}
+template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }

 template <class RetVecType> RetVecType __setzero_double();
-template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() {
-    __vec16_d ret;
-    ret.v1 = _mm512_setzero_pd();
-    ret.v2 = _mm512_setzero_pd();
-    return ret;
-}
+template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }

 template <class RetVecType> RetVecType __undef_double();
-template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() {
-    return __vec16_d();
-}
+template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }

-static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) {
-    __vec16_d ret;
-    double val = __extract_element(v, index & 0xf);
-    ret.v1 = _mm512_set1_pd(val);
-    ret.v2 = _mm512_set1_pd(val);
-    return ret;
+static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index)
+{
+    __vec16_d ret;
+    double val = __extract_element(v, index & 0xf);
+    ret.v1 = _mm512_set1_pd(val);
+    ret.v2 = _mm512_set1_pd(val);
+    return ret;
 }

 ROTATE(__vec16_d, double, double)
 SHUFFLES(__vec16_d, double, double)

-template <int ALIGN> static FORCEINLINE __vec16_d __load(const __vec16_d *p) {
-    __vec16_d ret;
-    ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
-    ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
-    ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
-    ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
-    return ret;
+template <int ALIGN> static FORCEINLINE __vec16_d __load(const __vec16_d *p)
+{
+    __vec16_d ret;
+    ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+    ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+    ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+    ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+    return ret;
 }

-template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) {
-    _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
-    _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
-    _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
-    _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
+{
+    _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2,
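/* Illustrative sketch, not from the header: what the unaligned __load/__store
   pair above moves, expressed with plain memcpy. Sixteen doubles span 128
   bytes, so each __m512d half is assembled from two 64-byte cache lines,
   which is why every half needs both an extloadunpacklo and an
   extloadunpackhi. Ref16d and the helper names are invented here. */
#include <cstring>

struct Ref16d { double v1[8]; double v2[8]; };   // stand-in for __vec16_d

static inline Ref16d ref_load16d(const void *p) {
    Ref16d r;
    std::memcpy(r.v1, p, 64);                       // lanes 0..7
    std::memcpy(r.v2, (const char *)p + 64, 64);    // lanes 8..15
    return r;
}

static inline void ref_store16d(void *p, const Ref16d &v) {
    std::memcpy(p, v.v1, 64);
    std::memcpy((char *)p + 64, v.v2, 64);
}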
_MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); } -#if 0 -template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { - __vec16_d ret; - ret.v1 = _mm512_load_pd(p); - ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); - return ret; +#if 1 +template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) +{ + return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); } -template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { - return __load<64>(p); -} -template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { - _mm512_store_pd(p, v.v1); - _mm512_store_pd(((uint8_t*)p)+64, v.v2); -} -template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { - __store<64>(p, v); +template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) +{ + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); } +template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } +template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } #endif -#endif /* evghenii::double */ /////////////////////////////////////////////////////////////////////////// // casts +/////////////////////////////////////////////////////////////////////////// +/* knc::macro::used */ #define CAST(TO, STO, FROM, SFROM, FUNC) \ static FORCEINLINE TO FUNC(TO, FROM val) { \ TO ret; \ @@ -1495,13 +1298,13 @@ static FORCEINLINE TO FUNC(TO, FROM val) { \ } // sign extension conversions -#if 1 -CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) -#else /* evghenii::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... and others */ +#if 0 /* knc::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... 
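/* Illustrative sketch, not from the header: the template-on-alignment
   dispatch used by __load<ALIGN>/__store<ALIGN>. The generic body makes no
   alignment assumption; a full specialization for 64-byte-aligned data can
   use the aligned vector instructions, as __load<64>/__store<64> do above.
   Helper names are invented here. */
#include <cstring>

template <int ALIGN> static inline void ref_load64bytes(double *dst, const void *p) {
    std::memcpy(dst, p, 64);              // generic path, any address is fine
}
template <> inline void ref_load64bytes<64>(double *dst, const void *p) {
    // 64-byte-aligned path; a real implementation would issue aligned
    // vector loads here (cf. _mm512_load_pd in __load<64> above).
    std::memcpy(dst, p, 64);
}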
and others */ static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) { return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); } +#else +CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) #endif CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) @@ -1509,6 +1312,7 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) +/* knc::macro::used */ #define CAST_SEXT_I1(TYPE) \ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ TYPE ret; \ @@ -1522,34 +1326,31 @@ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ CAST_SEXT_I1(__vec16_i8) CAST_SEXT_I1(__vec16_i16) -#if 0 -CAST_SEXT_I1(__vec16_i32) -#else + +//CAST_SEXT_I1(__vec16_i32) static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) { __vec16_i32 ret = _mm512_setzero_epi32(); __vec16_i32 one = _mm512_set1_epi32(-1); return _mm512_mask_mov_epi32(ret, val, one); } -#endif + CAST_SEXT_I1(__vec16_i64) // zero extension -#if 0 -CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) -#else +// CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) { return __vec16_i64(_mm512_setzero_epi32(), val.v).cvt2zmm(); } -#endif CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) +/* knc::macro::used */ #define CAST_ZEXT_I1(TYPE) \ static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ TYPE ret; \ @@ -1560,16 +1361,15 @@ static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ CAST_ZEXT_I1(__vec16_i8) CAST_ZEXT_I1(__vec16_i16) -#if 0 -CAST_ZEXT_I1(__vec16_i32) -#else + +//CAST_ZEXT_I1(__vec16_i32) static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) { - __vec16_i32 ret = _mm512_setzero_epi32(); - __vec16_i32 one = _mm512_set1_epi32(1); - return _mm512_mask_mov_epi32(ret, val, one); + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); } -#endif + CAST_ZEXT_I1(__vec16_i64) // truncations @@ -1581,170 +1381,160 @@ CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) // signed int to float/double -#if 0 -CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) -CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) -CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) -#else + +//CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepi32_ps(val, 
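/* Scalar reference, not from the header, for the mask-to-vector casts just
   above: the sign-extending flavor expands mask bit i to 0 or -1 (all ones),
   the zero-extending flavor to 0 or 1, exactly what the masked move of
   set1(-1) or set1(1) over a zeroed vector computes. Helper names invented. */
#include <cstdint>

static inline void ref_mask_sext_i32(uint16_t mask, int32_t out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = (mask & (1u << i)) ? -1 : 0;   // mask_mov of set1(-1) over zeros
}

static inline void ref_mask_zext_i32(uint16_t mask, int32_t out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = (mask & (1u << i)) ? 1 : 0;    // mask_mov of set1(1) over zeros
}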
_MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} -#endif + CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) -#if 0 -CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) -CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) -CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) -#else -static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; + +//CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) { - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(val); - __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } -#endif + CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) // unsigned int to float/double -#if 0 -CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) -CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) -CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) -#else + +// CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepu32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} -#endif + CAST(__vec16_f, float, __vec16_i64, uint64_t, __cast_uitofp) -#if 0 -CAST(__vec16_d, double, __vec16_i8, 
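/* Scalar reference, not from the header, for the int32-to-double conversions
   above: _mm512_cvtepi32lo_pd converts only the low eight 32-bit lanes, so
   the upper eight are first swung into the low position by the _MM_PERM_DCDC
   128-bit permute and converted separately. Helper name invented. */
#include <cstdint>

static inline void ref_sitofp_i32_to_d16(const int32_t in[16], double out[16]) {
    int32_t upper_lo[8];
    for (int i = 0; i < 8; ++i)
        upper_lo[i] = in[8 + i];           // _MM_PERM_DCDC: lanes 8..15 moved down
    for (int i = 0; i < 8; ++i) {
        out[i]     = (double)in[i];        // v1 = cvtepi32lo_pd(val)
        out[8 + i] = (double)upper_lo[i];  // v2 = cvtepi32lo_pd(permuted val)
    }
}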
uint8_t, __cast_uitofp) -CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) -CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) -#else -static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepu32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepu32lo_pd(other8); - return ret; + +// CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepu32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepu32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) { - __vec16_d ret; - ret.v1 = _mm512_cvtepu32lo_pd(val); - __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepu32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; } -#endif + CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) -#if 0 -static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { - __vec16_f ret; - for (int i = 0; i < 16; ++i) - ret[i] = (v.v & (1 << i)) ? 1. 
: 0.; - return ret; -} -#else static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { - const __m512 ret = _mm512_setzero_ps(); - const __m512 one = _mm512_set1_ps(1.0); - return _mm512_mask_mov_ps(ret, v, one); + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v, one); } -#endif // float/double to signed int CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi) CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi) -#if 0 -CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) -#else -static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) { + +// CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) +static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) +{ return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); } -#endif + CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi) CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi) CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi) -#if 1 -CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) +#if 0 /* knc::2implement */ #else +CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) #endif CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi) // float/double to unsigned int CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui) CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui) -#if 0 -CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) -#else -static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) { + +// CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) +static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) +{ return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); } -#endif + CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui) CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui) CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui) -#if 1 -CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) +#if 0 /* knc::2implement */ #else +CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) #endif CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) // float/double conversions -#if 0 -CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) -#else -static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { - __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); - __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); - return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); -} -#endif +// CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) +static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) +{ + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); -#if 0 -CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) -#else -static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { - __vec16_d ret; - ret.v1 = _mm512_cvtpslo_pd(val.v); - __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC)); - ret.v2 = _mm512_cvtpslo_pd(other8); - return ret; + return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); +} + +// CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, 
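/* One-lane reference, not from the header, for the float-to-int casts above:
   they request _MM_ROUND_MODE_TOWARD_ZERO, i.e. C-style truncation, unlike
   the int-to-float direction, which rounds to nearest. */
#include <cmath>
#include <cstdint>

static inline int32_t ref_fptosi(float f) {
    return (int32_t)std::trunc(f);   // toward zero: -1.7f -> -1, 1.7f -> 1
}
// For contrast, round-to-nearest would give std::nearbyint(-1.7f) == -2.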
__vec16_f val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtpslo_pd(val.v); + __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC)); + ret.v2 = _mm512_cvtpslo_pd(other8); + return ret; } -#endif typedef union { int32_t i32; @@ -1753,6 +1543,7 @@ typedef union { double d; } BitcastUnion; +/* knc::macro::not used */ #define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ static FORCEINLINE TO __cast_bits(TO, FROM val) { \ TO r; \ @@ -1764,30 +1555,17 @@ static FORCEINLINE TO __cast_bits(TO, FROM val) { \ return r; \ } -#if 0 -CAST_BITS(__vec16_f, f, __vec16_i32, i32) -CAST_BITS(__vec16_i32, i32, __vec16_f, f) -#else -static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { - return _mm512_castsi512_ps(val); -} -static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { - return _mm512_castps_si512(val); -} -#endif +// CAST_BITS(__vec16_f, f, __vec16_i32, i32) +static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { return _mm512_castsi512_ps(val); } +// CAST_BITS(__vec16_i32, i32, __vec16_f, f) +static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { return _mm512_castps_si512(val); } -#if 0 -CAST_BITS(__vec16_d, d, __vec16_i64, i64) -CAST_BITS(__vec16_i64, i64, __vec16_d, d) -#else -static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { - return *(__vec16_i64*)&val; -} -static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { - return *(__vec16_d*)&val; -} -#endif +// CAST_BITS(__vec16_d, d, __vec16_i64, i64) +static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { return *(__vec16_i64*)&val; } +// CAST_BITS(__vec16_i64, i64, __vec16_d, d) +static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { return *(__vec16_d*)&val; } +/* knc::macro::used */ #define CAST_BITS_SCALAR(TO, FROM) \ static FORCEINLINE TO __cast_bits(TO, FROM v) { \ union { \ @@ -1809,6 +1587,7 @@ CAST_BITS_SCALAR(double, int64_t) /////////////////////////////////////////////////////////////////////////// // various math functions +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE void __fastmath() { } @@ -1837,168 +1616,100 @@ static FORCEINLINE double __ceil_uniform_double(double v) { return ceil(v); } -#if 0 -UNARY_OP(__vec16_f, __round_varying_float, roundf) -UNARY_OP(__vec16_f, __floor_varying_float, floorf) -UNARY_OP(__vec16_f, __ceil_varying_float, ceilf) -#else -static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) { - return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); -} - -static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) { - return _mm512_floor_ps(v); -} - -static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) { - return _mm512_ceil_ps(v); -} -#endif - -#if 0 -UNARY_OP(__vec16_d, __round_varying_double, round) -UNARY_OP(__vec16_d, __floor_varying_double, floor) -UNARY_OP(__vec16_d, __ceil_varying_double, ceil) -#else -static FORCEINLINE __vec16_d __round_varying_float(__vec16_d v) { - __vec16_d ret; - ret.v1 = _mm512_svml_round_pd(v.v1); - ret.v2 = _mm512_svml_round_pd(v.v2); - return ret; -} - -static FORCEINLINE __vec16_d __floor_varying_float(__vec16_d v) { - __vec16_d ret; - ret.v1 = _mm512_floor_pd(v.v1); - ret.v2 = _mm512_floor_pd(v.v2); - return ret; -} - -static FORCEINLINE __vec16_d __ceil_varying_float(__vec16_d v) { - __vec16_d ret; - ret.v1 = _mm512_ceil_pd(v.v1); - ret.v2 = _mm512_ceil_pd(v.v2); - return ret; -} -#endif +static FORCEINLINE 
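/* Sketch, not from the header: the 64-bit __cast_bits above reinterprets the
   bytes through a pointer cast. A strict-aliasing-safe scalar equivalent
   goes through memcpy, which compilers fold to a plain register move; the
   helper name is invented. */
#include <cstdint>
#include <cstring>

static inline uint64_t ref_cast_bits_double_to_u64(double d) {
    uint64_t u;
    std::memcpy(&u, &d, sizeof u);   // same bytes, new type, no aliasing UB
    return u;
}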
__vec16_f __round_varying_float(__vec16_f v) { return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); }
+static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) { return _mm512_floor_ps(v); }
+static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) { return _mm512_ceil_ps(v); }
+static FORCEINLINE __vec16_d __round_varying_float(__vec16_d v) { return __vec16_d(_mm512_svml_round_pd(v.v1), _mm512_svml_round_pd(v.v2)); }
+static FORCEINLINE __vec16_d __floor_varying_float(__vec16_d v) { return __vec16_d(_mm512_floor_pd(v.v1), _mm512_floor_pd(v.v2)); }
+static FORCEINLINE __vec16_d __ceil_varying_float(__vec16_d v) { return __vec16_d(_mm512_ceil_pd(v.v1), _mm512_ceil_pd(v.v2)); }

 // min/max

-static FORCEINLINE float __min_uniform_float(float a, float b) { return (a<b) ? a : b; }
-static FORCEINLINE float __max_uniform_float(float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE float __min_uniform_float (float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float __max_uniform_float (float a, float b) { return (a>b) ? a : b; }
 static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
 static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }

-static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (a<b) ? a : b; }
-static FORCEINLINE int32_t __max_uniform_int32(int32_t a, int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int32_t __min_uniform_int32 ( int32_t a,  int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32 ( int32_t a,  int32_t b) { return (a>b) ? a : b; }
 static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
 static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }

-static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (a<b) ? a : b; }
-static FORCEINLINE int64_t __max_uniform_int64(int64_t a, int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int64_t __min_uniform_int64 ( int64_t a,  int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64 ( int64_t a,  int64_t b) { return (a>b) ? a : b; }
 static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
 static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ? a : b; }

-
-#if 0
-BINARY_OP_FUNC(__vec16_f, __max_varying_float, __max_uniform_float)
-BINARY_OP_FUNC(__vec16_f, __min_varying_float, __min_uniform_float)
-BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double)
-BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double)
-#else
 static FORCEINLINE __vec16_f __max_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmax_ps(v1, v2);}
 static FORCEINLINE __vec16_f __min_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2);}
 static FORCEINLINE __vec16_d __max_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmax_pd(v1.v1, v2.v1),_mm512_gmax_pd(v1.v2,v2.v2));}
 static FORCEINLINE __vec16_d __min_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmin_pd(v1.v1, v2.v1),_mm512_gmin_pd(v1.v2,v2.v2));}
-#endif

-#if 0
-BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32)
-BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32)
-BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32)
-BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32)
-#else
 static FORCEINLINE __vec16_i32 __max_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epi32(v1, v2);}
 static FORCEINLINE __vec16_i32 __min_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epi32(v1, v2);}
 static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epu32(v1, v2);}
 static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epu32(v1, v2);}
-#endif

-BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
-BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
+BINARY_OP_FUNC(__vec16_i64, __max_varying_int64,  __max_uniform_int64)
+BINARY_OP_FUNC(__vec16_i64, __min_varying_int64,  __min_uniform_int64)
 BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64,
__max_uniform_uint64) BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64) // sqrt/rsqrt/rcp -static FORCEINLINE float __rsqrt_uniform_float(float v) { - return 1.f / sqrtf(v); -} +static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE float __rcp_uniform_float (float v) { return 1.f / v; } +static FORCEINLINE float __sqrt_uniform_float (float v) { return sqrtf(v); } +static FORCEINLINE double __sqrt_uniform_double(double v) { return sqrt (v); } -static FORCEINLINE float __rcp_uniform_float(float v) { - return 1.f / v; -} - -static FORCEINLINE float __sqrt_uniform_float(float v) { - return sqrtf(v); -} - -static FORCEINLINE double __sqrt_uniform_double(double v) { - return sqrt(v); -} - -#if 0 -UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) -UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) -UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) -UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) -#else -static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { +static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) +{ #ifdef ISPC_FAST_MATH - return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. + return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. #else - return _mm512_recip_ps(v); + return _mm512_recip_ps(v); #endif } -static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { +static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) +{ #ifdef ISPC_FAST_MATH - return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy #else - return _mm512_invsqrt_ps(v); + return _mm512_invsqrt_ps(v); #endif } -static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} -static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} -#endif +static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} +static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} /////////////////////////////////////////////////////////////////////////// // svml /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_f __svml_sinf(__vec16_f v) { return _mm512_sin_ps(v); } -static FORCEINLINE __vec16_f __svml_asinf(__vec16_f v) { return _mm512_asin_ps(v); } -static FORCEINLINE __vec16_f __svml_cosf(__vec16_f v) { return _mm512_cos_ps(v); } -static FORCEINLINE __vec16_f __svml_tanf(__vec16_f v) { return _mm512_tan_ps(v); } -static FORCEINLINE __vec16_f __svml_atanf(__vec16_f v) { return _mm512_atan_ps(v); } +static FORCEINLINE __vec16_f __svml_sinf (__vec16_f v) { return _mm512_sin_ps(v); } +static FORCEINLINE __vec16_f __svml_asinf (__vec16_f v) { return _mm512_asin_ps(v); } +static FORCEINLINE __vec16_f __svml_cosf (__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_tanf (__vec16_f v) { return _mm512_tan_ps(v); } +static FORCEINLINE __vec16_f __svml_atanf (__vec16_f v) { return _mm512_atan_ps(v); } static FORCEINLINE __vec16_f __svml_atan2f(__vec16_f a, __vec16_f b) { return _mm512_atan2_ps(a,b); } -static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } -static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } -static FORCEINLINE __vec16_f 
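/* Sketch, not from the header: the ISPC_FAST_MATH path above returns the
   hardware's 23-bit reciprocal estimate directly. When more accuracy is
   needed, one standard Newton-Raphson step roughly squares the relative
   error of an estimate; a scalar version of that step: */
static inline float ref_rcp_refine(float v, float iv /* initial 1/v estimate */) {
    return iv * (2.0f - v * iv);     // one N-R iteration for the reciprocal
}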
__svml_powf(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } +static FORCEINLINE __vec16_f __svml_expf (__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_logf (__vec16_f v) { return _mm512_log_ps(v); } +static FORCEINLINE __vec16_f __svml_powf (__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } -static FORCEINLINE __vec16_d __svml_sind(__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_asind(__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_cosd(__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_tand(__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_atand(__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_sind (__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_asind (__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_cosd (__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_tand (__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atand (__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } static FORCEINLINE __vec16_d __svml_atan2d(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_atan2_pd(a.v1,b.v1), _mm512_atan2_pd(a.v2,b.v2)); } -static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_powd(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } +static FORCEINLINE __vec16_d __svml_expd (__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_logd (__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_powd (__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } /////////////////////////////////////////////////////////////////////////// // bit ops +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { int count = 0; @@ -2064,42 +1775,23 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +/////////////////////////////////////////////////////////////////////////// -#if 0 -REDUCE_ADD(float, __vec16_f, __reduce_add_float) -REDUCE_MINMAX(float, __vec16_f, __reduce_min_float, <) -REDUCE_MINMAX(float, __vec16_f, __reduce_max_float, >) -#else static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); } static FORCEINLINE float __reduce_min_float(__vec16_f v) { return _mm512_reduce_min_ps(v); } static FORCEINLINE float __reduce_max_float(__vec16_f v) { return _mm512_reduce_max_ps(v); } -#endif -#if 0 -REDUCE_ADD(double, __vec16_d, __reduce_add_double) -REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) -REDUCE_MINMAX(double, 
__vec16_d, __reduce_max_double, >)
-#else
 static FORCEINLINE double __reduce_add_double(__vec16_d v) { return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); }
 static FORCEINLINE double __reduce_min_double(__vec16_d v) { return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); }
 static FORCEINLINE double __reduce_max_double(__vec16_d v) { return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); }
-#endif

-#if 0
-REDUCE_ADD   (int64_t,  __vec16_i32, __reduce_add_int32)
-REDUCE_MINMAX(int32_t,  __vec16_i32, __reduce_min_int32, <)
-REDUCE_MINMAX(int32_t,  __vec16_i32, __reduce_max_int32, >)
-REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <)
-REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >)
-#else
 static FORCEINLINE  int64_t __reduce_add_int32  (__vec16_i32 v) { return _mm512_reduce_add_epi32(v);}
 static FORCEINLINE  int32_t __reduce_min_int32  (__vec16_i32 v) { return _mm512_reduce_min_epi32(v);}
 static FORCEINLINE  int32_t __reduce_max_int32  (__vec16_i32 v) { return _mm512_reduce_max_epi32(v);}
 static FORCEINLINE uint32_t __reduce_min_uint32 (__vec16_i32 v) { return _mm512_reduce_min_epu32(v);}
 static FORCEINLINE uint32_t __reduce_max_uint32 (__vec16_i32 v) { return _mm512_reduce_max_epu32(v);}
-#endif

 REDUCE_ADD   ( int16_t, __vec16_i8,  __reduce_add_int8)
 REDUCE_ADD   ( int32_t, __vec16_i16, __reduce_add_int16)
@@ -2111,6 +1803,7 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)

 ///////////////////////////////////////////////////////////////////////////
 // masked load/store
+///////////////////////////////////////////////////////////////////////////

 static FORCEINLINE __vec16_i8 __masked_load_i8(void *p,
                                                __vec16_i1 mask) {
@@ -2132,53 +1825,31 @@ static FORCEINLINE __vec16_i16 __masked_load_i16(void *p,
     return ret;
 }

-#if 0
-static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
-                                                 __vec16_i1 mask) {
-    __vec16_i32 ret;
-    int32_t *ptr = (int32_t *)p;
-    for (int i = 0; i < 16; ++i)
-        if ((mask.v & (1 << i)) != 0)
-            ret[i] = ptr[i];
-    return ret;
-}
-#else
-static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) {
+static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask)
+{
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
-    return _mm512_mask_load_epi32(__vec16_i32(), mask, p);
+    return _mm512_mask_load_epi32(__vec16_i32(), mask, p);
 #else
-    __vec16_i32 tmp;
-    tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    __vec16_i32 ret;
-    return _mm512_mask_mov_epi32(ret.v, mask, tmp.v);
+    __vec16_i32 tmp;
+    tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    __vec16_i32 ret;
+    return _mm512_mask_mov_epi32(ret.v, mask, tmp.v);
 #endif
 }
-#endif

-#if 0
-static FORCEINLINE __vec16_f __masked_load_float(void *p,
-                                                 __vec16_i1 mask) {
-    __vec16_f ret;
-    float *ptr = (float *)p;
-    for (int i = 0; i < 16; ++i)
-        if ((mask.v & (1 << i)) != 0)
-            ret[i] = ptr[i];
-    return ret;
-}
-#else
-static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) {
+static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask)
+{
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
-    return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p);
+    return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p);
 #else
-    __vec16_f
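/* Scalar reference, not from the header, for the split double reductions
   above: reduce each eight-lane half on its own, then combine the two
   partial results with the same operator. Helper names invented. */
#include <algorithm>

static inline double ref_reduce_add_double16(const double v[16]) {
    double s1 = 0.0, s2 = 0.0;
    for (int i = 0; i < 8; ++i) { s1 += v[i]; s2 += v[8 + i]; }  // per-half sums
    return s1 + s2;                                              // combine halves
}

static inline double ref_reduce_min_double16(const double v[16]) {
    double m1 = v[0], m2 = v[8];
    for (int i = 1; i < 8; ++i) {
        m1 = std::min(m1, v[i]);
        m2 = std::min(m2, v[8 + i]);
    }
    return std::min(m1, m2);
}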
tmp; - tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - __vec16_f ret; - return _mm512_mask_mov_ps(ret.v, mask, tmp.v); + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); #endif } -#endif static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, __vec16_i1 mask) { @@ -2190,40 +1861,29 @@ static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, return ret; } -#if 0 -static FORCEINLINE __vec16_d __masked_load_double(void *p, - __vec16_i1 mask) { - __vec16_d ret; - double *ptr = (double *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ret[i] = ptr[i]; - return ret; -} -#else -static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) { +static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); - ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); - return ret; + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; #else - __vec16_d tmp; - tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); - ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); - return ret; + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; #endif } -#endif static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, @@ -2242,52 +1902,33 @@ static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, ptr[i] = val[i]; } -#if 0 -static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, - __vec16_i1 mask) { - int32_t *ptr = (int32_t *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; -} -#else -static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) { +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_epi32(p, mask, 
val.v); + _mm512_mask_store_epi32(p, mask, val.v); #else - __vec16_i32 tmp; - tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); - _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } -#endif -#if 0 -static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, - __vec16_i1 mask) { - float *ptr = (float *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; -} -#else -static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, - __vec16_i1 mask) { +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_ps(p, mask, val.v); + _mm512_mask_store_ps(p, mask, val.v); #else - __vec16_f tmp; - tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); - _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } -#endif static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, __vec16_i1 mask) { @@ -2297,39 +1938,29 @@ static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, ptr[i] = val[i]; } -#if 0 -static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, - __vec16_i1 mask) { - double *ptr = (double *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; -} -#else -static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, - __vec16_i1 mask) { +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - _mm512_mask_store_pd(p, mask, val.v1); - _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); #else - __vec16_d tmp; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = 
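/* Scalar model, not from the header, of the unaligned masked-store pattern
   above: read the whole 16-lane block, blend the new values in under the
   mask, and write the whole block back. Note the read-modify-write also
   rewrites lanes whose mask bit is off, which is the difference from the
   __masked_store_blend_* entry points further below. Helper name invented. */
#include <cstdint>

static inline void ref_masked_store_i32(int32_t *p, const int32_t val[16], uint16_t mask) {
    int32_t tmp[16];
    for (int i = 0; i < 16; ++i) tmp[i] = p[i];            // extloadunpack lo/hi
    for (int i = 0; i < 16; ++i)
        if (mask & (1u << i)) tmp[i] = val[i];             // mask_mov blend
    for (int i = 0; i < 16; ++i) p[i] = tmp[i];            // extpackstore lo/hi
}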
_mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); - tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); - _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); #endif } -#endif static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, __vec16_i1 mask) { @@ -2363,9 +1994,11 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, /////////////////////////////////////////////////////////////////////////// // gather/scatter +/////////////////////////////////////////////////////////////////////////// // offsets * offsetScale is in bytes (for all of these) +/* knc::macro::used */ #define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ OTYPE offset, __vec16_i1 mask) { \ @@ -2381,21 +2014,19 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) -#else +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - // (iw): need to temporarily store as int because gathers can only return ints. - __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, - _MM_UPCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); - // now, downconverting to chars into temporary char vector - __vec16_i8 ret; - _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + // (iw): need to temporarily store as int because gathers can only return ints. 
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } -#if 0 /* evghenii::fails on gather-int8-2 & gather-int8-4 */ +#if 0 /* knc::fails on gather-int8-2 & gather-int8-4 */ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2422,21 +2053,18 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ #else GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) #endif -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) -#else +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, - base, _MM_UPCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } -#if 0 /* evghenii::fails on gather-int32-2 & gather-int32-4 */ +#if 0 /* knc::fails on gather-int32-2 & gather-int32-4 */ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2464,18 +2092,15 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3 #else GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) #endif -#endif /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) -#else +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, - base, _MM_UPCONV_PS_NONE, scale, - _MM_HINT_NONE); + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); } -#if 0 /* evghenii::fails on gather-float-2 gather-float-4 & soa-14 */ +#if 0 /* knc::fails on gather-float-2 gather-float-4 & soa-14 */ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2503,30 +2128,27 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3 #else GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) #endif -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) 
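/* Scalar reference, not from the header, for the int8 gather above: KNC
   gathers always produce 32-bit lanes, so each value is fetched with a
   sign-extending up-conversion into an i32 vector and then down-converted
   to packed chars by a single converting store. Helper name invented. */
#include <cstdint>

static inline void ref_gather_offsets32_i8(const uint8_t *base, uint32_t scale,
                                           const int32_t offsets[16],
                                           uint16_t mask, int8_t out[16]) {
    int32_t tmp[16] = {0};                   // gather lands in 32-bit lanes first
    for (int i = 0; i < 16; ++i)
        if (mask & (1u << i))
            tmp[i] = *(const int8_t *)(base + (int64_t)offsets[i] * scale);  // UPCONV SINT8
    for (int i = 0; i < 16; ++i)
        out[i] = (int8_t)tmp[i];             // DOWNCONV EPI32 -> SINT8
}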
-#else +// GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - __vec16_d ret; - ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); - const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ - ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - return ret; + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; } -#endif GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) +/* knc::macro::used */ #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ VTYPE ret; \ @@ -2537,13 +2159,13 @@ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ } \ return ret; \ } +/* knc::macro::used */ #define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ return FUNC1(0, 1, ptrs, mask); \ } -#if 1 /***********/ GATHER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __gather32_i8, __gather_base_offsets32_i8) GATHER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __gather32_i16, __gather_base_offsets32_i16) @@ -2559,10 +2181,10 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64); GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float); GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double); /***********/ -#endif // scatter +/* knc::macro::used */ #define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ OTYPE offset, VTYPE val, \ @@ -2583,16 +2205,14 @@ SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64 SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) /*****************/ -#if 0 -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) -#else +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, - _MM_DOWNCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2614,19 +2234,16 
@@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc still_to_do = _mm512_kxor(match,still_to_do); } } -#endif /*****************/ -#if 0 -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) -#else +// SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, __vec16_f val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_ps(base, mask, offsets, val, - _MM_DOWNCONV_PS_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); } -#if 0 /* evghenii::fails on soa-10 & soa-13 , it is very similar to __scatter_base_offsets64_it32, but that passes tests, why ?!? */ +#if 0 /* knc::fails on soa-10 & soa-13 , it is very similar to __scatter_base_offsets64_it32, but that passes tests, why ?!? */ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2651,29 +2268,26 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t #else SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) #endif -#endif /*****************/ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) /*****************/ -#if 0 /* evghenii::to implement */ -SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) -#else /* evghenii:testme */ +// SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec16_i32 offsets, __vec16_d val, __vec16_i1 mask) { - _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, _MM_DOWNCONV_PD_NONE, scale, _MM_HINT_NONE); - __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); - const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ - _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, _MM_DOWNCONV_PD_NONE, scale, _MM_HINT_NONE); } -#endif SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) +/* knc::macro::used */ #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ VTYPE ret; \ @@ -2683,12 +2297,12 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ *ptr = val[i]; \ } \ } +/* knc::macro::used */ #define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ return FUNC1(0, 1, ptrs, val, mask); \ } -#if 1 /***********/ SCATTER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8, __scatter_base_offsets32_i8) SCATTER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16, __scatter_base_offsets32_i16) @@ -2704,109 +2318,47 @@ SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float) SCATTER_GENERAL(__vec16_i64, int64_t, 
__vec16_i64, __scatter64_i64) SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double) /***********/ -#endif /////////////////////////////////////////////////////////////////////////// // packed load/store +/////////////////////////////////////////////////////////////////////////// -#if 0 -static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec16_i32 *val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->operator[](i) = *ptr++; - ++count; - } - } - return count; -} -#endif -#if 0 -static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, - __vec16_i32 val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val[i]; - ++count; - } - } - return count; +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); } -#endif -#if 0 -static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, - __vec16_i32 *val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->operator[](i) = *ptr++; - ++count; - } - } - return count; +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); } -static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, - __vec16_i32 val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val[i]; - ++count; - } - } - return count; -} -#endif -#if 1 -static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, - __vec16_i1 mask) { - __vec16_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); } -#endif -#if 1 -static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, - __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return 
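_mm_countbits_32(uint32_t(mask)); }

/* A quick sanity sketch of the packed load/store semantics above (illustrative
   values only, assuming the documented KNC pack/unpack behavior): with
   mask = 0x5 (lanes 0 and 2 active), __packed_load_active reads the two
   consecutive values p[0] and p[1] and expands them into lanes 0 and 2 of
   *val, while __packed_store_active performs the inverse compression and
   writes those lanes back to p[0] and p[1]; both return
   _mm_countbits_32(0x5) == 2, the number of active lanes. */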
-#endif
-
-#if 1
-static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val,
-                                                __vec16_i1 mask) {
-    __vec16_i32 v = __load<64>(val);
-    v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    __store<64>(val, v);
-    return _mm_countbits_32(uint32_t(mask));
-}
-#endif
-
-#if 1
-static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val,
-                                                 __vec16_i1 mask) {
-    _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    return _mm_countbits_32(uint32_t(mask));
-}
-#endif

 ///////////////////////////////////////////////////////////////////////////
 // aos/soa
+///////////////////////////////////////////////////////////////////////////

 static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2,
                                             float *ptr) {
@@ -2848,6 +2400,7 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16

 ///////////////////////////////////////////////////////////////////////////
 // prefetch
+///////////////////////////////////////////////////////////////////////////

 static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) {
     _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$
@@ -2868,6 +2421,7 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {

 ///////////////////////////////////////////////////////////////////////////
 // atomics
+///////////////////////////////////////////////////////////////////////////

 static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) {
 #ifdef _MSC_VER

From 10223cfac3a8d0f5d80bd5eff095055e593764cd Mon Sep 17 00:00:00 2001
From: evghenii
Date: Sat, 5 Oct 2013 15:23:55 +0300
Subject: [PATCH 070/159] working on shuffle/rotate for double, there seems to
 be a bug in cvt2zmm cvt2hilo

---
 examples/intrinsics/knc-i1x16.h | 85 ++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 2 deletions(-)

diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h
index e712c969..807781f0 100644
--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -143,8 +143,14 @@ PRE_ALIGN(64) struct __vec16_f
 struct PRE_ALIGN(128) __vec16_d
 {
-    __m512d v1;
-    __m512d v2;
+    union {
+        __m512d v1;
+        __m512d v_hi;
+    };
+    union {
+        __m512d v2;
+        __m512d v_lo;
+    };
     FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {}
     FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {}
     FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {}
@@ -158,6 +164,40 @@ struct PRE_ALIGN(128) __vec16_d
     }
     FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; }
     FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; }
+    FORCEINLINE __vec16_d cvt2hilo() const
+    {
+        __m512i _hi, _lo;
+        _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
+                                           _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
+                                           _mm512_castpd_si512(v1));
+        _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF,
+                                           _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
+                                           _mm512_castpd_si512(v2));
+        _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
+                                           _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
+                                           _mm512_castpd_si512(v1));
+        _lo =
_mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(v2)); + return __vec16_d(_mm512_castsi512_pd(_hi), _mm512_castsi512_pd(_lo)); + } + FORCEINLINE __vec16_d cvt2zmm() const + { + __m512i _v1, _v2; + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + _mm512_castpd_si512(v_hi)); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + _mm512_castpd_si512(v_lo)); + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + _mm512_castpd_si512(v_hi)); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + _mm512_castpd_si512(v_lo)); + return __vec16_d(_mm512_castsi512_pd(_v1), _mm512_castsi512_pd(_v2)); + } } POST_ALIGN(128); struct PRE_ALIGN(128) __vec16_i64 @@ -1247,8 +1287,49 @@ static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) return ret; } +#define CASTD2F(_v_, _v_hi_, _v_lo_) \ + __vec16_f _v_hi_, _v_lo_; \ + { \ + const __vec16_d v = _v_.cvt2hilo(); \ + _v_hi_ = _mm512_castpd_ps(v.v_hi); \ + _v_lo_ = _mm512_castpd_ps(v.v_lo); } +#define CASTF2D(_ret_hi_, _ret_lo_) \ + __vec16_d(_mm512_castps_pd(_ret_hi_), _mm512_castps_pd(_ret_lo_)).cvt2zmm() + +#if 0 /* knc::testme there appears to be no tests in ./tests for checking this functionality */ +static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index) +{ +// return _v; /* this one passes all tests , but most not */ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __rotate_float(v_hi, index); + const __vec16_f ret_lo = __rotate_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +#else ROTATE(__vec16_d, double, double) +#endif + +#if 0 /* knc::fails ./tests/shuffle2-4.ispc ./tests/shuffle2-5.ispc */ +static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __shuffle_float(v_hi, index); + const __vec16_f ret_lo = __shuffle_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, const __vec16_i32 index) +{ + CASTD2F(_v0, v0_hi, v0_lo); + CASTD2F(_v1, v1_hi, v1_lo); + const __vec16_f ret_hi = __shuffle2_float(v0_hi, v1_hi, index); + const __vec16_f ret_lo = __shuffle2_float(v0_lo, v1_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +#else SHUFFLES(__vec16_d, double, double) +#endif +#undef CASTD2F +#undef CASTF2D template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ { From 1b196520f6877c14203e5bc88ab37db6deeb88a7 Mon Sep 17 00:00:00 2001 From: evghenii Date: Sat, 5 Oct 2013 22:10:05 +0300 Subject: [PATCH 071/159] knc-i1x16.h is cleaned: int32,float,double are complete, int64 is partially complete --- examples/intrinsics/knc-i1x16.h | 271 ++++++++++++++++---------------- 1 file changed, 133 insertions(+), 138 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 807781f0..fb2cf618 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -141,6 +141,37 @@ PRE_ALIGN(64) struct __vec16_f FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } } POST_ALIGN(64); +static void zmm2hilo(const __m512i v1, const __m512i v2, __m512i &_hi, __m512i &_lo) +{ + _hi = 
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); +} +static void hilo2zmm(const __m512i v_hi, const __m512i v_lo, __m512i &_v1, __m512i &_v2) +{ + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_hi); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_lo); + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_hi); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_lo); +} + struct PRE_ALIGN(128) __vec16_d { union { @@ -166,36 +197,18 @@ struct PRE_ALIGN(128) __vec16_d FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } FORCEINLINE __vec16_d cvt2hilo() const { + const __m512i _v1 = _mm512_castpd_si512(v1); + const __m512i _v2 = _mm512_castpd_si512(v2); __m512i _hi, _lo; - _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - _mm512_castpd_si512(v1)); - _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - _mm512_castpd_si512(v2)); - _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - _mm512_castpd_si512(v1)); - _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - _mm512_castpd_si512(v2)); + zmm2hilo(_v1, _v2, _hi, _lo); return __vec16_d(_mm512_castsi512_pd(_hi), _mm512_castsi512_pd(_lo)); } FORCEINLINE __vec16_d cvt2zmm() const { + const __m512i _hi = _mm512_castpd_si512(v_hi); + const __m512i _lo = _mm512_castpd_si512(v_lo); __m512i _v1, _v2; - _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - _mm512_castpd_si512(v_hi)); - _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - _mm512_castpd_si512(v_lo)); - _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - _mm512_castpd_si512(v_hi)); - _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - _mm512_castpd_si512(v_lo)); + hilo2zmm(_hi,_lo, _v1,_v2); return __vec16_d(_mm512_castsi512_pd(_v1), _mm512_castsi512_pd(_v2)); } } POST_ALIGN(128); @@ -226,38 +239,15 @@ struct PRE_ALIGN(128) __vec16_i64 FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; } FORCEINLINE __vec16_i64 cvt2hilo() const { - __m512i _hi, _lo; - _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - _lo = 
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
-                                           _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
-                                           v1);
-        _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF,
-                                           _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
-                                           v2);
-        return __vec16_i64(_hi, _lo);
+        __vec16_i64 ret;
+        zmm2hilo(v1,v2,ret.v_hi,ret.v_lo);
+        return ret;
     }
     FORCEINLINE __vec16_i64 cvt2zmm() const
     {
-        __m512i _v1, _v2;
-        _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
-                                           _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
-                                           v_hi);
-        _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555,
-                                           _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
-                                           v_lo);
-
-        _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
-                                           _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
-                                           v_hi);
-        _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555,
-                                           _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
-                                           v_lo);
-        return __vec16_i64(_v1, _v2);
+        __vec16_i64 ret;
+        hilo2zmm(v_hi,v_lo, ret.v1, ret.v2);
+        return ret;
     }
 } POST_ALIGN(128);
@@ -305,15 +295,6 @@ static inline int32_t __extract_element(__vec16_i32, int);
 ///////////////////////////////////////////////////////////////////////////
 // macros...

-/* knc::macro::not used */
-#define UNARY_OP(TYPE, NAME, OP)            \
-static FORCEINLINE TYPE NAME(TYPE v) {      \
-    TYPE ret;                               \
-    for (int i = 0; i < 16; ++i)            \
-        ret[i] = OP(v[i]);                  \
-    return ret;                             \
-}
-
 /* knc::macro::used */
 #define BINARY_OP(TYPE, NAME, OP)           \
 static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {  \
@@ -722,7 +703,7 @@ static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __
 template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p)
 {
-#ifdef ISPC_FORCE_ALIGNED_MEMORY
+#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED
     return __load<64>(p);
 #else
     __vec16_i32 v;
@@ -734,7 +715,7 @@ template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p)
 template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v)
 {
-#ifdef ISPC_FORCE_ALIGNED_MEMORY
+#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED
     __store<64>(p,v);
 #else
     _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
@@ -742,6 +723,17 @@ template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
 #endif
 }

+#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/foreach-26.ispc ./tests/foreach-27.ispc */
+template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
+{
+    return _mm512_load_epi32(p);
+}
+template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
+{
+    _mm512_store_epi32(p, v);
+}
+#endif
+
 ///////////////////////////////////////////////////////////////////////////
 // int64
 ///////////////////////////////////////////////////////////////////////////
@@ -783,8 +775,8 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b)
 #if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */
 static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b)
 {
-    const __vec16_i64 a = _a.cvt2hilo();
-    const __vec16_i64 b = _b.cvt2hilo();
+    __vec16_i64 a = _a.cvt2hilo();
+    __vec16_i64 b = _b.cvt2hilo();
     __vec16_i32 lo    = _mm512_mullo_epi32(a.v_lo,b.v_lo);
     __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo);
     __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo);
     __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi);
     __mmask16 carry = 0;
     __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry);
     __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry);
@@ -858,7 +850,6 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)

-#if 0 /* knc::fails ./tests/reduce-equal-8.ispc ,
knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); @@ -874,22 +865,14 @@ static FORCEINLINE __vec16_i1 __equal_i64_and_mask(__vec16_i64 _a, __vec16_i64 _ __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); return _mm512_kand(full_match, (__mmask16)mask); } - static FORCEINLINE __vec16_i1 __not_equal_i64(__vec16_i64 a, __vec16_i64 b) { - return __not(__equal_i64(a,b)); + return __not(__equal_i64(a,b)); } static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i64 b, __vec16_i1 mask) { - return __and(__not(__equal_i64(a,b)), mask); + return __and(__not(__equal_i64(a,b)), mask); } -#else -CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) -#endif - - - CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) @@ -918,18 +901,49 @@ template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec template RetVecType __undef_i64(); template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } -static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) +#define CASTL2I(_v_, _v_hi_, _v_lo_) \ + __vec16_i32 _v_hi_, _v_lo_; \ + { \ + const __vec16_i64 v = _v_.cvt2hilo(); \ + _v_hi_ = v.v_hi; \ + _v_lo_ = v.v_lo; } +#define CASTI2L(_ret_hi_, _ret_lo_) \ + __vec16_i64(_ret_hi_, _ret_lo_).cvt2zmm() +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 _v, int index) { - int64_t val = __extract_element(v, index & 0xf); - return __smear_i64<__vec16_i64>(val); + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __broadcast_i32(v_hi, index); + const __vec16_i32 ret_lo = __broadcast_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); } - -ROTATE (__vec16_i64, i64, int64_t) -SHUFFLES(__vec16_i64, i64, int64_t) +static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __rotate_i32(v_hi, index); + const __vec16_i32 ret_lo = __rotate_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __shuffle_i32(v_hi, index); + const __vec16_i32 ret_lo = __shuffle_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle2_double(__vec16_i64 _v0, __vec16_i64 _v1, const __vec16_i32 index) +{ + CASTL2I(_v0, v0_hi, v0_lo); + CASTL2I(_v1, v1_hi, v1_lo); + const __vec16_i32 ret_hi = __shuffle2_i32(v0_hi, v1_hi, index); + const __vec16_i32 ret_lo = __shuffle2_i32(v0_lo, v1_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +#undef CASTI2L +#undef CASTL2I template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED return __load<128>(p); #else __vec16_i32 v1; @@ -942,18 +956,10 @@ template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) #endif } -template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) -{ - __m512i v2 = _mm512_load_epi32(p); - __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); - return __vec16_i64(v2,v1); -} - -template 
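<> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }

/* Note on the removal here: the aligned __load<64>/__load<128> specializations
   for __vec16_i64 come back just below under "#if 0". _mm512_load_epi32 is an
   aligned load, and the knc::fails annotations record that enabling these
   specializations broke tests -- most plausibly because ispc can reach them
   with pointers lacking the full 64-byte alignment the instruction assumes
   (an inference from the annotations, not something the patch states). The
   unaligned extloadunpacklo/hi path therefore stays the default for every
   instantiation. */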
 template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v)
 {
-#ifdef ISPC_FORCE_ALIGNED_MEMORY
+#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED
     return __store<128>(p,v);
 #else
     __m512i v1 = v.v2;
@@ -965,6 +971,14 @@ template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
 #endif
 }

+#if 0 /* knc::fails as with _i32 this may generate fails ... so commenting it out */
+template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
+{
+    __m512i v2 = _mm512_load_epi32(p);
+    __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
+    return __vec16_i64(v2,v1);
+}
+template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
 template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
 {
     __m512i v1 = v.v2;
@@ -972,8 +986,8 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
     _mm512_store_epi64(p, v2);
     _mm512_store_epi64(((uint8_t*)p)+64, v1);
 }
-
 template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
+#endif

 ///////////////////////////////////////////////////////////////////////////
@@ -1048,7 +1062,7 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __ve
 template static FORCEINLINE __vec16_f __load(const __vec16_f *p)
 {
-#ifdef ISPC_FORCE_ALIGNED_MEMORY
+#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED
     return __load<64>(p);
 #else
     __vec16_f v;
@@ -1060,7 +1074,7 @@ template static FORCEINLINE __vec16_f __load(const __vec16_f *p)
 template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
 {
-#ifdef ISPC_FORCE_ALIGNED_MEMORY
+#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED
     __store<64>(p,v);
 #else
     _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
@@ -1278,15 +1292,6 @@ template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return
 template RetVecType __undef_double();
 template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }

-static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index)
-{
-    __vec16_d ret;
-    double val = __extract_element(v, index & 0xf);
-    ret.v1 = _mm512_set1_pd(val);
-    ret.v2 = _mm512_set1_pd(val);
-    return ret;
-}
-
 #define CASTD2F(_v_, _v_hi_, _v_lo_)  \
     __vec16_f _v_hi_, _v_lo_;         \
     {                                 \
@@ -1295,21 +1300,20 @@ static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index)
       _v_lo_ = _mm512_castpd_ps(v.v_lo); }
 #define CASTF2D(_ret_hi_, _ret_lo_)   \
     __vec16_d(_mm512_castps_pd(_ret_hi_), _mm512_castps_pd(_ret_lo_)).cvt2zmm()
-
-#if 0 /* knc::testme there appears to be no tests in ./tests for checking this functionality */
+static FORCEINLINE __vec16_d __broadcast_double(__vec16_d _v, int index)
+{
+    CASTD2F(_v, v_hi, v_lo);
+    const __vec16_f ret_hi = __broadcast_float(v_hi, index);
+    const __vec16_f ret_lo = __broadcast_float(v_lo, index);
+    return CASTF2D(ret_hi, ret_lo);
+}
 static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index)
 {
-// return _v;  /* this one passes all tests , but most not */
     CASTD2F(_v, v_hi, v_lo);
     const __vec16_f ret_hi = __rotate_float(v_hi, index);
     const __vec16_f ret_lo = __rotate_float(v_lo, index);
     return CASTF2D(ret_hi, ret_lo);
 }
-#else
-ROTATE(__vec16_d, double, double)
-#endif
-
-#if 0 /* knc::fails ./tests/shuffle2-4.ispc ./tests/shuffle2-5.ispc */
 static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index)
 {
     CASTD2F(_v, v_hi, v_lo);
@@ -1325,32 +1329,37 @@ static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, con
     const __vec16_f ret_lo = __shuffle2_float(v0_lo, v1_lo, index);
     return CASTF2D(ret_hi, ret_lo);
 }
-#else
-SHUFFLES(__vec16_d, double, double)
-#endif
-#undef CASTD2F
+#undef CASTF2D
+#undef CASTD2F

 template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \
 {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED
+    return __load<128>(p);
+#else
     __vec16_d ret;
     ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     return ret;
+#endif
 }

 template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
 {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED
+    return __store<128>(p,v);
+#else
     _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
     _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
     _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
     _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+#endif
 }

-#if 1
+#if 0 /* knc::fails as with _f this may generate fails ... so commenting it out */
 template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
 {
     return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
@@ -1379,14 +1388,12 @@ static FORCEINLINE TO FUNC(TO, FROM val) {  \
 }

 // sign extension conversions
-#if 0 /* knc::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ...
and others */ + +// CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) { return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); } -#else -CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) -#endif CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) @@ -2107,7 +2114,7 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); return ret; } -#if 0 /* knc::fails on gather-int8-2 & gather-int8-4 */ +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2131,9 +2138,6 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); return ret; } -#else -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) @@ -2145,7 +2149,7 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32 base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); } -#if 0 /* knc::fails on gather-int32-2 & gather-int32-4 */ +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2170,9 +2174,6 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3 return ret; } -#else -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) -#endif /****************/ // GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) @@ -2181,7 +2182,7 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32 base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); } -#if 0 /* knc::fails on gather-float-2 gather-float-4 & soa-14 */ +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2206,9 +2207,6 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3 return ret; } -#else -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) @@ -2324,7 +2322,7 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal _MM_DOWNCONV_PS_NONE, scale, _MM_HINT_NONE); } -#if 0 /* knc::fails on soa-10 & soa-13 , it is very similar to 
__scatter_base_offsets64_it32, but that passes tests, why ?!? */ +//SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2346,9 +2344,6 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t still_to_do = _mm512_kxor(match,still_to_do); } } -#else -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) -#endif /*****************/ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) From 4222605f873e589aa9dc905fb0c2e6dcb9353d01 Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 7 Oct 2013 14:24:27 +0300 Subject: [PATCH 072/159] fixed lshr/ashr/shl shifts. __mul i64 vector version for icc < 14.0.0 works only on signed, so commented it out in favour of sequential --- examples/intrinsics/knc-i1x16.h | 115 ++++++++++++++++---------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index fb2cf618..ffe8fb56 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -668,6 +668,7 @@ template RetVecType __smear_i32(int32_t i); template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); +static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0); static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -772,18 +773,18 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } #else /* __ICC >= 1400 */ -#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */ +#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc cause: if one or both numbers are negative multiplication fails */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { - __vec16_i64 a = _a.cvt2hilo(); - __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); - __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); - __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); - __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); - __mmask16 carry = 0; - __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); - __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + const __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry; + const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry); + const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry); return __vec16_i64(hi,lo).cvt2zmm(); } #else @@ -795,60 +796,68 @@ static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __ve static FORCEINLINE __vec16_i64 __and(__vec16_i64 
a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } -static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) -{ - const __vec16_i64 a = _a.cvt2hilo(); - const __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); - __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); - return __vec16_i64(hi,lo).cvt2zmm(); -} - static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } -#if 0 /* knc::fails ./tests/idiv.ispc */ -static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); -#if 0 - __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); -#else - __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); -#endif - __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __lshr(a.v_lo, __sub(__ispc_thirty_two, b.v_lo)), + __shl (a.v_lo, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 hi = __or(__shl(a.v_hi, b.v_lo), xfer); + const __vec16_i32 lo = __shl(a.v_lo, b.v_lo); return __vec16_i64(hi,lo).cvt2zmm(); } -#else -BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) -#endif - -#if 0 /* knc::fails ./tests/idiv.ispc */ -static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __lshr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __lshr(a.v_hi, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 
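_a, __vec16_i64 _b)
+{
+  const __vec16_i64 a = _a.cvt2hilo();
+  const __vec16_i64 b = _b.cvt2hilo();
+  /* this is a safety gate in case b-shift >= 32 */
+  const __vec16_i32 xfer = __select(
+      __signed_less_than_i32(b.v_lo, __ispc_thirty_two),
+      __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)),
+      __ashr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two))
+      );
+  const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer);
+  const __vec16_i32 hi = __ashr(a.v_hi, b.v_lo);
+  return __vec16_i64(hi,lo).cvt2zmm();
+}

/* Worked example of the hi/lo-split shifts (a sketch; it assumes the per-lane
   32-bit shifts saturate, i.e. logical shifts by counts >= 32 give 0 and
   arithmetic shifts give the sign fill): a 64-bit __ashr by 40 takes the
   "b.v_lo < 32 is false" branch of the safety gate, so
   xfer = a.v_hi >>(arith) 8, lo = (a.v_lo >>(logical) 40, i.e. 0) | xfer
   = a.v_hi >> 8, and hi = a.v_hi >>(arith) 40 = pure sign fill -- exactly the
   two halves of a signed 64-bit shift right by 40. */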
+template RetVecType __smear_i64(const int64_t &l);
+template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); }
+
+template RetVecType __setzero_i64();
+template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); }
+
+template RetVecType __undef_i64();
+template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); }
+
+static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, uint64_t shift) { return __lshr(a, __smear_i64<__vec16_i64>(shift)); }
+static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, int64_t shift)  { return __ashr(a, __smear_i64<__vec16_i64>(shift)); }
+static FORCEINLINE __vec16_i64 __shl (__vec16_i64 a, int64_t shift)  { return __shl (a, __smear_i64<__vec16_i64>(shift)); }

 static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b)
 {
@@ -892,14 +901,6 @@ static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_

 INSERT_EXTRACT(__vec16_i64, int64_t)

-template RetVecType __smear_i64(const int64_t &l);
-template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); }
-
-template RetVecType __setzero_i64();
-template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); }
-
-template RetVecType __undef_i64();
-template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); }

From 3da152a150d5b99f856368317031f181835afb9e Mon Sep 17 00:00:00 2001
From: evghenii
Date: Mon, 7 Oct 2013 18:30:22 +0300
Subject: [PATCH 073/159] fixed zmm __mul for i64 with icc < 14.0.0, 4
 knc::fails left, but I doubt these are due to this include..
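
The approach, in sketch form (all of it implemented in __abs_i32i64/__mul in
the diff below): take the product's sign up front from the high halves as
sign = ((a ^ b) >> 63 != 0) per lane, replace both operands by their absolute
values via abs(x) = (x ^ m) - m with m = x >> 63 (done on the 32-bit halves
with _mm512_subsetb_epi32/_mm512_sbb_epi32 borrow propagation), assemble the
magnitude product from 32-bit pieces as

  lo = lo32(a_lo * b_lo)
  hi = hi32(a_lo * b_lo) + a_hi * b_lo + a_lo * b_hi   (mod 2^32)

using the addsetc/adc carry chain, and negate the result wherever sign is
set. Example: (-3) * 5 -> signs differ, |a| * |b| = 15, and the final
__select gives 0 - 15 = -15. Since the pieces are magnitudes, the high
multiply has to be the unsigned _mm512_mulhi_epu32 rather than the signed
variant used before.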
--- examples/intrinsics/knc-i1x16.h | 50 ++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ffe8fb56..78d35ddc 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -767,31 +767,56 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) +{ + __vec16_i64 ret; + ret.v1 = _mm512_mask_mov_epi64(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_epi64(b.v2, mask >> 8, a.v2); + return ret; +} + #if __ICC >= 1400 /* compiler gate, icc >= 14.0.0 support _mm512_mullox_epi64 */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } #else /* __ICC >= 1400 */ -#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc cause: if one or both numbers are negative multiplication fails */ +static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo) +{ + /* abs(x) : + * mask = x >> 32; + * abs(x) = (x^mask) - mask + */ + const __vec16_i32 mask = __ashr(_hi, __ispc_thirty_two); + __vec16_i32 hi = __xor(_hi, mask); + __vec16_i32 lo = __xor(_lo, mask); + __mmask16 borrow = 0; + _lo = _mm512_subsetb_epi32(lo, mask, &borrow); + _hi = _mm512_sbb_epi32 (hi, borrow, mask, &borrow); +} static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { - const __vec16_i64 a = _a.cvt2hilo(); - const __vec16_i64 b = _b.cvt2hilo(); - const __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo, b.v_lo); - const __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + __vec16_i64 a = _a.cvt2hilo(); + __vec16_i64 b = _b.cvt2hilo(); + /* sign = (a^b) >> 32, if sign == 0 then a*b >= 0, otherwise a*b < 0 */ + const __vec16_i1 sign = __not_equal_i32(__ashr(__xor(a.v_hi, b.v_hi), __ispc_thirty_two), __ispc_zero); + __abs_i32i64(a.v_hi, a.v_lo); /* abs(a) */ + __abs_i32i64(b.v_hi, b.v_lo); /* abs(b) */ + const __vec16_i32 lo_m1 = _mm512_mullo_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m1 = _mm512_mulhi_epu32(a.v_lo, b.v_lo); const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); __mmask16 carry; const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry); const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry); - return __vec16_i64(hi,lo).cvt2zmm(); + const __vec16_i32 lo = lo_m1; + const __vec16_i64 ret_abs = __vec16_i64(hi,lo).cvt2zmm(); + /* if sign != 0, means either a or b is negative, then negate the result */ + return __select(sign, __sub(__vec16_i64(__ispc_zero, __ispc_zero), ret_abs), ret_abs); } -#else -BINARY_OP(__vec16_i64, __mul, *) -#endif #endif /* __ICC >= 1400 */ + static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } @@ -891,13 +916,6 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) -static 
FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b)
-{
-  __vec16_i64 ret;
-  ret.v_hi = _mm512_mask_mov_epi64(b.v_hi, mask, a.v_hi);
-  ret.v_lo = _mm512_mask_mov_epi64(b.v_lo, mask >> 8, a.v_lo);
-  return ret;
-}

 INSERT_EXTRACT(__vec16_i64, int64_t)

From 44912e6b1e0478da79fe4c3a00fd5a64f2904ece Mon Sep 17 00:00:00 2001
From: "james.brodman"
Date: Tue, 8 Oct 2013 18:27:03 -0400
Subject: [PATCH 074/159] Fix segfault when using both -g and -MMM

---
 module.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module.cpp b/module.cpp
index 755a5dc4..41861a2d 100644
--- a/module.cpp
+++ b/module.cpp
@@ -936,7 +936,7 @@ Module::AddExportedTypes(const std::vectorfinalize();
     lStripUnusedDebugInfo(module);

From 0d9594354a14f8a975260b22b830bd524b361b52 Mon Sep 17 00:00:00 2001
From: Ilia Filippov
Date: Thu, 10 Oct 2013 15:38:08 +0400
Subject: [PATCH 075/159] adding --extra option and correcting paths to ispc
 compiler

---
 alloy.py     | 21 +++++++++++++++++----
 perf.py      | 47 +++++++++++++++++++++++++++--------------------
 run_tests.py | 22 ++++++++++++++------
 3 files changed, 60 insertions(+), 30 deletions(-)

diff --git a/alloy.py b/alloy.py
index 6b55f85b..68bdd979 100755
--- a/alloy.py
+++ b/alloy.py
@@ -70,7 +70,7 @@ def try_do_LLVM(text, command, from_validation):
         error("can't " + text, 1)
     print_debug("DONE.\n", from_validation, alloy_build)

-def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force, make):
+def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, from_validation, force, make):
     print_debug("Building LLVM. Version: " + version_LLVM + ". ", from_validation, alloy_build)
     if revision != "":
         print_debug("Revision: " + revision + ".\n", from_validation, alloy_build)
@@ -121,6 +121,15 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v
             try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ",
                     "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang",
                     from_validation)
+            if extra == True:
+                os.chdir("./clang/tools")
+                try_do_LLVM("load clang extra tools ",
+                        "svn co " + revision + " http://llvm.org/svn/llvm-project/clang-tools-extra/" + SVN_PATH + " extra",
+                        from_validation)
+                os.chdir("../../../projects")
+                try_do_LLVM("load compiler-rt ",
+                        "svn co " + revision + " http://llvm.org/svn/llvm-project/compiler-rt/" + SVN_PATH + " compiler-rt",
+                        from_validation)
             os.chdir("../")
     else:
         tar = tarball.split(" ")
@@ -286,6 +295,8 @@ def run_special_tests():
     i = 5

 def validation_run(only, only_targets, reference_branch, number, notify, update, make):
+    if os.environ["ISPC_HOME"] != os.getcwd():
+        error("your ISPC_HOME and your current path are different!\n", 2)
     os.chdir(os.environ["ISPC_HOME"])
     os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"]
     if options.notify != "":
@@ -387,7 +398,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
         gen_archs = ["x86-64"]
     need_LLVM = check_LLVM(LLVM)
     for i in range(0,len(need_LLVM)):
-        build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make)
+        build_LLVM(need_LLVM[i], "", "", "", False, False, False, True, False, make)
     # begin validation run for stabitily
     common.remove_if_exists(stability.in_file)
     R = [[[],[]],[[],[]],[[],[]],[[],[]]]
@@ -465,7 +476,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
         # prepare LLVM 3.3 as newest LLVM
         need_LLVM =
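check_LLVM(["3.3"])
         if len(need_LLVM) != 0:
-            build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make)
+            build_LLVM(need_LLVM[i], "", "", "", False, False, False, True, False, make)

# A side note on these call sites: build_LLVM takes its flags positionally, so
# every caller has to slot the new 'extra' argument into exactly the right
# position. A keyword-style call (a sketch only, reusing the parameter names
# from the new signature above) would make such signature changes harder to
# get wrong:
#   build_LLVM(need_LLVM[i], "", "", "", debug=False, selfbuild=False,
#              extra=False, from_validation=True, force=False, make=make)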
         # prepare reference point. build both test and reference compilers
         try_do_LLVM("apply git", "git branch", True)
         temp4 = take_lines("git branch", "all")
@@ -552,7 +563,7 @@ def Main():
     try:
         if options.build_llvm:
             build_LLVM(options.version, options.revision, options.folder, options.tarball,
-                    options.debug, options.selfbuild, False, options.force, make)
+                    options.debug, options.selfbuild, options.extra, False, options.force, make)
         if options.validation_run:
             validation_run(options.only, options.only_targets, options.branch,
                     options.number_for_performance, options.notify, options.update, make)
@@ -628,6 +639,8 @@ llvm_group.add_option('--selfbuild', dest='selfbuild',
     help='make selfbuild of LLVM and clang', default=False, action="store_true")
 llvm_group.add_option('--force', dest='force',
     help='rebuild LLVM', default=False, action='store_true')
+llvm_group.add_option('--extra', dest='extra',
+    help='load extra clang tools', default=False, action='store_true')
 parser.add_option_group(llvm_group)
 # options for activity "validation run"
 run_group = OptionGroup(parser, "Options for validation run",

diff --git a/perf.py b/perf.py
index 576a5c7d..0d4926e0 100755
--- a/perf.py
+++ b/perf.py
@@ -299,6 +299,22 @@ def perf(options1, args):
     if cpu_percent > 20:
         error("CPU Usage is very high.\nClose other applications.\n", 2)

+    # prepare build.log, perf_temp and perf.log files
+    global perf_log
+    if options.in_file:
+        perf_log = pwd + options.in_file
+        common.remove_if_exists(perf_log)
+    else:
+        perf_log = ""
+    global build_log
+    build_log = pwd + os.sep + "logs" + os.sep + "perf_build.log"
+    common.remove_if_exists(build_log)
+    if os.path.exists(pwd + os.sep + "logs") == False:
+        os.makedirs(pwd + os.sep + "logs")
+    global perf_temp
+    perf_temp = pwd + "perf_temp"
+
+
     global ispc_test
     global ispc_ref
     global ref_compiler
@@ -325,15 +341,21 @@ def perf(options1, args):
         ispc_ref = options.ref
     if options.ref != "":
         options.ref = True
-    for counter in PATH_dir:
-        if os.path.exists(counter + os.sep + ispc_test):
+    if os.environ.get("ISPC_HOME") != None:
+        if os.path.exists(os.environ["ISPC_HOME"] + os.sep + ispc_test):
             ispc_test_exists = True
+            ispc_test = os.environ["ISPC_HOME"] + os.sep + ispc_test
+    for counter in PATH_dir:
+        if ispc_test_exists == False:
+            if os.path.exists(counter + os.sep + ispc_test):
+                ispc_test_exists = True
+                ispc_test = counter + os.sep + ispc_test
         if os.path.exists(counter + os.sep + ref_compiler):
             ref_compiler_exists = True
         if os.path.exists(counter + os.sep + ispc_ref):
             ispc_ref_exists = True
     if not ispc_test_exists:
-        error("ISPC compiler not found.\nAdded path to ispc compiler to your PATH variable.\n", 1)
+        error("ISPC compiler not found.\nAdd the path to the ispc compiler to your PATH variable, or set the ISPC_HOME variable.\n", 1)
     if not ref_compiler_exists:
         error("C/C++ compiler %s not found.\nAdded path to %s compiler to your PATH variable.\n" % (ref_compiler, ref_compiler), 1)
     if options.ref:
@@ -355,26 +377,11 @@ def perf(options1, args):
             if f_lines[i][0] != "%":
                 lines.append(f_lines[i])
     length = len(lines)
-
-    # prepare build.log, perf_temp and perf.log files
-    global perf_log
-    if options.in_file:
-        perf_log = pwd + options.in_file
-        common.remove_if_exists(perf_log)
-    else:
-        perf_log = ""
-    global build_log
-    build_log = pwd + os.sep + "logs" + os.sep + "perf_build.log"
-    common.remove_if_exists(build_log)
-    if os.path.exists(pwd + os.sep + "logs") == False:
-        os.makedirs(pwd + os.sep + "logs")
-
-    global perf_temp
-    perf_temp = pwd + "perf_temp"

     # end of preparations
     print_debug("Okey go go go!\n\n", s, perf_log)
-
+    print_debug("Testing ispc: " + ispc_test + "\n", s, perf_log)
+
     #print compilers versions
     common.print_version(ispc_test, ispc_ref, ref_compiler, False, perf_log, is_windows)

diff --git a/run_tests.py b/run_tests.py
index 7b2f5f29..12822d2d 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -498,16 +498,26 @@ def run_tests(options1, args, print_version):
     # use relative path to not depend on host directory, which may possibly
     # have white spaces and unicode characters.
     global ispc_exe
+    ispc_exe = ""
     if not is_windows:
-        ispc_exe = "./ispc"
+        if os.environ.get("ISPC_HOME") != None:
+            if os.path.exists(os.environ["ISPC_HOME"] + os.sep + "ispc"):
+                ispc_exe = os.environ["ISPC_HOME"] + os.sep + "ispc"
+        else:
+            PATH_dir = string.split(os.getenv("PATH"), os.pathsep)
+            for counter in PATH_dir:
+                if os.path.exists(counter + os.sep + "ispc"):
+                    ispc_exe = counter + os.sep + "ispc"
     else:
-        ispc_exe = ".\\Release\\ispc.exe"
-
+        if os.path.exists(".\\Release\\ispc.exe"):
+            ispc_exe = ".\\Release\\ispc.exe"
+        else:
+            error("You don't have the ispc.exe compiler in .\\Release.\n", 1)
     # checks the required ispc compiler otherwise prints an error message
-    if not os.path.exists(ispc_exe):
-        error("missing ispc compiler: %s\n" % ispc_exe, 1)
+    if ispc_exe == "":
+        error("ISPC compiler not found.\nAdd the path to the ispc compiler to your PATH variable, or set the ISPC_HOME variable.\n", 1)
+    print_debug("Testing ispc: " + ispc_exe + "\n", s, run_tests_log)
     ispc_exe += " " + options.ispc_flags
-    print_debug("ispc compiler: %s\n" % ispc_exe, s, run_tests_log)

     global is_generic_target
     is_generic_target = (options.target.find("generic-") != -1 and

From 7abbe97ee9e73de69dece3ed384e381e0835fecd Mon Sep 17 00:00:00 2001
From: Ilia Filippov
Date: Fri, 11 Oct 2013 11:39:02 +0400
Subject: [PATCH 076/159] patch for LLVM for fails at avx-x2

---
 fail_db.txt                             | 16 -----
 llvm_patches/3_3_0001-Fix-PR16807.patch | 78 +++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 16 deletions(-)
 create mode 100755 llvm_patches/3_3_0001-Fix-PR16807.patch

diff --git a/fail_db.txt b/fail_db.txt
index 31db9961..7c543cc6 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -37,10 +37,6 @@
 ./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 *
 ./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 *
 ./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 *
-O2 * -./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * ./tests/array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * ./tests/array-gather-multi-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * ./tests/array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * @@ -362,10 +354,6 @@ ./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * ./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * ./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * ./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * ./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * ./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * @@ -425,10 +413,6 @@ ./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * ./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * ./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * ./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * ./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * ./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * diff --git a/llvm_patches/3_3_0001-Fix-PR16807.patch b/llvm_patches/3_3_0001-Fix-PR16807.patch new file mode 100755 index 00000000..daf1327c --- /dev/null +++ b/llvm_patches/3_3_0001-Fix-PR16807.patch @@ -0,0 +1,78 @@ +From b9c47f44691cb9a648b9fa1ae373f0defe53c757 Mon Sep 17 00:00:00 2001 +From: Michael Liao +Date: Thu, 10 Oct 2013 16:47:00 -0700 +Subject: [PATCH] Fix PR16807 + +- Lower signed division by constant powers-of-2 to target-independent + DAG operators instead of target-dependent ones to support them on + targets where vector types are legal but shift operators on that types + are illegal, e.g. on AVX, PSRAW is only available on <8 x i16> though + <16 x i16> is a legal type. +--- + lib/Target/X86/X86ISelLowering.cpp | 22 ++++++++++++++++------ + test/CodeGen/X86/pr16807.ll | 18 ++++++++++++++++++ + 2 files changed, 34 insertions(+), 6 deletions(-) + create mode 100644 test/CodeGen/X86/pr16807.ll + +diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp +index bd5ad4e..518bb90 100644 +--- lib/Target/X86/X86ISelLowering.cpp ++++ lib/Target/X86/X86ISelLowering.cpp +@@ -12462,14 +12462,24 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { + (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { + unsigned lg2 = SplatValue.countTrailingZeros(); + // Splat the sign bit. 
+-      SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
+-      SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
++      SmallVector<SDValue, 16> Sz(NumElts,
++                                  DAG.getConstant(EltTy.getSizeInBits() - 1,
++                                                  EltTy));
++      SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
++                                DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
++                                            NumElts));
+       // Add (N0 < 0) ? abs2 - 1 : 0;
+-      SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
+-      SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
++      SmallVector<SDValue, 16> Amt(NumElts,
++                                   DAG.getConstant(EltTy.getSizeInBits() - lg2,
++                                                   EltTy));
++      SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
++                                DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
++                                            NumElts));
+       SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
+-      SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
+-      SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
++      SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(lg2, EltTy));
++      SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
++                                DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
++                                            NumElts));
+ 
+       // If we're dividing by a positive value, we're done.  Otherwise, we must
+       // negate the result.
+diff --git a/test/CodeGen/X86/pr16807.ll b/test/CodeGen/X86/pr16807.ll
+new file mode 100644
+index 0000000..6d55d99
+--- /dev/null
++++ test/CodeGen/X86/pr16807.ll
+@@ -0,0 +1,18 @@
++; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core-avx-i | FileCheck %s
++
++define <16 x i16> @f_fu(<16 x i16> %bf) {
++allocas:
++  %avg.i.i = sdiv <16 x i16> %bf, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
++  ret <16 x i16> %avg.i.i
++}
++
++; CHECK: f_fu
++; CHECK: psraw
++; CHECK: psrlw
++; CHECK: paddw
++; CHECK: psraw
++; CHECK: psraw
++; CHECK: psrlw
++; CHECK: paddw
++; CHECK: psraw
++; CHECK: ret
+-- 
+1.8.1.2
+

From 92773ada6d0faac1cbb853649aa8e1763e76bddb Mon Sep 17 00:00:00 2001
From: Ilia Filippov
Date: Fri, 11 Oct 2013 15:23:40 +0400
Subject: [PATCH 077/159] fix for ISPC for compfails at sse4-i8 and sse4-i16

---
 builtins/util.m4 |  4 ++--
 fail_db.txt      | 42 ------------------------------------------
 2 files changed, 2 insertions(+), 44 deletions(-)

diff --git a/builtins/util.m4 b/builtins/util.m4
index 68fa818b..11501780 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -3834,9 +3834,9 @@ domixed:
   %first = call i64 @llvm.cttz.i64(i64 %mm)
   %first32 = trunc i64 %first to i32
   %baseval = extractelement <$1 x $2> %v, i32 %first32
-  %basev1 = bitcast $2 %baseval to <1 x $2>
+  %basev1 = insertelement <$1 x $2> undef, $2 %baseval, i32 0

   ; get a vector that is that value smeared across all elements
-  %basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef,
+  %basesmear = shufflevector <$1 x $2> %basev1, <$1 x $2> undef,
           <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 >

   ; now to a blend of that vector with the original vector, such that the
diff --git a/fail_db.txt b/fail_db.txt
index 7c543cc6..886a4534 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -8,38 +8,17 @@ %
 ./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 *
 ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *
-./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *
-./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *
-./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *
-./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *
-./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *
 ./tests/atomics-13.ispc
compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * ./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * ./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * ./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * ./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * ./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * ./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * ./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * ./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * ./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * ./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * ./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * ./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * ./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * @@ -322,38 +301,17 @@ ./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * ./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * ./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * ./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-11.ispc compfail x86-64 
sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
-./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *
 ./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 *

From 8297edd251f6d93b8bed7eaba44baccfde4f2410 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Wed, 9 Oct 2013 14:04:10 +0400
Subject: [PATCH 078/159] Switching default compiler on Unix from g++ to clang++

---
 Makefile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 097da238..6c7a5f4f 100644
--- a/Makefile
+++ b/Makefile
@@ -109,8 +109,7 @@ else
   BUILD_VERSION:=$(GIT_REVISION)
 endif
 
-CXX=g++
-CPP=cpp
+CXX=clang++
 OPT=-O2
 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
 	$(LLVM_VERSION_DEF) \
@@ -202,9 +201,14 @@ ispc: print_llvm_src dirs $(OBJS)
 	@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
 
 # Use clang as a default compiler, instead of gcc
+# This is the default now.
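(Editor's note, not part of the patch: the `clang:` and `gcc:` convenience goals below use GNU make target-specific variables; an assignment such as `gcc: CXX=g++` applies only when that goal is requested, so it overrides the new clang++ default for a single invocation. A hedged usage sketch, with goal names taken from this Makefile:

    make            # default build, compiled with clang++
    make gcc        # same build, but with CXX overridden to g++
    make clang      # explicit clang++ build, now equivalent to the default
)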
 clang: ispc
 clang: CXX=clang++
 
+# Use gcc as the compiler, instead of the default clang
+gcc: ispc
+gcc: CXX=g++
+
 # Build ispc with address sanitizer instrumentation using clang compiler
 # Note that this is not portable build
 asan: clang

From 99df2d9dbff4ba44bb0dcee5984b20697c4321a9 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Wed, 9 Oct 2013 14:05:19 +0400
Subject: [PATCH 079/159] Switch examples on Unix from using g++ to clang++

---
 examples/aobench_instrumented/Makefile |  2 +-
 examples/common.mk                     | 10 +++++-----
 examples/simple/Makefile               |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/aobench_instrumented/Makefile b/examples/aobench_instrumented/Makefile
index 9921cf3e..d0b27cbf 100644
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -1,5 +1,5 @@
 
-CXX=g++ -m64
+CXX=clang++ -m64
 CXXFLAGS=-Iobjs/ -g3 -Wall
 ISPC=ispc
 ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2
diff --git a/examples/common.mk b/examples/common.mk
index 367d3eb3..db7b8eee 100644
--- a/examples/common.mk
+++ b/examples/common.mk
@@ -3,14 +3,14 @@ TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
 TASK_OBJ=objs/tasksys.o
 
-CXX=g++
-CXXFLAGS=-Iobjs/ -O2
-CC=gcc
-CCFLAGS=-Iobjs/ -O2
+CXX=clang++
+CXXFLAGS+=-Iobjs/ -O2
+CC=clang
+CCFLAGS+=-Iobjs/ -O2
 
 LIBS=-lm $(TASK_LIB) -lstdc++
 ISPC=ispc
-ISPC_FLAGS=-O2
+ISPC_FLAGS+=-O2
 ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
 
 ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)
diff --git a/examples/simple/Makefile b/examples/simple/Makefile
index 80f09193..dce7942b 100644
--- a/examples/simple/Makefile
+++ b/examples/simple/Makefile
@@ -1,5 +1,5 @@
 
-CXX=g++ -m64
+CXX=clang++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
 ISPCFLAGS=-O2 --arch=x86-64 --target=sse2

From 17b54cb0c871ffad0d759e4fd0ec19e9a6445d6a Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Thu, 10 Oct 2013 16:10:31 +0400
Subject: [PATCH 080/159] Fix problem with building ISPC by clang 3.4

---
 Makefile     | 2 +-
 cbackend.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 6c7a5f4f..10d51bd5 100644
--- a/Makefile
+++ b/Makefile
@@ -115,7 +115,7 @@ CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I.
-Iobjs/ -I$(CLANG_INCLUDE) \ $(LLVM_VERSION_DEF) \ -Wall \ -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \ - -Wno-sign-compare + -Wno-sign-compare -Wno-unused-function ifneq ($(LLVM_VERSION),LLVM_3_1) CXXFLAGS+=-Werror endif diff --git a/cbackend.cpp b/cbackend.cpp index 7d4b4cfc..481ca3fd 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3066,7 +3066,7 @@ void CWriter::visitReturnInst(llvm::ReturnInst &I) { // Don't output a void return if this is the last basic block in the function if (I.getNumOperands() == 0 && &*--I.getParent()->getParent()->end() == I.getParent() && - !I.getParent()->size() == 1) { + (!I.getParent()->size()) == 1) { return; } From d129e33b51ecaba7793aa744c3467f9c20e03704 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 10 Oct 2013 16:11:15 +0400 Subject: [PATCH 081/159] Enable clang as default C++ compiler used by run_tests.py and perf.py --- perf.py | 7 +++++-- run_tests.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/perf.py b/perf.py index 0d4926e0..2b5c6edd 100755 --- a/perf.py +++ b/perf.py @@ -326,8 +326,8 @@ def perf(options1, args): ref_compiler_exists = False if is_windows == False: ispc_test = "ispc" - ref_compiler = "g++" - refc_compiler = "gcc" + ref_compiler = "clang++" + refc_compiler = "clang" if options.compiler != "": if options.compiler == "clang" or options.compiler == "clang++": ref_compiler = "clang++" @@ -335,6 +335,9 @@ def perf(options1, args): if options.compiler == "icc" or options.compiler == "icpc": ref_compiler = "icpc" refc_compiler = "icc" + if options.compiler == "gcc" or options.compiler == "g++": + ref_compiler = "g++" + refc_compiler = "gcc" else: ispc_test = "ispc.exe" ref_compiler = "cl.exe" diff --git a/run_tests.py b/run_tests.py index bf1b4a2b..803410b8 100755 --- a/run_tests.py +++ b/run_tests.py @@ -446,7 +446,7 @@ def verify(): f = open(test_states, 'r') f_lines = f.readlines() f.close() - check = [["g++", "clang", "cl"],["-O0", "-O2"],["x86","x86-64"], + check = [["g++", "clang++", "cl"],["-O0", "-O2"],["x86","x86-64"], ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", @@ -550,7 +550,7 @@ def run_tests(options1, args, print_version): if is_windows: options.compiler_exe = "cl.exe" else: - options.compiler_exe = "g++" + options.compiler_exe = "clang++" # checks the required compiler otherwise prints an error message PATH_dir = string.split(os.getenv("PATH"), os.pathsep) From 477a688c681cb377d3092e6eb86c9547cf1e5915 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 10 Oct 2013 16:13:20 +0400 Subject: [PATCH 082/159] fail_db.txt update with fails with clang 3.3 as a C++ compiler for tests (Linux) --- fail_db.txt | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index 886a4534..c807765e 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -891,3 +891,96 @@ .\tests\uint64-min-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * .\tests\uint64-min.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * .\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 
3.3 clang++3.3 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 
-O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 
-O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * From a45e147d381bd36ea4525d10db6209368726358b Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 11 Oct 2013 18:09:28 +0400 Subject: [PATCH 083/159] Update fail_db with new passes after change in trunk --- fail_db.txt | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/fail_db.txt b/fail_db.txt index c807765e..a9bf0031 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -892,39 +892,15 @@ .\tests\uint64-min.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * .\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 
clang++3.3 -O2 * -./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * From a4d6240ab48dacd862750d803fb2fb1730956798 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sat, 12 Oct 2013 14:07:41 +0400 Subject: [PATCH 084/159] Fail_db update on Mac with clang 3.3 compiler --- fail_db.txt | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index a9bf0031..f1aaaab2 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -960,3 +960,68 @@ ./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 
* +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/double-3.ispc runfail x86 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 clang++3.3 
-O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * From e751977b72b94682977a50296abf557587c55c40 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 12 Oct 2013 06:15:57 -0700 Subject: [PATCH 085/159] Fix small typo for NEON targets in Target::SupportedTargets --- ispc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ispc.cpp b/ispc.cpp index 56b0a25f..41adffe4 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -710,7 +710,7 @@ const char * Target::SupportedTargets() { return #ifdef ISPC_ARM_ENABLED - "neon-i8x16, neon-16x8, neon-32x4, " + "neon-i8x16, neon-i16x8, neon-i32x4, " #endif "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " From 496845df60f0d3fc7e5b8713af223d0c5f55492f Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Mon, 14 Oct 2013 12:23:14 +0400 Subject: [PATCH 086/159] new changes in test system --- alloy.py | 70 +++++++++++++++++++++++++++++++--------------------- run_tests.py | 13 ++++++++-- 2 files changed, 53 insertions(+), 30 deletions(-) diff --git a/alloy.py b/alloy.py index 68bdd979..dbdc40aa 100755 --- a/alloy.py +++ b/alloy.py @@ -294,9 +294,7 @@ def execute_stability(stability, R, print_version): def run_special_tests(): i = 5 -def validation_run(only, only_targets, reference_branch, number, notify, update, make): - if os.environ["ISPC_HOME"] != os.getcwd(): - error("you ISPC_HOME and your current pass are different!\n", 2) +def validation_run(only, only_targets, reference_branch, number, notify, update, speed_number, make, perf_llvm): os.chdir(os.environ["ISPC_HOME"]) os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": @@ -322,7 +320,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, stability.random = False stability.ispc_flags = "" stability.compiler_exe = None - stability.num_jobs = 1024 + stability.num_jobs = speed_number stability.verbose = False stability.time = False stability.non_interactive = True @@ -476,28 +474,36 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, # prepare LLVM 3.3 as newest LLVM need_LLVM = check_LLVM(["3.3"]) if len(need_LLVM) != 0: - build_LLVM(need_LLVM[i], "", "", "", False, False, False, True, False, make) -# prepare reference point. build both test and reference compilers - try_do_LLVM("apply git", "git branch", True) - temp4 = take_lines("git branch", "all") - for line in temp4: - if "*" in line: - current_branch = line[2:-1] - stashing = True - sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n") - if "No local changes" in take_lines("git stash", "first"): - stashing = False - #try_do_LLVM("stash current branch ", "git stash", True) - try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) - sys.stdout.write(".\n") - build_ispc("3.3", make) - sys.stdout.write(".\n") - os.rename("ispc", "ispc_ref") - try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) - if stashing: - try_do_LLVM("return current branch ", "git stash pop", True) - sys.stdout.write("You can interrupt script now.\n") - build_ispc("3.3", make) + build_LLVM(need_LLVM[0], "", "", "", False, False, False, True, False, make) + if perf_llvm == False: + # prepare reference point. 
build both test and reference compilers
+        try_do_LLVM("apply git", "git branch", True)
+        temp4 = take_lines("git branch", "all")
+        for line in temp4:
+            if "*" in line:
+                current_branch = line[2:-1]
+        stashing = True
+        sys.stdout.write("Please don't interrupt the script here! Otherwise your git status may be left out of sync!\n")
+        if "No local changes" in take_lines("git stash", "first"):
+            stashing = False
+        #try_do_LLVM("stash current branch ", "git stash", True)
+        try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True)
+        sys.stdout.write(".\n")
+        build_ispc("3.3", make)
+        sys.stdout.write(".\n")
+        os.rename("ispc", "ispc_ref")
+        try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True)
+        if stashing:
+            try_do_LLVM("return current branch ", "git stash pop", True)
+        sys.stdout.write("You can interrupt the script now.\n")
+        build_ispc("3.3", make)
+    else:
+        # build compiler with two different LLVM versions
+        if len(check_LLVM([reference_branch])) != 0:
+            error("you don't have an LLVM version called " + reference_branch, 1)
+        build_ispc("3.3", make)
+        os.rename("ispc", "ispc_ref")
+        build_ispc(reference_branch, make)
 # begin validation run for performance. output is inserted into perf()
     perf.perf(performance, [])
     if options.notify != "":
@@ -560,16 +566,22 @@ def Main():
     stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log"
     current_path = os.getcwd()
     make = "make -j" + options.speed
+    if os.environ["ISPC_HOME"] != os.getcwd():
+        error("your ISPC_HOME and your current path are different!\n", 2)
+    if options.perf_llvm == True:
+        if options.branch == "master":
+            options.branch = "trunk"
     try:
         if options.build_llvm:
             build_LLVM(options.version, options.revision, options.folder, options.tarball,
                     options.debug, options.selfbuild, options.extra, False, options.force, make)
         if options.validation_run:
             validation_run(options.only, options.only_targets, options.branch,
-                    options.number_for_performance, options.notify, options.update, make)
+                    options.number_for_performance, options.notify, options.update, int(options.speed),
+                    make, options.perf_llvm)
     finally:
         os.chdir(current_path)
-        date_name = "alloy_results_" + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
+        date_name = "alloy_results_" + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
         if os.path.exists(date_name):
             error("It's forbidden to run alloy two times in a second, logs are in ./logs", 1)
         os.rename(f_date, date_name)
@@ -661,6 +673,8 @@ run_group.add_option('--only', dest='only',
     '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' +
     'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).',
     default="")
+run_group.add_option('--perf_LLVM', dest='perf_llvm',
+    help='compare LLVM 3.3 with "--compare-with", default trunk', default=False, action='store_true')
 parser.add_option_group(run_group)
 # options for activity "setup PATHS"
 setup_group = OptionGroup(parser, "Options for setup",
diff --git a/run_tests.py b/run_tests.py
index 12822d2d..bf4f5d8a 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -374,7 +374,12 @@ def file_check(compfails, runfails):
         temp3 = re.search("[0-9]*\.[0-9]*", temp2.group())
         compiler_version = options.compiler_exe + temp3.group()
     else:
-        compiler_version = "cl"
+        compiler_version = "cl"
+    possible_compilers = ["g++4.4", "g++4.7", "clang++3.3", "cl"]
+    if not compiler_version in possible_compilers:
+        
error("\n**********\nWe don't have history of fails for compiler " + + compiler_version + + "\nAll fails will be new!!!\n**********", 2) new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" new_compfails = compfails[:] @@ -672,7 +677,11 @@ def run_tests(options1, args, print_version): if len(compile_error_files) == 0 and len(run_error_files) == 0: print_debug("No fails\n", s, run_tests_log) - R = file_check(compile_error_files, run_error_files) + if len(args) == 0: + R = file_check(compile_error_files, run_error_files) + else: + error("don't check new fails for incomplete suite of tests", 2) + R = 0 if options.time: print_debug("Elapsed time: %d s\n" % elapsed_time, s, run_tests_log) From 7e9b4c0924a884d05182d6d6416dd24e697d9a96 Mon Sep 17 00:00:00 2001 From: egaburov Date: Tue, 15 Oct 2013 10:02:10 +0200 Subject: [PATCH 087/159] added avx2-i64x4 and avx1.1-i64x4 targets --- Makefile | 2 +- builtins.cpp | 16 ++ builtins/target-avx11-i64x4.ll | 126 +++++++++++ builtins/target-avx2-i64x4.ll | 369 +++++++++++++++++++++++++++++++++ ispc.cpp | 46 +++- 5 files changed, 556 insertions(+), 3 deletions(-) create mode 100644 builtins/target-avx11-i64x4.ll create mode 100644 builtins/target-avx2-i64x4.ll diff --git a/Makefile b/Makefile index 10d51bd5..9d39baa4 100644 --- a/Makefile +++ b/Makefile @@ -140,7 +140,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) diff --git a/builtins.cpp b/builtins.cpp index 43f68833..af9649b7 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -966,6 +966,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX11: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx11_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx11_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx11_32bit); @@ -989,6 +997,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX2: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx2_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx2_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx2_32bit); diff --git a/builtins/target-avx11-i64x4.ll b/builtins/target-avx11-i64x4.ll new file mode 100644 index 00000000..aae612bb --- /dev/null +++ b/builtins/target-avx11-i64x4.ll @@ -0,0 +1,126 @@ +;; Copyright (c) 2012, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. 
+;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include(`target-avx1-i64x4base.ll') + +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float/half conversions + +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', `
+
+define(`expand_4to8', `
+  %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+')
+define(`extract_4from8', `
+  %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone {
+  expand_4to8(i16, v4, v)
+  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  extract_4from8(float, r, ret)
+  ret <4 x float> %ret
+}
+
+define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone {
+  expand_4to8(float, v4, v)
+  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  extract_4from8(i16, r, ret)
+  ret <4 x i16> %ret
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                     i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                     i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+')
diff --git a/builtins/target-avx2-i64x4.ll b/builtins/target-avx2-i64x4.ll
new file mode 100644
index 00000000..cdd10386
--- /dev/null
+++ b/builtins/target-avx2-i64x4.ll
@@ -0,0 +1,369 @@
+;; Copyright (c) 2010-2012, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;   * Redistributions of source code must retain the above copyright
+;;     notice, this list of conditions and the following disclaimer.
+;;
+;;   * Redistributions in binary form must reproduce the above copyright
+;;     notice, this list of conditions and the following disclaimer in the
+;;     documentation and/or other materials provided with the distribution.
+;;
+;;   * Neither the name of Intel Corporation nor the names of its
+;;     contributors may be used to endorse or promote products derived from
+;;     this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
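(Editor's note, not part of the patch.) The 4-wide half/float helpers above, and the identical copies that follow for the avx2-i64x4 target, rely on one trick: vcvtph2ps and vcvtps2ph only exist in 8-wide (256-bit) form on these targets, so a 4-wide vector is padded to 8 lanes with undef, converted, and then the low 4 lanes are extracted. A minimal Python model of that shuffle pattern (illustrative only; the names mirror the m4 macros above, and the conversion callback is a hypothetical stand-in for the intrinsic):

    def expand_4to8(v4, undef=None):
        # lanes 4..7 are don't-care, like the i32 undef shuffle indices
        return list(v4) + [undef] * 4

    def extract_4from8(v8):
        # keep only the low 4 lanes
        return list(v8)[:4]

    def half_to_float_varying(v4, cvt_8wide):
        # run the 8-wide "hardware" op on a padded 4-wide input
        return extract_4from8(cvt_8wide(expand_4to8(v4)))

    # usage sketch: an identity function stands in for vcvtph2ps
    print(half_to_float_varying([1, 2, 3, 4], lambda v8: v8))   # [1, 2, 3, 4]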
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `',
+       LLVM_VERSION, `LLVM_3_1', `',
+           `define(`HAVE_GATHER', `1')')
+
+include(`target-avx1-i64x4base.ll')
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
+       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
+           `rdrand_definition()')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+;; declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+;; declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readonly
+
+define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+;; declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readonly
+;; declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readonly
+
+define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+;; nothing to define...
+', `
+
+define(`expand_4to8', `
+  %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+')
+define(`extract_4from8', `
+  %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone {
+  expand_4to8(i16, v4, v)
+  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  extract_4from8(float, r, ret)
+  ret <4 x float> %ret
+}
+
+define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone {
+  expand_4to8(float, v4, v)
+  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  extract_4from8(i16, r, ret)
+  ret <4 x i16> %ret
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                     i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                     i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+declare void @llvm.trap() noreturn nounwind
+
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + +gen_gather(i8) +gen_gather(i16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + +declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <4 x i32> @__gather_base_offsets32_i32(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x i32> %vecmask, i8 %scale8) + ret <4 x i32> %v +} + + +define <4 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x i32> %vecmask, i8 %scale8) + + ret <4 x i32> %v +} + + +define <4 x i32> @__gather32_i32(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * null, + <4 x i32> %ptrs, <4 x i32> %vecmask, i8 1) + + ret <4 x i32> %v +} + + +define <4 x i32> @__gather64_i32(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs, <4 x i32> %vecmask, i8 1) + + ret <4 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %target, i8 * %ptr, + <4 x i32> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <4 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x float> %mask, i8 %scale8) + + ret <4 x float> %v +} + + +define <4 x float> @__gather_base_offsets64_float(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x float> %mask, i8 %scale8) + + ret <4 x float> %v +} + + +define <4 x float> @__gather32_float(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 
to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * null, + <4 x i32> %ptrs, <4 x float> %mask, i8 1) + + ret <4 x float> %v +} + + +define <4 x float> @__gather64_float(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs, <4 x float> %mask, i8 1) + + ret <4 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <4 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x i64> %vecmask, i8 %scale8) + + ret <4 x i64> %v +} + + +define <4 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x i64> %vecmask, i8 %scale8) + + ret <4 x i64> %v +} + + +define <4 x i64> @__gather32_i64(<4 x i32> %ptrs, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + + %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs, <4 x i64> %vecmask, i8 1) + ret <4 x i64> %v +} + + +define <4 x i64> @__gather64_i64(<4 x i64> %ptrs, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs, <4 x i64> %vecmask, i8 1) + ret <4 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <4 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x double> %vecmask, i8 %scale8) + ret <4 x double> %v +} + +define <4 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x double> %vecmask, i8 %scale8) + + ret <4 x double> %v +} + +define <4 x double> @__gather32_double(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly 
alwaysinline { + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs, <4 x double> %vecmask, i8 1) + + ret <4 x double> %v +} + +define <4 x double> @__gather64_double(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs, <4 x double> %vecmask, i8 1) + + ret <4 x double> %v +} + +') diff --git a/ispc.cpp b/ispc.cpp index 41adffe4..db4c161a 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -507,6 +507,25 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #if !defined(LLVM_3_1) // LLVM 3.2+ only this->m_hasRand = true; +#endif + } + else if (!strcasecmp(isa, "avx1.1-i64x4")) { + this->m_isa = Target::AVX11; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_vectorWidth = 4; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 64; + this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only + this->m_hasRand = true; #endif } else if (!strcasecmp(isa, "avx2") || @@ -555,6 +574,29 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; +#endif + } + else if (!strcasecmp(isa, "avx2-i64x4")) { + this->m_isa = Target::AVX2; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_vectorWidth = 4; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif +#ifndef LLVM_3_1 + ",+fma" +#endif // !LLVM_3_1 + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 64; + this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only + this->m_hasRand = true; + this->m_hasGather = true; #endif } #ifdef ISPC_ARM_ENABLED @@ -715,8 +757,8 @@ Target::SupportedTargets() { "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " "avx1-i32x8, avx1-i32x16, avx1-i64x4, " - "avx1.1-i32x8, avx1.1-i32x16, " - "avx2-i32x8, avx2-i32x16, " + "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 " + "avx2-i32x8, avx2-i32x16, avx2-i64x4, " "generic-x1, generic-x4, generic-x8, generic-x16, " "generic-x32, generic-x64"; } From fb1a2a0a40f1ae3436fa2ed31c2daf8d90087e2b Mon Sep 17 00:00:00 2001 From: evghenii Date: Tue, 15 Oct 2013 17:10:46 +0300 Subject: [PATCH 088/159] __masked_store_* uses vscatter now, and is thread-safe --- examples/intrinsics/knc-i1x16.h | 51 +++++++++++---------------------- 1 file changed, 17 insertions(+), 34 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 78d35ddc..d6bf6fd5 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -2009,43 +2009,37 @@ static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, ptr[i] = val[i]; } -static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) +static FORCEINLINE void __masked_store_i32(void *p, const __vec16_i32 val, const __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY _mm512_mask_store_epi32(p, mask, val.v); #else - __vec16_i32 tmp; - tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, 
(uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); - _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(p, mask, __ispc_stride1, val, _MM_DOWNCONV_EPI32_NONE, _MM_SCALE_4, _MM_HINT_NONE); #endif } -static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, __vec16_i1 mask) +static FORCEINLINE void __masked_store_float(void *p, const __vec16_f val, const __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY _mm512_mask_store_ps(p, mask, val.v); #else - __vec16_f tmp; - tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); - _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(p, mask, __ispc_stride1, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_4, _MM_HINT_NONE); #endif } -static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, - __vec16_i1 mask) { - int64_t *ptr = (int64_t *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; +static FORCEINLINE void __masked_store_i64(void *p, const __vec16_i64 val, const __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_epi64(p, mask, val.v1); + _mm512_mask_store_epi64((uint8_t*)p+64, tmp_m, val.v2); +#else + _mm512_mask_i32loextscatter_epi64( p, mask, __ispc_stride1, val.v1, _MM_DOWNCONV_EPI64_NONE, _MM_SCALE_8, _MM_HINT_NONE); + _mm512_mask_i32loextscatter_epi64((int64_t*)p+8, _mm512_kswapb(mask,mask), __ispc_stride1, val.v2, _MM_DOWNCONV_EPI64_NONE, _MM_SCALE_8, _MM_HINT_NONE); +#endif } -static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 mask) +static FORCEINLINE void __masked_store_double(void *p, const __vec16_d val, const __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY __vec16_i1 tmp_m = mask; @@ -2053,19 +2047,8 @@ static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 _mm512_mask_store_pd(p, mask, val.v1); _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); #else - __vec16_d tmp; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); - tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); - _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_mask_i32loextscatter_pd( p, mask, __ispc_stride1, val.v1, _MM_DOWNCONV_PD_NONE, _MM_SCALE_8, _MM_HINT_NONE); + _mm512_mask_i32loextscatter_pd((double*)p+8, 
_mm512_kswapb(mask,mask), __ispc_stride1, val.v2, _MM_DOWNCONV_PD_NONE, _MM_SCALE_8, _MM_HINT_NONE); #endif } From 1710b9171fa54f40165a0917ec764e70951b9841 Mon Sep 17 00:00:00 2001 From: egaburov Date: Fri, 18 Oct 2013 08:53:01 +0200 Subject: [PATCH 089/159] removed LLVM_3_0 legacy part and changed copyright to 2013 --- builtins/target-avx11-i64x4.ll | 10 ++-------- builtins/target-avx2-i64x4.ll | 22 ++++------------------ 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/builtins/target-avx11-i64x4.ll b/builtins/target-avx11-i64x4.ll index aae612bb..8fe75266 100644 --- a/builtins/target-avx11-i64x4.ll +++ b/builtins/target-avx11-i64x4.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2012, Intel Corporation +;; Copyright (c) 2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -31,8 +31,7 @@ include(`target-avx1-i64x4base.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', +ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -74,10 +73,6 @@ gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions -ifelse(LLVM_VERSION, `LLVM_3_0', ` -;; nothing to define... -', ` - define(`expand_4to8', ` %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> ') @@ -123,4 +118,3 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } -') diff --git a/builtins/target-avx2-i64x4.ll b/builtins/target-avx2-i64x4.ll index cdd10386..d74f32dc 100644 --- a/builtins/target-avx2-i64x4.ll +++ b/builtins/target-avx2-i64x4.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2012, Intel Corporation +;; Copyright (c) 2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -29,14 +29,12 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -ifelse(LLVM_VERSION, `LLVM_3_0', `', - LLVM_VERSION, `LLVM_3_1', `', +ifelse(LLVM_VERSION, `LLVM_3_1', `', `define(`HAVE_GATHER', `1')') include(`target-avx1-i64x4base.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', +ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -77,10 +75,6 @@ define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly a -ifelse(LLVM_VERSION, `LLVM_3_0', ` -;; nothing to define... 
-', `
-

 define(`expand_4to8', `
  %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef,
                      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
')
@@ -126,7 +120,6 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
   %r = extractelement <8 x i16> %rv, i32 0
   ret i16 %r
 }
-')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

declare void @llvm.trap() noreturn nounwind

-ifelse(LLVM_VERSION, `LLVM_3_0', `
-gen_gather_factored(i8)
-gen_gather_factored(i16)
-gen_gather_factored(i32)
-gen_gather_factored(float)
-gen_gather_factored(i64)
-gen_gather_factored(double)',
-LLVM_VERSION, `LLVM_3_1', `
+ifelse(LLVM_VERSION, `LLVM_3_1', `
 gen_gather_factored(i8)
 gen_gather_factored(i16)
 gen_gather_factored(i32)
From 2e724b095e030b2d548758b965529735f184ca43 Mon Sep 17 00:00:00 2001
From: Ilia Filippov
Date: Mon, 7 Oct 2013 15:43:31 +0400
Subject: [PATCH 090/159] support for operators
---
 expr.cpp              | 135 ++++++++++++++++++++++++++++++------------
 expr.h                |   2 +
 lex.ll                |   8 +++
 parse.yy              |  14 ++---
 tests/operators.ispc  |  70 ++++++++++++++++++++++
 tests/operators1.ispc |  64 ++++++++++++++++++++
 tests/operators2.ispc |  51 ++++++++++++++++
 7 files changed, 299 insertions(+), 45 deletions(-)
 create mode 100644 tests/operators.ispc
 create mode 100644 tests/operators1.ispc
 create mode 100644 tests/operators2.ispc

diff --git a/expr.cpp b/expr.cpp
index 614cb5e5..c92503e0 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -1660,6 +1660,64 @@ BinaryExpr::BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p)
     arg1 = b;
 }

+Expr *lCreateBinaryOperatorCall(const BinaryExpr::Op bop,
+                                Expr *a0, Expr *a1,
+                                const SourcePos &sp)
+{
+    if ((a0 == NULL) || (a1 == NULL)) {
+        return NULL;
+    }
+    Expr *arg0 = a0->TypeCheck();
+    Expr *arg1 = a1->TypeCheck();
+    if ((arg0 == NULL) || (arg1 == NULL)) {
+        return NULL;
+    }
+    const Type *type0 = arg0->GetType();
+    const Type *type1 = arg1->GetType();
+
+    // If either operand is a reference, dereference it before we move
+    // forward
+    if (CastType<ReferenceType>(type0) != NULL) {
+        arg0 = new RefDerefExpr(arg0, arg0->pos);
+        type0 = arg0->GetType();
+    }
+    if (CastType<ReferenceType>(type1) != NULL) {
+        arg1 = new RefDerefExpr(arg1, arg1->pos);
+        type1 = arg1->GetType();
+    }
+    if ((type0 == NULL) || (type1 == NULL)) {
+        return NULL;
+    }
+    if (CastType<StructType>(type0) != NULL ||
+        CastType<StructType>(type1) != NULL) {
+        std::string opName = std::string("operator") + lOpString(bop);
+        std::vector<Symbol *> funs;
+        m->symbolTable->LookupFunction(opName.c_str(), &funs);
+        if (funs.size() == 0) {
+            Error(sp, "operator %s(%s, %s) is not defined.",
+                  opName.c_str(), (type0->GetString()).c_str(), (type1->GetString()).c_str());
+            return NULL;
+        }
+        Expr *func = new FunctionSymbolExpr(opName.c_str(), funs, sp);
+        ExprList *args = new ExprList(sp);
+        args->exprs.push_back(arg0);
+        args->exprs.push_back(arg1);
+        Expr *opCallExpr = new FunctionCallExpr(func, args, sp);
+        return opCallExpr;
+    }
+    return NULL;
+}
+
+
+Expr * MakeBinaryExpr(BinaryExpr::Op o, Expr *a, Expr *b, SourcePos p) {
+    Expr * op = lCreateBinaryOperatorCall(o, a, b, p);
+    if (op != NULL) {
+        return op;
+    }
+    op = new BinaryExpr(o, a, b, p);
+    return op;
+}
+
 /** Emit code for a && or || logical operator.
    In particular, the code here handles "short-circuit" evaluation, where the second expression
@@ -2985,29 +3043,10 @@ AssignExpr::TypeCheck() {
     if (lvalueIsReference)
         lvalue = new RefDerefExpr(lvalue, lvalue->pos);

-    FunctionSymbolExpr *fse;
-    if ((fse = dynamic_cast<FunctionSymbolExpr *>(rvalue)) != NULL) {
-        // Special case to use the type of the LHS to resolve function
-        // overloads when we're assigning a function pointer where the
-        // function is overloaded.
-        const Type *lvalueType = lvalue->GetType();
-        const FunctionType *ftype;
-        if (CastType<PointerType>(lvalueType) == NULL ||
-            (ftype = CastType<FunctionType>(lvalueType->GetBaseType())) == NULL) {
-            Error(lvalue->pos, "Can't assign function pointer to type \"%s\".",
-                  lvalueType ? lvalueType->GetString().c_str() : "<unknown>");
-            return NULL;
-        }
-
-        std::vector<const Type *> paramTypes;
-        for (int i = 0; i < ftype->GetNumParameters(); ++i)
-            paramTypes.push_back(ftype->GetParameterType(i));
-
-        if (!fse->ResolveOverloads(rvalue->pos, paramTypes)) {
-            Error(pos, "Unable to find overloaded function for function "
-                  "pointer assignment.");
-            return NULL;
-        }
+    if (PossiblyResolveFunctionOverloads(rvalue, lvalue->GetType()) == false) {
+        Error(pos, "Unable to find overloaded function for function "
+              "pointer assignment.");
+        return NULL;
     }

     const Type *lhsType = lvalue->GetType();
@@ -3650,10 +3689,37 @@ FunctionCallExpr::GetLValue(FunctionEmitContext *ctx) const {
         return NULL;
     }
 }
-
+
+
+bool FullResolveOverloads(Expr * func, ExprList * args,
+                          std::vector<const Type *> *argTypes,
+                          std::vector<bool> *argCouldBeNULL,
+                          std::vector<bool> *argIsConstant) {
+    for (unsigned int i = 0; i < args->exprs.size(); ++i) {
+        Expr *expr = args->exprs[i];
+        if (expr == NULL)
+            return false;
+        const Type *t = expr->GetType();
+        if (t == NULL)
+            return false;
+        argTypes->push_back(t);
+        argCouldBeNULL->push_back(lIsAllIntZeros(expr) || dynamic_cast<NullPointerExpr *>(expr));
+        argIsConstant->push_back(dynamic_cast<ConstExpr *>(expr) || dynamic_cast<NullPointerExpr *>(expr));
+    }
+    return true;
+}
+
 const Type *
 FunctionCallExpr::GetType() const {
+    std::vector<const Type *> argTypes;
+    std::vector<bool> argCouldBeNULL, argIsConstant;
+    if (FullResolveOverloads(func, args, &argTypes, &argCouldBeNULL, &argIsConstant) == true) {
+        FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
+        if (fse != NULL) {
+            fse->ResolveOverloads(args->pos, argTypes, &argCouldBeNULL, &argIsConstant);
+        }
+    }
     const FunctionType *ftype = lGetFunctionType(func);
     return ftype ? ftype->GetReturnType() : NULL;
 }
@@ -3689,20 +3755,9 @@ FunctionCallExpr::TypeCheck() {

     std::vector<const Type *> argTypes;
     std::vector<bool> argCouldBeNULL, argIsConstant;
-    for (unsigned int i = 0; i < args->exprs.size(); ++i) {
-        Expr *expr = args->exprs[i];
-        if (expr == NULL)
-            return NULL;
-        const Type *t = expr->GetType();
-        if (t == NULL)
-            return NULL;
-
-        argTypes.push_back(t);
-        argCouldBeNULL.push_back(lIsAllIntZeros(expr) ||
-                                 dynamic_cast<NullPointerExpr *>(expr));
-        argIsConstant.push_back(dynamic_cast<ConstExpr *>(expr) ||
-                                dynamic_cast<NullPointerExpr *>(expr));
+    if (FullResolveOverloads(func, args, &argTypes, &argCouldBeNULL, &argIsConstant) == false) {
+        return NULL;
     }

     FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
@@ -7010,7 +7065,8 @@ TypeCastExpr::GetLValue(FunctionEmitContext *ctx) const {

 const Type *
 TypeCastExpr::GetType() const {
-    AssertPos(pos, type->HasUnboundVariability() == false);
+    // This assert has to be switched off now that operator overloading is
+    // supported.
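+    // (Editorial note, not part of the original patch: MakeBinaryExpr() now
+    // calls TypeCheck()/GetType() on its operands from the parser actions in
+    // parse.yy, i.e. before unbound variability has been resolved.  So for,
+    // e.g., "(float)x + y", the cast's GetType() can now legitimately be
+    // queried while "float" still has unbound variability.)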
+ //AssertPos(pos, type->HasUnboundVariability() == false); return type; } @@ -8190,6 +8246,9 @@ FunctionSymbolExpr::ResolveOverloads(SourcePos argPos, const std::vector *argCouldBeNULL, const std::vector *argIsConstant) { const char *funName = candidateFunctions.front()->name.c_str(); + if (triedToResolve == true) { + return true; + } triedToResolve = true; diff --git a/expr.h b/expr.h index 42fdff45..f8b96abd 100644 --- a/expr.h +++ b/expr.h @@ -730,6 +730,8 @@ bool CanConvertTypes(const Type *fromType, const Type *toType, */ Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase); +Expr * MakeBinaryExpr(BinaryExpr::Op o, Expr *a, Expr *b, SourcePos p); + /** Utility routine that emits code to initialize a symbol given an initializer expression. diff --git a/lex.ll b/lex.ll index 3655220f..87a80145 100644 --- a/lex.ll +++ b/lex.ll @@ -419,6 +419,14 @@ while { RT; return TOKEN_WHILE; } \"C\" { RT; return TOKEN_STRING_C_LITERAL; } \.\.\. { RT; return TOKEN_DOTDOTDOT; } +"operator*" { return TOKEN_IDENTIFIER; } +"operator+" { return TOKEN_IDENTIFIER; } +"operator-" { return TOKEN_IDENTIFIER; } +"operator<<" { return TOKEN_IDENTIFIER; } +"operator>>" { return TOKEN_IDENTIFIER; } +"operator/" { return TOKEN_IDENTIFIER; } +"operator%" { return TOKEN_IDENTIFIER; } + L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; } {IDENT} { diff --git a/parse.yy b/parse.yy index 933a3455..38c5ba77 100644 --- a/parse.yy +++ b/parse.yy @@ -468,27 +468,27 @@ cast_expression multiplicative_expression : cast_expression | multiplicative_expression '*' cast_expression - { $$ = new BinaryExpr(BinaryExpr::Mul, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Mul, $1, $3, Union(@1, @3)); } | multiplicative_expression '/' cast_expression - { $$ = new BinaryExpr(BinaryExpr::Div, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Div, $1, $3, Union(@1, @3)); } | multiplicative_expression '%' cast_expression - { $$ = new BinaryExpr(BinaryExpr::Mod, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Mod, $1, $3, Union(@1, @3)); } ; additive_expression : multiplicative_expression | additive_expression '+' multiplicative_expression - { $$ = new BinaryExpr(BinaryExpr::Add, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Add, $1, $3, Union(@1, @3)); } | additive_expression '-' multiplicative_expression - { $$ = new BinaryExpr(BinaryExpr::Sub, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Sub, $1, $3, Union(@1, @3)); } ; shift_expression : additive_expression | shift_expression TOKEN_LEFT_OP additive_expression - { $$ = new BinaryExpr(BinaryExpr::Shl, $1, $3, Union(@1,@3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Shl, $1, $3, Union(@1, @3)); } | shift_expression TOKEN_RIGHT_OP additive_expression - { $$ = new BinaryExpr(BinaryExpr::Shr, $1, $3, Union(@1,@3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Shr, $1, $3, Union(@1, @3)); } ; relational_expression diff --git a/tests/operators.ispc b/tests/operators.ispc new file mode 100644 index 00000000..95502bdd --- /dev/null +++ b/tests/operators.ispc @@ -0,0 +1,70 @@ + +export uniform int width() { return programCount; } + +struct S { + float a; +}; + +// References "struct&" were put in random order to test them. 
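+// (Clarifying note, not part of the original test: with this patch, a binary
+// expression on struct operands, e.g. "a * b" below, is rewritten by
+// MakeBinaryExpr() into the overloaded call "operator*(a, b)", so each
+// definition that follows is the function such an expression dispatches to.)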
+struct S operator*(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator/(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a - rr.a + 2; + return c; +} + +struct S operator%(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator+(struct S rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator-(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator>>(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator<<(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S a, a1; +struct S b, b1; +struct S d1, d2, d3, d4, d5, d6, d7; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + a.a = aFOO[programIndex]; + b.a = -aFOO[programIndex]; + d1 = a * b; + d2 = a / b; + d3 = a % b; + d4 = a + b; + d5 = a - b; + d6 = a >> b; + d7 = a << b; + RET[programIndex] = d1.a + d2.a + d3.a + d4.a + d5.a + d6.a + d7.a; +} + +export void result(uniform float RET[4]) { + RET[programIndex] = 14; +} diff --git a/tests/operators1.ispc b/tests/operators1.ispc new file mode 100644 index 00000000..f52c4c35 --- /dev/null +++ b/tests/operators1.ispc @@ -0,0 +1,64 @@ + +export uniform int width() { return programCount; } + +struct S { + float a; +}; + +// References "struct&" were put in random order to test them. +struct S operator*(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator/(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a - rr.a + 2; + return c; +} + +struct S operator%(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator+(struct S rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator-(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator>>(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator<<(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S a; +struct S b; +struct S d; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + a.a = 5; + b.a = -5; + d = a * b + b / a - a << (b - b) - a; + RET[programIndex] = d.a; +} + +export void result(uniform float RET[4]) { + RET[programIndex] = 12; +} diff --git a/tests/operators2.ispc b/tests/operators2.ispc new file mode 100644 index 00000000..b732b24a --- /dev/null +++ b/tests/operators2.ispc @@ -0,0 +1,51 @@ +int off; + +export uniform int width() { return programCount; } + +struct S { + float a; +}; + +struct S operator+(struct S rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + if (off == 1) + c.a = 22; + return c; +} + +struct S operator/(struct S rr, struct S rv) { + struct S c; + c.a = rr.a + rv.a + 10; + if (off == 1) + c.a = 33; + return c; +} + +struct S a; +struct S b; +struct S d; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int T = programIndex; + a.a = aFOO[programIndex]; + b.a = -aFOO[programIndex]; + if (programIndex == 3) + off = 1; + else + off = 0; + if (T % 2) + d = a + b; + else + d = a / b; + + RET[programIndex] = d.a; +} + +export void result(uniform float RET[4]) { + if (programIndex % 2) + RET[programIndex] = 2; + else + RET[programIndex] = 10; + RET[3] = 22; +} From 2117002c0129fe54a77e0138bfb715a246fd1121 Mon Sep 17 00:00:00 
2001 From: Dmitry Babokin Date: Thu, 17 Oct 2013 23:35:21 +0400 Subject: [PATCH 091/159] Adding testing support for avx1.1-i64x4 and avx2-i64x4 targets --- alloy.py | 12 ++++++------ run_tests.py | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/alloy.py b/alloy.py index 19497b35..cdfc9127 100755 --- a/alloy.py +++ b/alloy.py @@ -212,10 +212,10 @@ def check_targets(): answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if AVX11 == False and "rdrand" in f_lines[i]: AVX11 = True; - answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16", "avx1.1-i64x4"] if AVX2 == False and "avx2" in f_lines[i]: AVX2 = True; - answer = answer + ["avx2-i32x8", "avx2-i32x16"] + answer = answer + ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] if current_OS == "MacOS": f_lines = take_lines("sysctl machdep.cpu.features", "first") if "SSE2" in f_lines: @@ -229,10 +229,10 @@ def check_targets(): answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if "RDRAND" in f_lines: AVX11 = True; - answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16", "avx1.1-i64x4"] if "AVX2.0" in f_lines: AVX2 = True; - answer = answer + ["avx2-i32x8", "avx2-i32x16"] + answer = answer + ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] # now check what targets we have with the help of SDE @@ -257,9 +257,9 @@ def check_targets(): if AVX == False and "snb" in f_lines[i]: answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] if AVX11 == False and "ivb" in f_lines[i]: - answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"], ["-ivb", "avx1.1-i64x4"]] if AVX2 == False and "hsw" in f_lines[i]: - answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] + answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"], ["-hsw", "avx2-i64x4"]] return [answer, answer_sde] def build_ispc(version_LLVM, make): diff --git a/run_tests.py b/run_tests.py index b5391e1f..40851a40 100755 --- a/run_tests.py +++ b/run_tests.py @@ -454,8 +454,9 @@ def verify(): check = [["g++", "clang++", "cl"],["-O0", "-O2"],["x86","x86-64"], ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", - "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", - "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", + "avx1.1-i32x16", "avx1.1-i64x4", "avx2-i32x8", "avx2-i32x16", "avx2-i64x4", + "generic-1", "generic-4", "generic-8", "generic-16", "generic-32", "generic-64"]] for i in range (0,len(f_lines)): if f_lines[i][0] == "%": From 1bd5b704c639595f9081e3d75da13a76c9621291 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 18 Oct 2013 01:15:35 +0400 Subject: [PATCH 092/159] Adding support for build on Windows for avx1.1-i64x4 and avx2-i64x4 --- ispc.vcxproj | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/ispc.vcxproj b/ispc.vcxproj index 58fa5b08..b9a3b6c5 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -28,10 +28,14 @@ + + + + @@ -323,6 +327,24 @@ Building gen-bitcode-avx11-x2-64bit.cpp + + + Document + m4 
-Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp + $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx11-i64x4-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx11-i64x4-64bit.cpp + + Document @@ -359,6 +381,24 @@ Building gen-bitcode-avx2-x2-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp + $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx2-i64x4-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx2-i64x4-64bit.cpp + + Document From 6244902931ddc9c2e1a272954d5339cfbc328414 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 18 Oct 2013 01:16:25 +0400 Subject: [PATCH 093/159] Updating fail_db with new Windows fails --- fail_db.txt | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index f1aaaab2..9c43c7f0 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -1025,3 +1025,38 @@ ./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * 
+.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.4 cl -O2 * From c378429ffba43c51e60337e05cb45eecfafbb144 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 18 Oct 2013 19:45:39 +0400 Subject: [PATCH 094/159] time in alloy --- alloy.py | 31 ++++++++++++++++++++++++------- run_tests.py | 7 ++++--- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/alloy.py b/alloy.py index dbdc40aa..f18acf71 100755 --- a/alloy.py +++ b/alloy.py @@ -274,27 +274,38 @@ def build_ispc(version_LLVM, make): def execute_stability(stability, R, print_version): stability1 = copy.deepcopy(stability) - temp = run_tests.run_tests(stability1, [], print_version) + b_temp = run_tests.run_tests(stability1, [], print_version) + temp = b_temp[0] + time = b_temp[1] for j in range(0,4): R[j][0] = R[j][0] + temp[j] for i in range(0,len(temp[j])): R[j][1].append(temp[4]) number_of_fails = temp[5] number_of_new_fails = len(temp[0]) + len(temp[1]) + number_of_passes = len(temp[2]) + len(temp[3]) if number_of_fails == 0: str_fails = ". No fails" else: str_fails = ". Fails: " + str(number_of_fails) if number_of_new_fails == 0: - str_new_fails = ", No new fails.\n" + str_new_fails = ", No new fails" else: - str_new_fails = ", New fails: " + str(number_of_new_fails) + ".\n" - print_debug(temp[4][1:-3] + str_fails + str_new_fails, False, stability_log) + str_new_fails = ", New fails: " + str(number_of_new_fails) + if number_of_passes == 0: + str_new_passes = "." + else: + str_new_passes = ", " + str(number_of_passes) + " new passes." 
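+    # (Illustrative note, not part of the original patch: the strings above
+    # combine into one summary line per tested configuration, e.g.
+    #   "<target info>. Fails: 3, New fails: 1, 2 new passes. 00h07m42sec."
+    # for a run with 3 known fails, 1 new fail and 2 new passes.)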
+ if stability.time: + str_time = " " + time + "\n" + else: + str_time = "\n" + print_debug(temp[4][1:-3] + str_fails + str_new_fails + str_new_passes + str_time, False, stability_log) def run_special_tests(): i = 5 -def validation_run(only, only_targets, reference_branch, number, notify, update, speed_number, make, perf_llvm): +def validation_run(only, only_targets, reference_branch, number, notify, update, speed_number, make, perf_llvm, time): os.chdir(os.environ["ISPC_HOME"]) os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": @@ -322,7 +333,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, stability.compiler_exe = None stability.num_jobs = speed_number stability.verbose = False - stability.time = False + stability.time = time stability.non_interactive = True stability.update = update stability.include_file = None @@ -572,13 +583,17 @@ def Main(): if options.branch == "master": options.branch = "trunk" try: + start_time = time.time() if options.build_llvm: build_LLVM(options.version, options.revision, options.folder, options.tarball, options.debug, options.selfbuild, options.extra, False, options.force, make) if options.validation_run: validation_run(options.only, options.only_targets, options.branch, options.number_for_performance, options.notify, options.update, int(options.speed), - make, options.perf_llvm) + make, options.perf_llvm, options.time) + elapsed_time = time.time() - start_time + if options.time: + print_debug("Elapsed time: " + time.strftime('%Hh%Mm%Ssec.', time.gmtime(elapsed_time)) + "\n", False, "") finally: os.chdir(current_path) date_name = "alloy_results_" + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') @@ -668,6 +683,8 @@ run_group.add_option('--update-errors', dest='update', run_group.add_option('--only-targets', dest='only_targets', help='set list of targets to test. Possible values - all subnames of targets.', default="") +run_group.add_option('--time', dest='time', + help='display time of testing', default=False, action='store_true') run_group.add_option('--only', dest='only', help='set types of tests. 
Possible values:\n' +
                  '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' +
diff --git a/run_tests.py b/run_tests.py
index bf4f5d8a..e53f6419 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -649,7 +649,8 @@ def run_tests(options1, args, print_version):
     if options.non_interactive == False:
         print_debug("\n", s, run_tests_log)

-    elapsed_time = time.time() - start_time
+    temp_time = (time.time() - start_time)
+    elapsed_time = time.strftime('%Hh%Mm%Ssec.', time.gmtime(temp_time))

     while not qret.empty():
         (c, r, skip) = qret.get()
@@ -684,9 +685,9 @@ def run_tests(options1, args, print_version):
             R = 0

     if options.time:
-        print_debug("Elapsed time: %d s\n" % elapsed_time, s, run_tests_log)
+        print_debug("Elapsed time: " + elapsed_time + "\n", s, run_tests_log)

-    return R
+    return [R, elapsed_time]

 from optparse import OptionParser
From d72590ede6c18cb06141f0f5c640eefa9ffd6701 Mon Sep 17 00:00:00 2001
From: Ilia Filippov
Date: Mon, 21 Oct 2013 12:35:53 +0400
Subject: [PATCH 095/159] correction of errors in generic targets after
 operators support
---
 cbackend.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/cbackend.cpp b/cbackend.cpp
index 7d4b4cfc..814a4016 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -558,8 +558,15 @@ char CWriter::ID = 0;
 static std::string CBEMangle(const std::string &S) {
   std::string Result;

-  for (unsigned i = 0, e = S.size(); i != e; ++i)
-    if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
+  for (unsigned i = 0, e = S.size(); i != e; ++i) {
+    if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') ||
+                     (S[i] == '<' && S[i+1] == '<'))) {
+      Result += '_';
+      Result += 'A'+(S[i]&15);
+      Result += 'A'+((S[i]>>4)&15);
+      Result += '_';
+      i++;
+    } else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
       Result += S[i];
     } else {
       Result += '_';
@@ -567,6 +574,7 @@ static std::string CBEMangle(const std::string &S) {
       Result += 'A'+((S[i]>>4)&15);
       Result += '_';
     }
+  }
   return Result;
 }
From 899f85ce9c75f7d545da1233b091dfbe8ff304bf Mon Sep 17 00:00:00 2001
From: "james.brodman"
Date: Tue, 22 Oct 2013 18:06:54 -0400
Subject: [PATCH 096/159] Initial Support for new stdlib shift operator
---
 builtins.cpp                      |   6 ++
 builtins/target-generic-common.ll |   7 ++
 builtins/util.m4                  |  37 ++++++++
 opt.cpp                           | 137 ++++++++++++++++++++++++++++++
 stdlib.ispc                       |  30 +++++++
 5 files changed, 217 insertions(+)

diff --git a/builtins.cpp b/builtins.cpp
index 43f68833..fa2e7328 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -536,6 +536,12 @@ lSetInternalFunctions(llvm::Module *module) {
         "__set_system_isa",
         "__sext_uniform_bool",
         "__sext_varying_bool",
+        "__shift_double",
+        "__shift_float",
+        "__shift_i16",
+        "__shift_i32",
+        "__shift_i64",
+        "__shift_i8",
         "__shuffle2_double",
         "__shuffle2_float",
         "__shuffle2_i16",
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 2a5d1b32..92b7a18e 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -80,6 +80,13 @@
 declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
 declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
 declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone

+declare <WIDTH x i8> @__shift_i8(<WIDTH x i8>, i32) nounwind readnone
+declare <WIDTH x i16> @__shift_i16(<WIDTH x i16>, i32) nounwind readnone
+declare <WIDTH x float> @__shift_float(<WIDTH x float>, i32) nounwind readnone
+declare <WIDTH x i32> @__shift_i32(<WIDTH x i32>, i32) nounwind readnone
+declare <WIDTH x double> @__shift_double(<WIDTH x double>, i32) nounwind readnone
+declare <WIDTH x i64> @__shift_i64(<WIDTH x i64>, i32) nounwind readnone

 declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
 declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>, <WIDTH x i32>)
nounwind readnone
diff --git a/builtins/util.m4 b/builtins/util.m4
index 68fa818b..4cb46310 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -797,6 +797,43 @@ not_const:
   ret <WIDTH x $1> %result
 }

+define <WIDTH x $1> @__shift_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
+  %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
+  ;; pad the input out to a 2*WIDTH-wide vector whose second half is zero
+  %zeropaddedvec = shufflevector <WIDTH x $1> %0, <WIDTH x $1> zeroinitializer,
+      < forloop(i, 0, eval(2*WIDTH-2), `i32 i, ')i32 eval(2*WIDTH-1) >
+  br i1 %isc, label %is_const, label %not_const

+is_const:
+  ; though verbose, this turns into tight code if %1 is a constant
+forloop(i, 0, eval(WIDTH-1), `
+  %delta_`'i = add i32 %1, i
+  %delta_clamped_`'i = and i32 %delta_`'i, eval(2*WIDTH-1)
+  %v_`'i = extractelement <eval(2*WIDTH) x $1> %zeropaddedvec, i32 %delta_clamped_`'i')
+  %ret_0 = insertelement <WIDTH x $1> zeroinitializer, $1 %v_0, i32 0
+forloop(i, 1, eval(WIDTH-1), `  %ret_`'i = insertelement <WIDTH x $1> %ret_`'eval(i-1), $1 %v_`'i, i32 i
+')
+  ret <WIDTH x $1> %ret_`'eval(WIDTH-1)

+not_const:
+  ; store two instances of the vector into memory
+  %ptr = alloca <WIDTH x $1>, i32 3
+  %ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
+  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr0
+  %ptr1 = getelementptr <WIDTH x $1> * %ptr, i32 1
+  store <WIDTH x $1> %0, <WIDTH x $1> * %ptr1
+  %ptr2 = getelementptr <WIDTH x $1> * %ptr, i32 2
+  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr2

+  ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector
+  %offset = add i32 %1, 16
+  %ptr_as_elt_array = bitcast <WIDTH x $1> * %ptr to [eval(3*WIDTH) x $1] *
+  %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
+  %load_ptr_vec = bitcast $1 * %load_ptr to <WIDTH x $1> *
+  %result = load <WIDTH x $1> * %load_ptr_vec, align $2
+  ret <WIDTH x $1> %result
+}
+
+
 define <WIDTH x $1> @__shuffle_$1(<WIDTH x $1>, <WIDTH x i32>) nounwind readnone alwaysinline {
 forloop(i, 0, eval(WIDTH-1), `  %index_`'i = extractelement <WIDTH x i32> %1, i32 i')
diff --git a/opt.cpp b/opt.cpp
index 75eae20c..0146e7cf 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -72,6 +72,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -124,6 +125,8 @@ static llvm::Pass *CreateMakeInternalFuncsStaticPass();

 static llvm::Pass *CreateDebugPass(char * output);

+static llvm::Pass *CreateReplaceExtractInsertChainsPass();
+
 #define DEBUG_START_PASS(NAME) \
     if (g->debugPrint && \
         (getenv("FUNC") == NULL || \
@@ -635,6 +638,7 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(CreateIsCompileTimeConstantPass(true));
         optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateInstructionSimplifyPass());
+        optPM.add(CreateReplaceExtractInsertChainsPass());

         optPM.add(llvm::createMemCpyOptPass());
         optPM.add(llvm::createSCCPPass());
@@ -4923,3 +4927,136 @@ static llvm::Pass *
 CreatePeepholePass() {
     return new PeepholePass;
 }
+
+///////////////////////////////////////////////////////////////////////////
+// ReplaceExtractInsertChainsPass
+
+/**
+   We occasionally get chains of ExtractElementInsts followed by
+   InsertElementInsts. Unfortunately, not all of these can be replaced by
+   ShuffleVectorInsts, since we don't know at that time which values are
+   constant.
+
+   This Pass will detect such chains, and replace them with ShuffleVectorInsts
+   if all the appropriate values are constant.
+ */
+
+class ReplaceExtractInsertChainsPass : public llvm::BasicBlockPass {
+public:
+    static char ID;
+    ReplaceExtractInsertChainsPass() : BasicBlockPass(ID) {
+    }
+
+    const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+
+};
+
+char ReplaceExtractInsertChainsPass::ID = 0;
+
+#include
+
+/** Given an llvm::Value known to be an integer, return its value as
+    an int64_t.
+*/ +static int64_t +lGetIntValue(llvm::Value *offset) { + llvm::ConstantInt *intOffset = llvm::dyn_cast(offset); + Assert(intOffset && (intOffset->getBitWidth() == 32 || + intOffset->getBitWidth() == 64)); + return intOffset->getSExtValue(); +} + +bool +ReplaceExtractInsertChainsPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("ReplaceExtractInsertChainsPass"); + bool modifiedAny = false; + + // Initialize our mapping to the first spot in the zero vector + int vectorWidth = g->target->getVectorWidth(); + int shuffleMap[vectorWidth]; + for (int i = 0; i < vectorWidth; i++) { + shuffleMap[i] = vectorWidth; + } + + // Hack-y. 16 is likely the upper limit for now. + llvm::SmallSet inserts; + + // save the last Insert in the chain + llvm::Value * lastInsert = NULL; + + for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) { + // Iterate through the instructions looking for InsertElementInsts + llvm::InsertElementInst *ieInst = llvm::dyn_cast(&*i); + if (ieInst == NULL) { + // These aren't the instructions you're looking for. + continue; + } + + llvm::Value * base = ieInst->getOperand(0); + if ( (llvm::isa(base)) + || (llvm::isa(base)) + || (base == lastInsert)) { + // if source for insert scalar is 0 or an EEInst, add insert + llvm::Value *scalar = ieInst->getOperand(1); + if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast(scalar)) { + // We're only going to deal with Inserts into a Constant vector lane + if (llvm::isa(eeInst->getOperand(1))) { + inserts.insert(ieInst); + lastInsert = ieInst; + } + } + else if (llvm::ConstantInt *ci = llvm::dyn_cast(scalar)) { + if (ci->isZero()) { + inserts.insert(ieInst); + lastInsert = ieInst; + } + } + else { + lastInsert = NULL; + } + } + } + + // Look for chains, not insert/shuffle sequences + if (inserts.size() > 1) { + // The vector from which we're extracting elements + llvm::Value * baseVec = NULL; + llvm::Value *ee = llvm::cast((*inserts.begin()))->getOperand(1); + if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast(ee)) { + baseVec = eeInst->getOperand(0); + } + + bool sameBase = true; + for (llvm::SmallSet::iterator i = inserts.begin(); i != inserts.end(); i++) { + llvm::InsertElementInst *ie = llvm::cast(*i); + if (llvm::ExtractElementInst *ee = llvm::dyn_cast(ie->getOperand(1))) { + if (ee->getOperand(0) != baseVec) { + sameBase = false; + break; + } + int64_t from = lGetIntValue(ee->getIndexOperand()); + int64_t to = lGetIntValue(ie->getOperand(2)); + shuffleMap[to] = from; + } + } + if (sameBase) { + llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleMap); + llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shuffleIdxs->getType()); + llvm::Value *shuffle = new llvm::ShuffleVectorInst(baseVec, zeroVec, shuffleIdxs, "shiftInZero", llvm::cast(lastInsert)); + // For now, be lazy and let DCE clean up the Extracts/Inserts. 
+ lastInsert->replaceAllUsesWith(shuffle); + + modifiedAny = true; + } + } + + DEBUG_END_PASS("ReplaceExtractInsertChainsPass"); + + return modifiedAny; +} + + +static llvm::Pass * +CreateReplaceExtractInsertChainsPass() { + return new ReplaceExtractInsertChainsPass(); +} diff --git a/stdlib.ispc b/stdlib.ispc index 9b02d0ba..248f664a 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -170,6 +170,36 @@ static inline int64 rotate(int64 v, uniform int i) { return __rotate_i64(v, i); } +__declspec(safe) +static inline float shift(float v, uniform int i) { + return __shift_float(v, i); +} + +__declspec(safe) +static inline int8 shift(int8 v, uniform int i) { + return __shift_i8(v, i); +} + +__declspec(safe) +static inline int16 shift(int16 v, uniform int i) { + return __shift_i16(v, i); +} + +__declspec(safe) +static inline int32 shift(int32 v, uniform int i) { + return __shift_i32(v, i); +} + +__declspec(safe) +static inline double shift(double v, uniform int i) { + return __shift_double(v, i); +} + +__declspec(safe) +static inline int64 shift(int64 v, uniform int i) { + return __shift_i64(v, i); +} + __declspec(safe) static inline float shuffle(float v, int i) { return __shuffle_float(v, i); From f97a2d68c8e0ae0e10d11b3f08a415685a899f6f Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 22 Oct 2013 18:29:20 -0400 Subject: [PATCH 097/159] Bugfix for non-const shift amt and unit tests. --- builtins/util.m4 | 4 +--- tests/shift-1.ispc | 14 ++++++++++++++ tests/shift-2.ispc | 15 +++++++++++++++ tests/shift-3.ispc | 14 ++++++++++++++ 4 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 tests/shift-1.ispc create mode 100644 tests/shift-2.ispc create mode 100644 tests/shift-3.ispc diff --git a/builtins/util.m4 b/builtins/util.m4 index 4cb46310..c1582e51 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -815,7 +815,6 @@ forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eva ret %ret_`'eval(WIDTH-1) not_const: - ; store two instances of the vector into memory %ptr = alloca , i32 3 %ptr0 = getelementptr * %ptr, i32 0 store zeroinitializer, * %ptr0 @@ -824,8 +823,7 @@ not_const: %ptr2 = getelementptr * %ptr, i32 2 store zeroinitializer, * %ptr2 - ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector - %offset = add i32 %1, 16 + %offset = add i32 %1, WIDTH %ptr_as_elt_array = bitcast * %ptr to [eval(3*WIDTH) x $1] * %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset %load_ptr_vec = bitcast $1 * %load_ptr to * diff --git a/tests/shift-1.ispc b/tests/shift-1.ispc new file mode 100644 index 00000000..2062e36b --- /dev/null +++ b/tests/shift-1.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + int rot = shift(a, -1); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + varying int val = programIndex; + if (val < 0) val = 0; + RET[programIndex] = val; +} diff --git a/tests/shift-2.ispc b/tests/shift-2.ispc new file mode 100644 index 00000000..6cb88e8a --- /dev/null +++ b/tests/shift-2.ispc @@ -0,0 +1,15 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + uniform int delta = b - 6; // -1 + int rot = shift(a, delta); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + varying int val = programIndex; + if (val < 0) 
val = 0; + RET[programIndex] = val; +} diff --git a/tests/shift-3.ispc b/tests/shift-3.ispc new file mode 100644 index 00000000..827d076f --- /dev/null +++ b/tests/shift-3.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + int rot = shift(a, 1); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + varying int val = 2 + programIndex; + if (val > programCount) val = 0; + RET[programIndex] = val; +} From 4d289b16c283ace36aa193817bf1ac16a1fcc364 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Wed, 23 Oct 2013 14:25:43 -0400 Subject: [PATCH 098/159] Redesign after being hit with the KISS bat. --- builtins/util.m4 | 17 ----- opt.cpp | 168 +++++++++++++++++------------------------------ stdlib.ispc | 36 ++++++++-- 3 files changed, 92 insertions(+), 129 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index c1582e51..0e017322 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -798,23 +798,6 @@ not_const: } define @__shift_$1(, i32) nounwind readnone alwaysinline { - %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1) - %zeropaddedvec = shufflevector %0, zeroinitializer, - < forloop(i, 0, eval(2*WIDTH-2), `i32 i, ')i32 eval(2*WIDTH-1) > - br i1 %isc, label %is_const, label %not_const - -is_const: - ; though verbose, this turms into tight code if %1 is a constant -forloop(i, 0, eval(WIDTH-1), ` - %delta_`'i = add i32 %1, i - %delta_clamped_`'i = and i32 %delta_`'i, eval(2*WIDTH-1) - %v_`'i = extractelement %zeropaddedvec, i32 %delta_clamped_`'i') - %ret_0 = insertelement zeroinitializer, $1 %v_0, i32 0 -forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i -') - ret %ret_`'eval(WIDTH-1) - -not_const: %ptr = alloca , i32 3 %ptr0 = getelementptr * %ptr, i32 0 store zeroinitializer, * %ptr0 diff --git a/opt.cpp b/opt.cpp index 0146e7cf..b1a22a1c 100644 --- a/opt.cpp +++ b/opt.cpp @@ -125,7 +125,7 @@ static llvm::Pass *CreateMakeInternalFuncsStaticPass(); static llvm::Pass *CreateDebugPass(char * output); -static llvm::Pass *CreateReplaceExtractInsertChainsPass(); +static llvm::Pass *CreateReplaceStdlibShiftPass(); #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ @@ -524,6 +524,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createPromoteMemoryToRegisterPass()); optPM.add(llvm::createAggressiveDCEPass()); + if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { optPM.add(llvm::createInstructionCombiningPass(), 210); @@ -535,6 +536,9 @@ Optimize(llvm::Module *module, int optLevel) { } optPM.add(llvm::createDeadInstEliminationPass(), 220); + optPM.add(llvm::createIPConstantPropagationPass()); + optPM.add(CreateReplaceStdlibShiftPass()); + // Max struct size threshold for scalar replacement is // 1) 4 fields (r,g,b,w) // 2) field size: vectorWidth * sizeof(float) @@ -638,7 +642,6 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); optPM.add(CreateInstructionSimplifyPass()); - optPM.add(CreateReplaceExtractInsertChainsPass()); optPM.add(llvm::createMemCpyOptPass()); optPM.add(llvm::createSCCPPass()); @@ -4883,6 +4886,7 @@ lMatchAvgDownInt16(llvm::Value *inst) { } #endif // !LLVM_3_1 && !LLVM_3_2 + bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("PeepholePass"); @@ -4928,31 +4932,6 @@ 
CreatePeepholePass() { return new PeepholePass; } -/////////////////////////////////////////////////////////////////////////// -// ReplaceExtractInsertChainsPass - -/** - We occassionally get chains of ExtractElementInsts followed by - InsertElementInsts. Unfortunately, all of these can't be replaced by - ShuffleVectorInsts as we don't know that things are constant at the time. - - This Pass will detect such chains, and replace them with ShuffleVectorInsts - if all the appropriate values are constant. - */ - -class ReplaceExtractInsertChainsPass : public llvm::BasicBlockPass { -public: - static char ID; - ReplaceExtractInsertChainsPass() : BasicBlockPass(ID) { - } - - const char *getPassName() const { return "Resolve \"replace extract insert chains\""; } - bool runOnBasicBlock(llvm::BasicBlock &BB); - -}; - -char ReplaceExtractInsertChainsPass::ID = 0; - #include /** Given an llvm::Value known to be an integer, return its value as @@ -4966,97 +4945,74 @@ lGetIntValue(llvm::Value *offset) { return intOffset->getSExtValue(); } +/////////////////////////////////////////////////////////////////////////// +// ReplaceStdlibShiftPass + +class ReplaceStdlibShiftPass : public llvm::BasicBlockPass { +public: + static char ID; + ReplaceStdlibShiftPass() : BasicBlockPass(ID) { + } + + const char *getPassName() const { return "Resolve \"replace extract insert chains\""; } + bool runOnBasicBlock(llvm::BasicBlock &BB); + +}; + +char ReplaceStdlibShiftPass::ID = 0; + bool -ReplaceExtractInsertChainsPass::runOnBasicBlock(llvm::BasicBlock &bb) { - DEBUG_START_PASS("ReplaceExtractInsertChainsPass"); +ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("ReplaceStdlibShiftPass"); bool modifiedAny = false; + + llvm::Function *shifts[6]; + shifts[0] = m->module->getFunction("__shift_i8"); + shifts[1] = m->module->getFunction("__shift_i16"); + shifts[2] = m->module->getFunction("__shift_i32"); + shifts[3] = m->module->getFunction("__shift_i64"); + shifts[4] = m->module->getFunction("__shift_float"); + shifts[5] = m->module->getFunction("__shift_double"); - // Initialize our mapping to the first spot in the zero vector - int vectorWidth = g->target->getVectorWidth(); - int shuffleMap[vectorWidth]; - for (int i = 0; i < vectorWidth; i++) { - shuffleMap[i] = vectorWidth; - } + for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { + llvm::Instruction *inst = &*iter; - // Hack-y. 16 is likely the upper limit for now. - llvm::SmallSet inserts; - - // save the last Insert in the chain - llvm::Value * lastInsert = NULL; - - for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) { - // Iterate through the instructions looking for InsertElementInsts - llvm::InsertElementInst *ieInst = llvm::dyn_cast(&*i); - if (ieInst == NULL) { - // These aren't the instructions you're looking for. 
- continue; - } - - llvm::Value * base = ieInst->getOperand(0); - if ( (llvm::isa(base)) - || (llvm::isa(base)) - || (base == lastInsert)) { - // if source for insert scalar is 0 or an EEInst, add insert - llvm::Value *scalar = ieInst->getOperand(1); - if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast(scalar)) { - // We're only going to deal with Inserts into a Constant vector lane - if (llvm::isa(eeInst->getOperand(1))) { - inserts.insert(ieInst); - lastInsert = ieInst; + if (llvm::CallInst *ci = llvm::dyn_cast(inst)) { + llvm::Function *func = ci->getCalledFunction(); + for (int i = 0; i < 6; i++) { + if (shifts[i] == func) { + // we matched a call + llvm::Value *shiftedVec = ci->getArgOperand(0); + llvm::Value *shiftAmt = ci->getArgOperand(1); + if (llvm::isa(shiftAmt)) { + int vectorWidth = g->target->getVectorWidth(); + int shuffleVals[vectorWidth]; + int shiftInt = lGetIntValue(shiftAmt); + for (int i = 0; i < vectorWidth; i++) { + int s = i + shiftInt; + s = (s < 0) ? vectorWidth : s; + s = (s >= vectorWidth) ? vectorWidth : s; + shuffleVals[i] = s; + } + llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals); + llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType()); + llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec, + shuffleIdxs, "vecShift", ci); + ci->replaceAllUsesWith(shuffle); + modifiedAny = true; + } + } } } - else if (llvm::ConstantInt *ci = llvm::dyn_cast(scalar)) { - if (ci->isZero()) { - inserts.insert(ieInst); - lastInsert = ieInst; - } - } - else { - lastInsert = NULL; - } - } } - // Look for chains, not insert/shuffle sequences - if (inserts.size() > 1) { - // The vector from which we're extracting elements - llvm::Value * baseVec = NULL; - llvm::Value *ee = llvm::cast((*inserts.begin()))->getOperand(1); - if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast(ee)) { - baseVec = eeInst->getOperand(0); - } - - bool sameBase = true; - for (llvm::SmallSet::iterator i = inserts.begin(); i != inserts.end(); i++) { - llvm::InsertElementInst *ie = llvm::cast(*i); - if (llvm::ExtractElementInst *ee = llvm::dyn_cast(ie->getOperand(1))) { - if (ee->getOperand(0) != baseVec) { - sameBase = false; - break; - } - int64_t from = lGetIntValue(ee->getIndexOperand()); - int64_t to = lGetIntValue(ie->getOperand(2)); - shuffleMap[to] = from; - } - } - if (sameBase) { - llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleMap); - llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shuffleIdxs->getType()); - llvm::Value *shuffle = new llvm::ShuffleVectorInst(baseVec, zeroVec, shuffleIdxs, "shiftInZero", llvm::cast(lastInsert)); - // For now, be lazy and let DCE clean up the Extracts/Inserts. 
- lastInsert->replaceAllUsesWith(shuffle); - - modifiedAny = true; - } - } - - DEBUG_END_PASS("ReplaceExtractInsertChainsPass"); + DEBUG_END_PASS("ReplaceStdlibShiftPass"); return modifiedAny; } static llvm::Pass * -CreateReplaceExtractInsertChainsPass() { - return new ReplaceExtractInsertChainsPass(); +CreateReplaceStdlibShiftPass() { + return new ReplaceStdlibShiftPass(); } diff --git a/stdlib.ispc b/stdlib.ispc index 248f664a..6768594b 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -172,32 +172,56 @@ static inline int64 rotate(int64 v, uniform int i) { __declspec(safe) static inline float shift(float v, uniform int i) { - return __shift_float(v, i); + varying float result; + unmasked { + result = __shift_float(v, i); + } + return result; } __declspec(safe) static inline int8 shift(int8 v, uniform int i) { - return __shift_i8(v, i); + varying int8 result; + unmasked { + result = __shift_i8(v, i); + } + return result; } __declspec(safe) static inline int16 shift(int16 v, uniform int i) { - return __shift_i16(v, i); + varying int16 result; + unmasked { + result = __shift_i16(v, i); + } + return result; } __declspec(safe) static inline int32 shift(int32 v, uniform int i) { - return __shift_i32(v, i); + varying int32 result; + unmasked { + result = __shift_i32(v, i); + } + return result; } __declspec(safe) static inline double shift(double v, uniform int i) { - return __shift_double(v, i); + varying double result; + unmasked { + result = __shift_double(v, i); + } + return result; } __declspec(safe) static inline int64 shift(int64 v, uniform int i) { - return __shift_i64(v, i); + varying int64 result; + unmasked { + result = __shift_i64(v, i); + } + return result; } __declspec(safe) From c4ad8f6ed4d5f72e43f63b805f557d506f3a54a0 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Wed, 23 Oct 2013 15:51:59 -0400 Subject: [PATCH 099/159] Add docs/generic impls --- docs/ispc.rst | 18 +++++++- examples/intrinsics/sse4.h | 84 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index eac9b24e..a99a3990 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3719,6 +3719,22 @@ the size of the gang (it is masked to ensure valid offsets). double rotate(double value, uniform int offset) +The ``shift()`` function allows each program instance to find the value of +the given value that their neighbor ``offset`` steps away has. This is similar +to ``rotate()`` with the exception that values are not circularly shifted. +Instead, zeroes are shifted in where appropriate. + + +:: + + int8 shift(int8 value, uniform int offset) + int16 shift(int16 value, uniform int offset) + int32 shift(int32 value, uniform int offset) + int64 shift(int64 value, uniform int offset) + float shift(float value, uniform int offset) + double shift(double value, uniform int offset) + + Finally, the ``shuffle()`` functions allow two variants of fully general shuffling of values among the program instances. For the first version, each program instance's value of permutation gives the program instance @@ -3751,7 +3767,7 @@ the last element of ``value1``, etc.) double shuffle(double value0, double value1, int permutation) Finally, there are primitive operations that extract and set values in the -SIMD lanes. You can implement all of the broadcast, rotate, and shuffle +SIMD lanes. You can implement all of the broadcast, rotate, shift, and shuffle operations described above in this section from these routines, though in general, not as efficiently. 
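
As an example of building one of these operations from the per-lane
primitives, the semantics of a four-wide ``shift()`` can be modeled in
scalar C++ (a sketch for illustration only; ``shiftModel`` and the fixed
gang size of 4 are assumptions, not part of the standard library)::

    #include <array>

    // Lane i of the result takes the value that lane i+offset holds;
    // lanes that would read outside the gang are filled with zero.
    std::array<float, 4> shiftModel(const std::array<float, 4> &value,
                                    int offset) {
        std::array<float, 4> result;
        for (int i = 0; i < 4; ++i) {
            int src = i + offset;
            result[i] = (src >= 0 && src < 4) ? value[src] : 0.0f;
        }
        return result;
    }
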
These routines are useful for implementing other reductions and cross-lane communication that isn't included in the diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index ff00d920..d1178751 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -598,6 +598,20 @@ static FORCEINLINE __vec4_i8 __rotate_i8(__vec4_i8 v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_i8 __shift_i8(__vec4_i8 v, int delta) { + int8_t v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0; + return __vec4_i8(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_i8 __shuffle_i8(__vec4_i8 v, __vec4_i32 index) { return __vec4_i8(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -870,6 +884,20 @@ static FORCEINLINE __vec4_i16 __rotate_i16(__vec4_i16 v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_i16 __shift_i16(__vec4_i16 v, int delta) { + int16_t v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0; + return __vec4_i16(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_i16 __shuffle_i16(__vec4_i16 v, __vec4_i32 index) { return __vec4_i16(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -1128,6 +1156,20 @@ static FORCEINLINE __vec4_i32 __rotate_i32(__vec4_i32 v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_i32 __shift_i32(__vec4_i32 v, int delta) { + int32_t v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0; + return __vec4_i32(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_i32 __shuffle_i32(__vec4_i32 v, __vec4_i32 index) { return __vec4_i32(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -1403,6 +1445,20 @@ static FORCEINLINE __vec4_i64 __rotate_i64(__vec4_i64 v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_i64 __shift_i64(__vec4_i64 v, int delta) { + int64_t v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? 
__extract_element(v, d4) : 0; + return __vec4_i64(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_i64 __shuffle_i64(__vec4_i64 v, __vec4_i32 index) { return __vec4_i64(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -1523,6 +1579,20 @@ static FORCEINLINE __vec4_f __rotate_float(__vec4_f v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_f __shift_float(__vec4_f v, int delta) { + float v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0.f; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0.f; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0.f; + v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0.f; + return __vec4_f(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_f __shuffle_float(__vec4_f v, __vec4_i32 index) { return __vec4_f(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -1676,6 +1746,20 @@ static FORCEINLINE __vec4_d __rotate_double(__vec4_d v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_d __shift_double(__vec4_d v, int delta) { + double v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0; + return __vec4_d(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_d __shuffle_double(__vec4_d v, __vec4_i32 index) { return __vec4_d(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), From d2b89e0e3741a85b49f51ea7a7bbdb04ce61eb4e Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Wed, 23 Oct 2013 18:01:01 -0400 Subject: [PATCH 100/159] Tweak generic target. --- examples/intrinsics/sse4.h | 127 ++++++++++++++++++++----------------- opt.cpp | 7 +- 2 files changed, 73 insertions(+), 61 deletions(-) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d1178751..67c46848 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -108,22 +108,21 @@ struct __vec4_i64 { }; struct __vec4_i32 { - __vec4_i32() { } + FORCEINLINE __vec4_i32() : v(_mm_setzero_si128()) { } FORCEINLINE __vec4_i32(__m128i vv) : v(vv) { } - FORCEINLINE __vec4_i32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + FORCEINLINE __vec4_i32(int32_t a, int32_t b, int32_t c, int32_t d) { v = _mm_set_epi32(d, c, b, a); } - FORCEINLINE __vec4_i32(uint32_t *p) { + FORCEINLINE __vec4_i32(int32_t *p) { v = _mm_loadu_si128((__m128i *)p); } - + FORCEINLINE __vec4_i32(const __vec4_i32 &other) : v(other.v) {} + FORCEINLINE __vec4_i32& operator =(const __vec4_i32 &o) { v=o.v; return *this; } FORCEINLINE operator __m128() const { return _mm_castsi128_ps(v); } - + __m128i v; }; -static inline int32_t __extract_element(__vec4_i32 v, int index); - struct __vec4_i16 { __vec4_i16() { } FORCEINLINE __vec4_i16(__m128i vv) : v(vv) { } @@ -215,6 +214,64 @@ INSERT_EXTRACT(__vec1_i64, int64_t) INSERT_EXTRACT(__vec1_f, float) INSERT_EXTRACT(__vec1_d, double) +static FORCEINLINE bool __extract_element(const __vec4_i1 &v, int index) { + return ((int32_t *)&v)[index] ? 
true : false; +} + +static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) { + ((int32_t *)v)[index] = val ? -1 : 0; +} + +static FORCEINLINE int8_t __extract_element(const __vec4_i8 &v, int index) { + return ((int8_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) { + ((int8_t *)v)[index] = val; +} + +static FORCEINLINE int16_t __extract_element(const __vec4_i16 &v, int index) { + return ((int16_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val) { + ((int16_t *)v)[index] = val; +} + +static FORCEINLINE int32_t __extract_element(const __vec4_i32 &v, int index) { + return ((int32_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i32 *v, int index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +static FORCEINLINE int64_t __extract_element(const __vec4_i64 &v, int index) { + return ((int64_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i64 *v, int index, int64_t val) { + ((int64_t *)v)[index] = val; +} + +static FORCEINLINE float __extract_element(const __vec4_f &v, int index) { + return ((float *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_f *v, int index, float val) { + ((float *)v)[index] = val; +} + +static FORCEINLINE double __extract_element(const __vec4_d &v, int index) { + return ((double *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_d *v, int index, double val) { + ((double *)v)[index] = val; +} + + + #define CAST_BITS_SCALAR(TO, FROM) \ static FORCEINLINE TO __cast_bits(TO, FROM v) { \ union { \ @@ -313,13 +370,6 @@ static FORCEINLINE __vec4_i1 __select(__vec4_i1 mask, __vec4_i1 a, __vec4_i1 b) return _mm_blendv_ps(b.v, a.v, mask.v); } -static FORCEINLINE bool __extract_element(__vec4_i1 v, int index) { - return ((int32_t *)&v)[index] ? true : false; -} - -static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) { - ((int32_t *)v)[index] = val ? -1 : 0; -} template static FORCEINLINE __vec4_i1 __load(const __vec4_i1 *v) { // FIXME: handle align of 16... 
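// The __extract_element / __insert_element overloads above access lanes by
// reinterpreting the vector's in-memory layout (a __vec4_i32, for example,
// is read and written through an int32_t pointer); the target-independent
// __shift_* and __shuffle_* helpers rely on exactly this per-lane access.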
@@ -564,13 +614,6 @@ static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) _mm_extract_epi8(b.v, 3)); } -static FORCEINLINE int8_t __extract_element(__vec4_i8 v, int index) { - return ((int8_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) { - ((int8_t *)v)[index] = val; -} template __vec4_i8 __smear_i8(int8_t v); template <> FORCEINLINE __vec4_i8 __smear_i8<__vec4_i8>(int8_t v) { @@ -850,13 +893,6 @@ static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 _mm_extract_epi16(b.v, 3)); } -static FORCEINLINE int16_t __extract_element(__vec4_i16 v, int index) { - return ((int16_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val) { - ((int16_t *)v)[index] = val; -} template __vec4_i16 __smear_i16(int16_t v); template <> FORCEINLINE __vec4_i16 __smear_i16<__vec4_i16>(int16_t v) { @@ -1137,13 +1173,6 @@ template <> FORCEINLINE __vec4_i32 __undef_i32<__vec4_i32>() { return __vec4_i32(); } -static FORCEINLINE int32_t __extract_element(__vec4_i32 v, int index) { - return ((int32_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_i32 *v, int index, int32_t val) { - ((int32_t *)v)[index] = val; -} static FORCEINLINE __vec4_i32 __broadcast_i32(__vec4_i32 v, int index) { return _mm_set1_epi32(__extract_element(v, index)); @@ -1156,9 +1185,10 @@ static FORCEINLINE __vec4_i32 __rotate_i32(__vec4_i32 v, int delta) { __extract_element(v, (delta+3) & 0x3)); } -static FORCEINLINE __vec4_i32 __shift_i32(__vec4_i32 v, int delta) { +#include +static FORCEINLINE __vec4_i32 __shift_i32(const __vec4_i32 &v, int delta) { int32_t v1, v2, v3, v4; - int d1, d2, d3, d4; + int32_t d1, d2, d3, d4; d1 = delta+0; d2 = delta+1; d3 = delta+2; @@ -1425,13 +1455,6 @@ template <> FORCEINLINE __vec4_i64 __undef_i64<__vec4_i64>() { return __vec4_i64(); } -static FORCEINLINE int64_t __extract_element(__vec4_i64 v, int index) { - return ((int64_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_i64 *v, int index, int64_t val) { - ((int64_t *)v)[index] = val; -} static FORCEINLINE __vec4_i64 __broadcast_i64(__vec4_i64 v, int index) { uint64_t val = __extract_element(v, index); @@ -1560,13 +1583,6 @@ template <> FORCEINLINE __vec4_f __undef_float<__vec4_f>() { return __vec4_f(); } -static FORCEINLINE float __extract_element(__vec4_f v, int index) { - return ((float *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_f *v, int index, float val) { - ((float *)v)[index] = val; -} static FORCEINLINE __vec4_f __broadcast_float(__vec4_f v, int index) { return _mm_set1_ps(__extract_element(v, index)); @@ -1726,13 +1742,6 @@ template <> FORCEINLINE __vec4_d __undef_double<__vec4_d>() { return __vec4_d(); } -static FORCEINLINE double __extract_element(__vec4_d v, int index) { - return ((double *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_d *v, int index, double val) { - ((double *)v)[index] = val; -} static FORCEINLINE __vec4_d __broadcast_double(__vec4_d v, int index) { return __vec4_d(_mm_set1_pd(__extract_element(v, index)), @@ -1973,7 +1982,7 @@ static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i16 val) { (float)((int16_t)_mm_extract_epi16(val.v, 3))); } -static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i32 val) { +static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, const __vec4_i32 &val) { return _mm_cvtepi32_ps(val.v); } diff --git a/opt.cpp b/opt.cpp index b1a22a1c..b018d35d 100644 --- 
a/opt.cpp +++ b/opt.cpp @@ -536,8 +536,11 @@ Optimize(llvm::Module *module, int optLevel) { } optPM.add(llvm::createDeadInstEliminationPass(), 220); - optPM.add(llvm::createIPConstantPropagationPass()); - optPM.add(CreateReplaceStdlibShiftPass()); + if (g->target->getISA() != Target::GENERIC) { + // Just use the builtins for generic targets. + optPM.add(llvm::createIPConstantPropagationPass()); + optPM.add(CreateReplaceStdlibShiftPass()); + } // Max struct size threshold for scalar replacement is // 1) 4 fields (r,g,b,w) From 814ee67519771dc1f8b4002affc2426eb0e7e427 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 24 Oct 2013 11:51:33 +0400 Subject: [PATCH 101/159] patch and regression test for problem with vzeroupper --- llvm_patches/3_3_0001-Fix-PR16807.patch | 0 .../3_3_r193261_bug17631_win_vzeroupper.patch | 69 +++++++++++++++++++ tests/chkstk.ispc | 49 +++++++++++++ 3 files changed, 118 insertions(+) mode change 100755 => 100644 llvm_patches/3_3_0001-Fix-PR16807.patch create mode 100644 llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch create mode 100644 tests/chkstk.ispc diff --git a/llvm_patches/3_3_0001-Fix-PR16807.patch b/llvm_patches/3_3_0001-Fix-PR16807.patch old mode 100755 new mode 100644 diff --git a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch new file mode 100644 index 00000000..b6abb1d3 --- /dev/null +++ b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch @@ -0,0 +1,69 @@ +From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001 +From: Michael Liao +Date: Mon, 21 Oct 2013 17:47:58 -0700 +Subject: [PATCH] Fix PR17631 + +- Skip instructions added in prolog. For specific targets, prolog may + insert helper function calls (e.g. _chkstk will be called when + there're more than 4K bytes allocated on stack). However, these + helpers don't use/def YMM/XMM registers. +--- + lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++- + test/CodeGen/X86/pr17631.ll | 22 ++++++++++++++++++++++ + 2 files changed, 32 insertions(+), 1 deletion(-) + create mode 100644 test/CodeGen/X86/pr17631.ll + +diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp +index 477f75a..0d37a7d 100644 +--- lib/Target/X86/X86VZeroUpper.cpp ++++ lib/Target/X86/X86VZeroUpper.cpp +@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, + bool BBHasCall = false; + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { +- MachineInstr *MI = I; + DebugLoc dl = I->getDebugLoc(); ++ MachineInstr *MI = I; ++ ++ // Don't need to check instructions added in prolog. ++ // In prolog, special function calls may be added for specific targets ++ // (e.g. on Windows, a prolog helper '_chkstk' is called when the local ++ // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM ++ // registers. ++ if (MI->getFlag(MachineInstr::FrameSetup)) ++ continue; ++ + bool isControlFlow = MI->isCall() || MI->isReturn(); + + // Shortcut: don't need to check regular instructions in dirty state. 
+diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll +new file mode 100644 +index 0000000..a572ff2 +--- /dev/null ++++ test/CodeGen/X86/pr17631.ll +@@ -0,0 +1,22 @@ ++; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s ++ ++%struct_type = type { [64 x <8 x float>], <8 x float> } ++ ++; Function Attrs: nounwind readnone ++declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) ++ ++; Function Attrs: nounwind ++define i32 @equal(<8 x i32> %A) { ++allocas: ++ %first_alloc = alloca [64 x <8 x i32>] ++ %second_alloc = alloca %struct_type ++ ++ %A1 = bitcast <8 x i32> %A to <8 x float> ++ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1) ++ ret i32 %A2 ++} ++ ++; CHECK: equal ++; CHECK-NOT: vzeroupper ++; CHECK: _chkstk ++; CHECK: ret +-- +1.8.1.2 + diff --git a/tests/chkstk.ispc b/tests/chkstk.ispc new file mode 100644 index 00000000..bd0a8299 --- /dev/null +++ b/tests/chkstk.ispc @@ -0,0 +1,49 @@ +//test for 17631 bug in LLVM. + +export uniform int width() { return programCount; } + +struct s_temp +{ + float temp[64]; +}; + +int CompressBlockBC7(int A, uniform float b) +{ + // This declaration caused problem because LLVM inserted + // _chkstk after declaration and vzeroupper before it's call. + // A will be in ymm at avx, so we lose a half of it. + s_temp _state; + // These two loops are here to prevent elimination of declaration + for (int i=0; i<64; i++) { + float ii = i; + _state.temp[i] = b + sin(ii); + } + float r = 0; + for (int j=0; j<64; j+=9) { + r += _state.temp[j] + j; + } + + // Here upper bits of A in ymm can be zeros. This will crash the test. + int B; + if (A!=0) { + B = 20; + } + else { + B = 30; + } + if(A == 1) { + B = r; + } + return B; +} + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int A = programIndex; + RET[programIndex] = CompressBlockBC7(A, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 20; + RET[0] = 30; + RET[1] = 292; +} From 9b5ee1b31bea40c3d94097fd352a7c50a91b9487 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 23 Oct 2013 18:42:49 +0400 Subject: [PATCH 102/159] fail_db update on Linux --- fail_db.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fail_db.txt b/fail_db.txt index 9c43c7f0..367cdf18 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -906,13 +906,8 @@ ./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * From 58aea1b61c27b1305318b55ca899895a8da699a8 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 
25 Oct 2013 21:42:57 +0400 Subject: [PATCH 103/159] Fail_db update with Linux passed with LLVM 3.4 --- fail_db.txt | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/fail_db.txt b/fail_db.txt index 367cdf18..b3163869 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -914,45 +914,11 @@ ./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 * 
-./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 *

From 621679245a4567a385fdeeed9af944ed45d691c0 Mon Sep 17 00:00:00 2001
From: Ilia Filippov
Date: Fri, 25 Oct 2013 12:49:06 +0400
Subject: [PATCH 104/159] fixing problem 644

---
 expr.cpp | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index c92503e0..222c89a1 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -7065,9 +7065,22 @@ TypeCastExpr::GetLValue(FunctionEmitContext *ctx) const {
 const Type *
 TypeCastExpr::GetType() const {
-    // We have to switch off this assert after supporting of operators.
-    //AssertPos(pos, type->HasUnboundVariability() == false);
-    return type;
+    // Here we try to resolve the situation where (base_type) can be treated
+    // as (uniform base_type) or (varying base_type). This is a part of the
+    // function TypeCastExpr::TypeCheck; after the implementation of
+    // operators we have to have this functionality here as well.
+    const Type *toType = type, *fromType = expr->GetType();
+    if (toType == NULL || fromType == NULL)
+        return NULL;
+    if (toType->HasUnboundVariability()) {
+        if (fromType->IsUniformType()) {
+            toType = type->ResolveUnboundVariability(Variability::Uniform);
+        } else {
+            toType = type->ResolveUnboundVariability(Variability::Varying);
+        }
+    }
+    AssertPos(pos, toType->HasUnboundVariability() == false);
+    return toType;
 }

From a508bd4290a5dc8073602bda88a7953ab6ef456b Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Sat, 26 Oct 2013 14:50:45 +0400
Subject: [PATCH 105/159] MacOS fails update

---
 fail_db.txt | 35 -----------------------------------
 1 file changed, 35 deletions(-)

diff --git a/fail_db.txt b/fail_db.txt
index b3163869..bfa14dad 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -944,46 +944,11 @@
 ./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.4
clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 clang++3.3 -O2 * ./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/double-3.ispc runfail x86 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * -./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * .\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * From 103ef25f12bfd736a1ca84f71358059991354c6d Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sun, 27 Oct 2013 23:01:20 +0400 Subject: [PATCH 106/159] Docs fix in memory management section --- docs/ispc.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index eac9b24e..84063694 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -2344,8 +2344,11 @@ based on C++'s ``new`` and ``delete`` operators: In the above code, each program instance allocates its own ``count`` sized array of ``uniform int`` values, uses that memory, and then deallocates that memory. 
Uses of ``new`` and ``delete`` in ``ispc`` programs are -serviced by corresponding calls the system C library's ``malloc()`` and -``free()`` functions. +implemented as calls to C library's aligned memory allocation routines, +which are platform dependent (``posix_memalign()`` and ``free()`` on Linux +and Mac and ``_aligned_malloc()`` and ``_aligned_free()`` on Windows). So it's +advised to pair ISPC's ``new`` and ``delete`` with each other, but not with +C/C++ memory management functions. Note that the rules for ``uniform`` and ``varying`` for ``new`` are analogous to the corresponding rules for pointers (as described in From 43829028940316cf9869492d7d0f712aec878338 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 28 Oct 2013 12:31:24 +0400 Subject: [PATCH 107/159] Fail_db update on Windows: 3.3 update and adding 3.4 --- fail_db.txt | 120 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 28 deletions(-) diff --git a/fail_db.txt b/fail_db.txt index bfa14dad..2e08a6ae 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -644,11 +644,6 @@ .\tests\reduce-add-uint64.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * .\tests\reduce-max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * .\tests\atomics-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-10.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-11.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-5.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-6.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * .\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\funcptr-null-4.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * @@ -667,11 +662,6 @@ .\tests\reduce-max-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\reduce-min-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\atomics-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-10.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-11.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-5.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-6.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * .\tests\max-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * .\tests\max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * @@ -715,8 +705,6 @@ .\tests\uint64-max.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\uint64-min-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\uint64-min.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * -.\tests\avg-down-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * -.\tests\avg-up-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-10.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-11.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-12.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * @@ -765,35 +753,21 @@ .\tests\uint64-max.ispc runfail x86 avx1.1-i32x16 Windows LLVM 
3.3 cl -O2 * .\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\uint64-min.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * -.\tests\avg-down-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * -.\tests\avg-up-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-10.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-11.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-12.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-9.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\atomics-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * .\tests\funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\atomics-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * -.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-10.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * .\tests\switch-11.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * .\tests\switch-12.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * .\tests\switch-8.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * .\tests\switch-9.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * -.\tests\avg-down-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * -.\tests\avg-up-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-10.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-11.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-12.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * @@ -804,8 +778,6 @@ .\tests\switch-12.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * .\tests\switch-8.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * .\tests\switch-9.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * -.\tests\avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * -.\tests\avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-10.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-11.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * @@ -986,3 +958,95 @@ .\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.4 cl -O2 * .\tests\reduce-min-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.4 cl -O2 * 
.\tests\reduce-min-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * 
+.\tests\packed-load-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\test-141.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\test-141.ispc runfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 
cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * From 63a3214cc6c7fe7fc051254b7882f12f2231f314 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 28 Oct 2013 12:45:39 +0400 Subject: [PATCH 108/159] Removing fails with g++4.4/g++4.7, as we are using clang by default now --- fail_db.txt | 598 +--------------------------------------------------- 1 file changed, 5 insertions(+), 593 deletions(-) diff --git a/fail_db.txt b/fail_db.txt index 2e08a6ae..da77cac3 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -1,600 +1,12 @@ % List of known fails. % The list is unordered and contains information about commonly used platforms / configurations. % Our goas is to maintain this list for Linux, MacOS and Windows with reasonably new compilers. -% Note, that it's important which C++ compiler was used. For example, gcc 4.4 is know to produce -% considerably more fails with generic targets, than gcc 4.7 or later. -% Using old compilers (gcc 4.4 is considered to be relatively old) may cause LLVM bugs. -% To avoid them you can use LLVM selfbuild. +% Note, that it's important which C++ compiler was used. The currently supported C++ compilers are +% clang 3.3 on Linux and MacOS and cl (VS2010) on Windows. +% Please also note that it's very important to have correctly built LLVM. There are a number of +% LLVM bugs in released versions, that we have to workaround by applying patches (see llvm_patches +% folder). The recommended way to build LLVM on Unix is to use "alloy.py". % -./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/exclusive-scan-add-8.ispc 
runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-1.ispc 
runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/soa-27.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/test-141.ispc runfail x86-64 generic-16 
Linux LLVM 3.3 g++4.4 -O2 * -./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * -./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-gather-multi-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-mixed-unif-vary-indexing-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-mixed-unif-vary-indexing-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-mixed-unif-vary-indexing.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-scatter-unif-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-scatter-vary.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/cfor-array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/cfor-array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/cfor-array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/cfor-array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/cfor-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/cfor-unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * 
-./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/gather-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/gather-to-vload-neg-offset.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/global-array-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/masked-scatter-vector.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/nested-structs-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * 
-./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/pass-varying-lvalue-to-ref.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/scatter-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/scatter-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/scatter-mask-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/scatter-mask-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/short-vec-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux 
LLVM 3.4 g++4.4 -O2 * -./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/varying-struct-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/varying-struct-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/varying-struct-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/write-same-loc.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * 
-./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * -./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * -./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * -./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 
Linux LLVM 3.3 g++4.7 -O2 * -./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/rotate.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/atomics-varyingptr-4.ispc 
runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * 
-./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * -./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-5.ispc runfail x86 
sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/atomics-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-5.ispc 
runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * -./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 
* -./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * -./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * -./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * -./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * .\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * 
.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * .\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * From 1e80b3b0d7d5a1e66cd6a28d3ad379b0624a009d Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Mon, 28 Oct 2013 12:20:32 -0400 Subject: [PATCH 109/159] Add shift support for generic-16 target. --- examples/intrinsics/generic-16.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index d81101f7..fa794276 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -311,6 +311,17 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ return ret; \ } \ +#define SHIFT(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int modIndex = i+index; \ + STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? v.v[modIndex] : 0; \ + ret.v[i] = val; \ + } \ + return ret; \ +} \ + #define SHUFFLES(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ VTYPE ret; \ @@ -492,6 +503,7 @@ SETZERO(__vec16_i8, i8) UNDEF(__vec16_i8, i8) BROADCAST(__vec16_i8, i8, int8_t) ROTATE(__vec16_i8, i8, int8_t) +SHIFT(__vec16_i8, i8, int8_t) SHUFFLES(__vec16_i8, i8, int8_t) LOAD_STORE(__vec16_i8, int8_t) @@ -537,6 +549,7 @@ SETZERO(__vec16_i16, i16) UNDEF(__vec16_i16, i16) BROADCAST(__vec16_i16, i16, int16_t) ROTATE(__vec16_i16, i16, int16_t) +SHIFT(__vec16_i16, i16, int16_t) SHUFFLES(__vec16_i16, i16, int16_t) LOAD_STORE(__vec16_i16, int16_t) @@ -582,6 +595,7 @@ SETZERO(__vec16_i32, i32) UNDEF(__vec16_i32, i32) BROADCAST(__vec16_i32, i32, int32_t) ROTATE(__vec16_i32, i32, int32_t) +SHIFT(__vec16_i32, i32, int32_t) SHUFFLES(__vec16_i32, i32, int32_t) LOAD_STORE(__vec16_i32, int32_t) @@ -627,6 +641,7 @@ SETZERO(__vec16_i64, i64) UNDEF(__vec16_i64, i64) BROADCAST(__vec16_i64, i64, int64_t) ROTATE(__vec16_i64, i64, int64_t) +SHIFT(__vec16_i64, i64, int64_t) SHUFFLES(__vec16_i64, i64, int64_t) LOAD_STORE(__vec16_i64, int64_t) @@ -672,6 +687,7 @@ SETZERO(__vec16_f, float) UNDEF(__vec16_f, float) BROADCAST(__vec16_f, float, float) ROTATE(__vec16_f, float, float) +SHIFT(__vec16_f, float, float) SHUFFLES(__vec16_f, float, float) LOAD_STORE(__vec16_f, float) @@ -832,6 +848,7 @@ SETZERO(__vec16_d, double) UNDEF(__vec16_d, double) BROADCAST(__vec16_d, double, double) ROTATE(__vec16_d, double, double) +SHIFT(__vec16_d, double, double) SHUFFLES(__vec16_d, double, double) LOAD_STORE(__vec16_d, double) From 641d882ea6ed42ab19c5afaeb0ca9fcc97d616ed Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Mon, 28 Oct 2013 12:43:42 -0400 Subject: [PATCH 110/159] Add shift support for knc targets. This is not optimized. --- examples/intrinsics/knc-i1x16.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 78d35ddc..0ede6006 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -451,6 +451,17 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ return ret; \ } \ +#define SHIFT(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int modIndex = i+index; \ + STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? 
v.v[modIndex] : 0; \ + ret.v[i] = val; \ + } \ + return ret; \ +} \ + /* knc::macro::used */ #define SHUFFLES(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ @@ -566,6 +577,7 @@ SETZERO(__vec16_i8, i8) UNDEF(__vec16_i8, i8) BROADCAST(__vec16_i8, i8, int8_t) ROTATE(__vec16_i8, i8, int8_t) +SHIFT(__vec16_i8, i8, int8_t) SHUFFLES(__vec16_i8, i8, int8_t) LOAD_STORE(__vec16_i8, int8_t) @@ -612,6 +624,7 @@ SETZERO(__vec16_i16, i16) UNDEF(__vec16_i16, i16) BROADCAST(__vec16_i16, i16, int16_t) ROTATE(__vec16_i16, i16, int16_t) +SHIFT(__vec16_i16, i16, int16_t) SHUFFLES(__vec16_i16, i16, int16_t) LOAD_STORE(__vec16_i16, int16_t) @@ -688,6 +701,8 @@ static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v); } +SHIFT(__vec16_i32, i32, int32_t) + static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v); @@ -942,6 +957,8 @@ static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int inde const __vec16_i32 ret_lo = __rotate_i32(v_lo, index); return CASTI2L(ret_hi, ret_lo); } +SHIFT(__vec16_i64, i64, int64_t) + static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index) { CASTL2I(_v, v_hi, v_lo); @@ -1063,6 +1080,7 @@ static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index) const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v)); } +SHIFT(__vec16_f, float, float) static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); @@ -1333,6 +1351,7 @@ static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index const __vec16_f ret_lo = __rotate_float(v_lo, index); return CASTF2D(ret_hi, ret_lo); } +SHIFT(__vec16_d, double, double) static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index) { CASTD2F(_v, v_hi, v_lo); From 02681d531eb4871db2b732079651aed6360d85c6 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Mon, 28 Oct 2013 12:56:43 -0400 Subject: [PATCH 111/159] Minor tweak for interface. --- examples/intrinsics/knc-i1x16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 0ede6006..376e66bc 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -456,8 +456,8 @@ static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) { \ VTYPE ret; \ for (int i = 0; i < 16; ++i) { \ int modIndex = i+index; \ - STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? v.v[modIndex] : 0; \ - ret.v[i] = val; \ + STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? 
v[modIndex] : 0; \ + ret[i] = val; \ } \ return ret; \ } \ From a166eb7ea12ed9a6ea4a13a7354d2906fb5de6e3 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 28 Oct 2013 22:19:09 +0400 Subject: [PATCH 112/159] Check AVX OS support in host cpu check code --- ispc.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/ispc.cpp b/ispc.cpp index db4c161a..419c64ab 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -102,6 +102,22 @@ static void __cpuidex(int info[4], int level, int count) { } #endif // !ISPC_IS_WINDOWS && !__ARM__ +#if !defined(__arm__) +static bool __os_has_avx_support() { +#if defined(ISPC_IS_WINDOWS) + // Check if the OS will save the YMM registers + unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + return (xcrFeatureMask & 6) == 6; +#else // defined(ISPC_IS_WINDOWS) + // Check xgetbv; this uses a .byte sequence instead of the instruction + // directly because older assemblers do not include support for xgetbv and + // there is no easy way to conditionally compile based on the assembler used. + int rEAX, rEDX; + __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); + return (rEAX & 6) == 6; +#endif // !defined(ISPC_IS_WINDOWS) +} +#endif // !__arm__ static const char * lGetSystemISA() { @@ -111,7 +127,8 @@ lGetSystemISA() { int info[4]; __cpuid(info, 1); - if ((info[2] & (1 << 28)) != 0) { // AVX + if ((info[2] & (1 << 28)) != 0 && + __os_has_avx_support()) { // AVX // AVX1 for sure.... // Ivy Bridge? if ((info[2] & (1 << 29)) != 0 && // F16C From 1f0f852fda1209aabf8f773c0ec6d8266f236296 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 28 Oct 2013 22:54:14 +0400 Subject: [PATCH 113/159] Standalone checker for detecting the best ISA supported on the host --- check_isa.cpp | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 check_isa.cpp diff --git a/check_isa.cpp b/check_isa.cpp new file mode 100644 index 00000000..3f8b487d --- /dev/null +++ b/check_isa.cpp @@ -0,0 +1,129 @@ +/* + Copyright (c) 2013, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/////////////////////////////////////////////////////////////////////////////// +// // +// This file is a standalone program, which detects the best supported ISA. // +// // +/////////////////////////////////////////////////////////////////////////////// + + + +#include <stdio.h> + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#include <intrin.h> +#endif + +#if !defined (__arm__) +#if !defined(ISPC_IS_WINDOWS) +static void __cpuid(int info[4], int infoType) { + __asm__ __volatile__ ("cpuid" + : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3]) + : "0" (infoType)); +} + +/* Save %ebx in case it's the PIC register */ +static void __cpuidex(int info[4], int level, int count) { + __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3]) + : "0" (level), "2" (count)); +} +#endif // !ISPC_IS_WINDOWS + +static bool __os_has_avx_support() { +#if defined(ISPC_IS_WINDOWS) + // Check if the OS will save the YMM registers + unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + return (xcrFeatureMask & 6) == 6; +#else // defined(ISPC_IS_WINDOWS) + // Check xgetbv; this uses a .byte sequence instead of the instruction + // directly because older assemblers do not include support for xgetbv and + // there is no easy way to conditionally compile based on the assembler used. + int rEAX, rEDX; + __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); + return (rEAX & 6) == 6; +#endif // !defined(ISPC_IS_WINDOWS) +} +#endif // !__arm__ + + +static const char * +lGetSystemISA() { +#ifdef __arm__ + return "ARM NEON"; +#else + int info[4]; + __cpuid(info, 1); + + if ((info[2] & (1 << 28)) != 0 && + __os_has_avx_support()) { // AVX + // AVX1 for sure.... + // Ivy Bridge? + if ((info[2] & (1 << 29)) != 0 && // F16C + (info[2] & (1 << 30)) != 0) { // RDRAND + // So far, so good. AVX2?
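// Editorial aside, not part of the patch: the bit tests above decode
// CPUID.1:ECX, where bit 28 indicates AVX, bit 29 F16C, and bit 30 RDRAND;
// the query just below reads CPUID.7:EBX, whose bit 5 indicates AVX2.
// The __os_has_avx_support() call matters because AVX needs OS support in
// addition to CPU support: xgetbv with ECX=0 returns XCR0, in which bit 1
// means the OS saves XMM state and bit 2 means it saves YMM state, so
// (value & 6) == 6 confirms both are enabled.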
+ // Call cpuid with eax=7, ecx=0 + int info2[4]; + __cpuidex(info2, 7, 0); + if ((info2[1] & (1 << 5)) != 0) { + return "AVX2 (codename Haswell)"; + } + else { + return "AVX1.1 (codename Ivy Bridge)"; + } + } + // Regular AVX + return "AVX (codename Sandy Bridge)"; + } + else if ((info[2] & (1 << 19)) != 0) { + return "SSE4"; + } + else if ((info[3] & (1 << 26)) != 0) { + return "SSE2"; + } + else { + return "Error"; + } +#endif +} + +int main () { + const char* isa = lGetSystemISA(); + printf("ISA: %s\n", isa); + + return 0; +} From 9ba7b96825ec29e18372df72dadcc988ea5c0a52 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Mon, 28 Oct 2013 16:14:31 -0400 Subject: [PATCH 114/159] Make the new optimization play nicely with the others. --- opt.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/opt.cpp b/opt.cpp index b018d35d..59f00538 100644 --- a/opt.cpp +++ b/opt.cpp @@ -536,12 +536,6 @@ Optimize(llvm::Module *module, int optLevel) { } optPM.add(llvm::createDeadInstEliminationPass(), 220); - if (g->target->getISA() != Target::GENERIC) { - // Just use the builtins for generic targets. - optPM.add(llvm::createIPConstantPropagationPass()); - optPM.add(CreateReplaceStdlibShiftPass()); - } - // Max struct size threshold for scalar replacement is // 1) 4 fields (r,g,b,w) // 2) field size: vectorWidth * sizeof(float) @@ -556,7 +550,12 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createGlobalOptimizerPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createIPConstantPropagationPass()); - optPM.add(llvm::createDeadArgEliminationPass()); + if (g->target->getISA() != Target::GENERIC) { + // Just use the builtins for generic targets.
- - optPM.add(CreateReplaceStdlibShiftPass(),229); - // } + optPM.add(CreateReplaceStdlibShiftPass(),229); optPM.add(llvm::createDeadArgEliminationPass(),230); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); From 8ee317816607b6abef892fbc6a04ccf886518b99 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Mon, 28 Oct 2013 16:51:02 -0400 Subject: [PATCH 117/159] Add Performance Warning --- opt.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opt.cpp b/opt.cpp index 77fb9f21..bb788a8e 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4998,6 +4998,8 @@ ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) { shuffleIdxs, "vecShift", ci); ci->replaceAllUsesWith(shuffle); modifiedAny = true; + } else { + PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount."); } } } From e682b19edacb2713453d229961f9e33ed6b82bd1 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Mon, 28 Oct 2013 17:13:07 -0400 Subject: [PATCH 118/159] Remove zero initialization for __vec4_i32 --- examples/intrinsics/sse4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 67c46848..919716be 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -108,7 +108,7 @@ struct __vec4_i64 { }; struct __vec4_i32 { - FORCEINLINE __vec4_i32() : v(_mm_setzero_si128()) { } + FORCEINLINE __vec4_i32() { } FORCEINLINE __vec4_i32(__m128i vv) : v(vv) { } FORCEINLINE __vec4_i32(int32_t a, int32_t b, int32_t c, int32_t d) { v = _mm_set_epi32(d, c, b, a); From 362ee06b9f14c4ebb8672ba5d11dc3d5f7c6fac5 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 29 Oct 2013 01:35:26 +0400 Subject: [PATCH 119/159] Typo fix --- check_isa.cpp | 2 +- ispc.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/check_isa.cpp b/check_isa.cpp index 3f8b487d..a4d10606 100644 --- a/check_isa.cpp +++ b/check_isa.cpp @@ -69,7 +69,7 @@ static bool __os_has_avx_support() { // Check if the OS will save the YMM registers unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); return (xcrFeatureMask & 6) == 6; -#else // defined(ISPC_IS_WINDOWS) +#else // !defined(ISPC_IS_WINDOWS) // Check xgetbv; this uses a .byte sequence instead of the instruction // directly because older assemblers do not include support for xgetbv and // there is no easy way to conditionally compile based on the assembler used. diff --git a/ispc.cpp b/ispc.cpp index 419c64ab..859865a5 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -108,7 +108,7 @@ static bool __os_has_avx_support() { // Check if the OS will save the YMM registers unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); return (xcrFeatureMask & 6) == 6; -#else // defined(ISPC_IS_WINDOWS) +#else // !defined(ISPC_IS_WINDOWS) // Check xgetbv; this uses a .byte sequence instead of the instruction // directly because older assemblers do not include support for xgetbv and // there is no easy way to conditionally compile based on the assembler used. From 85eb4cf0d683868190fe68ad693ab5de422aace8 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 29 Oct 2013 14:02:32 -0400 Subject: [PATCH 120/159] Fix logic that looks for shift builtins. 
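(Editorial note on the shift() patches above and below: ispc's stdlib shift(v, k) produces a vector whose lane i holds lane i+k of the input, with zero for lanes that fall outside the vector, and ReplaceStdlibShiftPass turns calls with a compile-time-constant k into a single vector shuffle. What follows is a minimal standalone C++ sketch of that index mapping, assuming the out-of-range lanes are drawn from a zero vector appended after the input; the names are hypothetical, and the real pass instead builds the equivalent llvm::ShuffleVectorInst.)

#include <cstdio>
#include <vector>

// Shuffle indices for shift(v, k) over a width-lane vector: result lane i
// reads input lane i+k, or the zero lane (index == width) when i+k falls
// outside [0, width).
static std::vector<int> shiftIndices(int width, int k) {
    std::vector<int> idx(width);
    for (int i = 0; i < width; ++i) {
        int s = i + k;
        idx[i] = (s < 0 || s >= width) ? width : s;
    }
    return idx;
}

int main() {
    const int width = 8;
    // Lanes 0..7 hold 10..17; the extra element stands in for the zero vector.
    int v[width + 1] = { 10, 11, 12, 13, 14, 15, 16, 17, 0 };
    std::vector<int> idx = shiftIndices(width, 3);
    for (int i = 0; i < width; ++i)
        printf("%d ", v[idx[i]]);   // prints: 13 14 15 16 17 0 0 0
    printf("\n");
    return 0;
}

(When the shift amount is not a compile-time constant, the pass leaves the call in place, which is exactly the case the PerformanceWarning added in PATCH 117 reports.)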
---
 opt.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/opt.cpp b/opt.cpp
index bb788a8e..0cb14475 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -4930,8 +4930,6 @@ CreatePeepholePass() {
     return new PeepholePass;
 }

-#include 
-
 /** Given an llvm::Value known to be an integer, return its value as
     an int64_t.
 */
@@ -4978,7 +4976,7 @@ ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
             llvm::Function *func = ci->getCalledFunction();
             for (int i = 0; i < 6; i++) {
-                if (shifts[i] == func) {
+                if (shifts[i] && (shifts[i] == func)) {
                     // we matched a call
                     llvm::Value *shiftedVec = ci->getArgOperand(0);
                     llvm::Value *shiftAmt = ci->getArgOperand(1);

From 9ce6fbe1fa7f7f056421a0a1b7ca1fd3b102b19e Mon Sep 17 00:00:00 2001
From: "james.brodman"
Date: Wed, 30 Oct 2013 17:07:26 -0400
Subject: [PATCH 121/159] Support using pointer arithmetic as lvalue

---
 expr.cpp | 16 ++++++++++++++--
 expr.h   |  1 +
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index 222c89a1..1cbebad5 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2798,6 +2798,17 @@ BinaryExpr::TypeCheck() {
     }
 }

+const Type *
+BinaryExpr::GetLValueType() const {
+    const Type *t = GetType();
+    if (CastType<PointerType>(t) != NULL) {
+        // Are we doing something like (basePtr + offset)[...] = ...
+        return t;
+    }
+    else {
+        return NULL;
+    }
+}

 int
 BinaryExpr::EstimateCost() const {
@@ -4266,8 +4277,9 @@ IndexExpr::GetValue(FunctionEmitContext *ctx) const {
     }
     else {
         Symbol *baseSym = GetBaseSymbol();
-        if (dynamic_cast<FunctionCallExpr *>(baseExpr) == NULL) {
-            // Only check for non-function calls
+        if (dynamic_cast<FunctionCallExpr *>(baseExpr) == NULL &&
+            dynamic_cast<BinaryExpr *>(baseExpr) == NULL) {
+            // Don't check if we're doing a function call or pointer arith
             AssertPos(pos, baseSym != NULL);
         }
         mask = lMaskForSymbol(baseSym, ctx);
diff --git a/expr.h b/expr.h
index f8b96abd..45780414 100644
--- a/expr.h
+++ b/expr.h
@@ -155,6 +155,7 @@ public:

     llvm::Value *GetValue(FunctionEmitContext *ctx) const;
     const Type *GetType() const;
+    const Type *GetLValueType() const;
     void Print() const;
     Expr *Optimize();

From ec170828641a36fb83c04fe39f2ffda2fd227645 Mon Sep 17 00:00:00 2001
From: "james.brodman"
Date: Wed, 30 Oct 2013 17:21:10 -0400
Subject: [PATCH 122/159] Add unittest.

---
 tests/ptr-arith-indexing.ispc | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 tests/ptr-arith-indexing.ispc

diff --git a/tests/ptr-arith-indexing.ispc b/tests/ptr-arith-indexing.ispc
new file mode 100644
index 00000000..9f62a2c9
--- /dev/null
+++ b/tests/ptr-arith-indexing.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+int foo(uniform float * uniform base, uniform int uOfs, varying int vOfs) {
+    return (base+uOfs)[vOfs];
+}
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float * uniform ptr = &aFOO[0];
+    int val = foo(ptr, programCount, programIndex);
+    RET[programIndex] = val;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1+programCount+programIndex;
+}

From 0f7050d3aaedef5d328ce8dcc71417aac4573cbb Mon Sep 17 00:00:00 2001
From: "james.brodman"
Date: Thu, 31 Oct 2013 19:51:13 -0400
Subject: [PATCH 123/159] More standards-compliant. VS doesn't like
 non-constant-length local arrays.
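
MSVC has never supported C99-style variable-length arrays, which GCC and
Clang quietly accept as an extension in C++, so a declaration like
'int shuffleVals[vectorWidth]' with a runtime bound fails to compile there.
A minimal sketch of the portable pattern used below, with a hypothetical
function name (not from the tree):

    #include <vector>

    static void lFillShuffleIndices(int vectorWidth, int shiftInt) {
        // int shuffleVals[vectorWidth];          // VLA: rejected by MSVC
        int *shuffleVals = new int[vectorWidth];  // portable heap allocation
        for (int i = 0; i < vectorWidth; i++)
            shuffleVals[i] = i + shiftInt;        // compute each shuffle index
        // ... build the shufflevector mask from shuffleVals ...
        delete[] shuffleVals;                     // must pair with new[]

        // std::vector gives the same runtime-sized storage without the
        // manual delete[], and stays exception-safe:
        std::vector<int> vals(vectorWidth);
    }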
--- opt.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opt.cpp b/opt.cpp index 0cb14475..ce84744a 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4982,7 +4982,7 @@ ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::Value *shiftAmt = ci->getArgOperand(1); if (llvm::isa(shiftAmt)) { int vectorWidth = g->target->getVectorWidth(); - int shuffleVals[vectorWidth]; + int * shuffleVals = new int[vectorWidth]; int shiftInt = lGetIntValue(shiftAmt); for (int i = 0; i < vectorWidth; i++) { int s = i + shiftInt; @@ -4996,6 +4996,7 @@ ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) { shuffleIdxs, "vecShift", ci); ci->replaceAllUsesWith(shuffle); modifiedAny = true; + delete [] shuffleVals; } else { PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount."); } From a910bfb539ae30502d4ee6722f4646a54295a5d4 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 31 Oct 2013 13:25:15 +0400 Subject: [PATCH 124/159] Windows support --- .gitignore | 1 + alloy.py | 308 ++++++++++-------- examples/aobench/aobench.vcxproj | 9 +- .../aobench_instrumented.vcxproj | 11 +- examples/deferred/deferred_shading.vcxproj | 9 +- examples/examples.sln | 10 + examples/mandelbrot/mandelbrot.vcxproj | 9 +- .../mandelbrot_tasks/mandelbrot_tasks.vcxproj | 9 +- examples/noise/noise.vcxproj | 9 +- examples/options/options.vcxproj | 9 +- examples/perfbench/perfbench.vcxproj | 9 +- examples/rt/rt.vcxproj | 9 +- examples/simple/simple.vcxproj | 9 +- examples/sort/sort.cpp | 6 +- examples/sort/sort.vcxproj | 177 ++++++++++ examples/stencil/stencil.vcxproj | 9 +- examples/volume_rendering/volume.vcxproj | 9 +- perf.py | 34 +- 18 files changed, 437 insertions(+), 209 deletions(-) create mode 100644 examples/sort/sort.vcxproj diff --git a/.gitignore b/.gitignore index 429199bb..7cdc4a4e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,5 +18,6 @@ examples/*/objs/* examples/*/ref examples/*/test *.swp +check_isa.exe diff --git a/alloy.py b/alloy.py index 0aaf3d8d..51aec82b 100755 --- a/alloy.py +++ b/alloy.py @@ -65,7 +65,12 @@ def try_do_LLVM(text, command, from_validation): if from_validation == True: text = text + "\n" print_debug("Trying to " + text, from_validation, alloy_build) - if os.system(command + " >> " + alloy_build + " 2>> " + alloy_build) != 0: + postfix = "" + if current_OS == "Windows": + postfix = " 1>> " + alloy_build + " 2>&1" + else: + postfix = " >> " + alloy_build + " 2>> " + alloy_build + if os.system(command + postfix) != 0: print_debug("ERROR.\n", from_validation, alloy_build) error("can't " + text, 1) print_debug("DONE.\n", from_validation, alloy_build) @@ -144,10 +149,13 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, "tar -xvzf " + tar[1] + " --strip-components 1", from_validation) os.chdir("../../") # paching llvm - patches = glob.glob(os.environ["ISPC_HOME"] + "/llvm_patches/*.*") + patches = glob.glob(os.environ["ISPC_HOME"] + os.sep + "llvm_patches" + os.sep + "*.*") for patch in patches: if version_LLVM in os.path.basename(patch): - try_do_LLVM("patch LLVM with patch" + patch + " ", "patch -p0 < " + patch, from_validation) + if current_OS != "Windows": + try_do_LLVM("patch LLVM with patch " + patch + " ", "patch -p0 < " + patch, from_validation) + else: + try_do_LLVM("patch LLVM with patch " + patch + " ", "C:\\gnuwin32\\bin\\patch.exe -p0 < " + patch, from_validation) os.chdir("../") # configuring llvm, build first part of selfbuild os.makedirs(LLVM_BUILD) @@ -173,77 +181,65 @@ def 
build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, print_debug("Now we have compiler for selfbuild: " + selfbuild_compiler + "\n", from_validation, alloy_build) os.chdir(LLVM_BUILD) if debug == False: - try_do_LLVM("configure release version ", + if current_OS != "Windows": + try_do_LLVM("configure release version ", "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN + " --enable-optimized" + selfbuild_compiler, from_validation) + else: + try_do_LLVM("configure release version ", + 'cmake -G "Visual Studio 10" -DCMAKE_INSTALL_PREFIX="..\\'+ LLVM_BIN + + '" -DLLVM_LIT_TOOLS_DIR="C:\\gnuwin32\\bin" ..\\' + LLVM_SRC, + from_validation) else: try_do_LLVM("configure debug version ", "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN + " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler, from_validation) # building llvm - try_do_LLVM("build LLVM ", make, from_validation) - try_do_LLVM("install LLVM ", "make install", from_validation) + if current_OS != "Windows": + try_do_LLVM("build LLVM ", make, from_validation) + try_do_LLVM("install LLVM ", "make install", from_validation) + else: + try_do_LLVM("build LLVM and than install LLVM ", "msbuild INSTALL.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild", from_validation) os.chdir(current_path) def check_targets(): answer = [] answer_sde = [] - SSE2 = False; - SSE4 = False; - AVX = False; - AVX11 = False; - AVX2 = False; - if current_OS == "Linux": - cpu = open("/proc/cpuinfo") - f_lines = cpu.readlines() - cpu.close() - # check what native targets do we have - for i in range(0,len(f_lines)): - if SSE2 == False and "sse2" in f_lines[i]: - SSE2 = True; - answer = answer + ["sse2-i32x4", "sse2-i32x8"] - if SSE4 == False and "sse4_1" in f_lines[i]: - SSE4 = True; - answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] - if AVX == False and "avx" in f_lines[i]: - AVX = True; - answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] - if AVX11 == False and "rdrand" in f_lines[i]: - AVX11 = True; - answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16", "avx1.1-i64x4"] - if AVX2 == False and "avx2" in f_lines[i]: - AVX2 = True; - answer = answer + ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] - if current_OS == "MacOS": - f_lines = take_lines("sysctl machdep.cpu.features", "first") - if "SSE2" in f_lines: - SSE2 = True; - answer = answer + ["sse2-i32x4", "sse2-i32x8"] - if "SSE4.1" in f_lines: - SSE4 = True; - answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] - if "AVX1.0" in f_lines: - AVX = True; - answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] - if "RDRAND" in f_lines: - AVX11 = True; - answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16", "avx1.1-i64x4"] - if "AVX2.0" in f_lines: - AVX2 = True; - answer = answer + ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] - - answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] + # check what native targets do we have + if current_OS != "Windows": + try_do_LLVM("build check_ISA", "clang check_isa.cpp -o check_isa.exe", True) + else: + try_do_LLVM("build check_ISA", "cl check_isa.cpp", True) + SSE2 = ["sse2-i32x4", "sse2-i32x8"] + SSE4 = ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + AVX = ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] + AVX11 = ["avx1.1-i32x8","avx1.1-i32x16","avx1.1-i64x4"] + AVX2 = ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] + targets = [["AVX2", AVX2, False], 
["AVX1.1", AVX11, False], ["AVX", AVX, False], ["SSE4", SSE4, False], ["SSE2", SSE2, False]] + f_lines = take_lines("check_isa.exe", "first") + for i in range(0,5): + if targets[i][0] in f_lines: + for j in range(i,5): + answer = targets[j][1] + answer + targets[j][2] = True + break + if current_OS != "Windows": + answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] # now check what targets we have with the help of SDE sde_exists = "" PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + if current_OS == "Windows": + sde_n = "sde.exe" + else: + sde_n = "sde" for counter in PATH_dir: - if os.path.exists(counter + os.sep + "sde") and sde_exists == "": - sde_exists = counter + os.sep + "sde" + if os.path.exists(counter + os.sep + sde_n) and sde_exists == "": + sde_exists = counter + os.sep + sde_n if os.environ.get("SDE_HOME") != None: - if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): - sde_exists = os.environ.get("SDE_HOME") + os.sep + "sde" + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + sde_n): + sde_exists = os.environ.get("SDE_HOME") + os.sep + sde_n if sde_exists == "": error("you haven't got sde neither in SDE_HOME nor in your PATH.\n" + "To test all platforms please set SDE_HOME to path containing SDE.\n" + @@ -252,24 +248,38 @@ def check_targets(): # here we have SDE f_lines = take_lines(sde_exists + " -help", "all") for i in range(0,len(f_lines)): - if SSE4 == False and "wsm" in f_lines[i]: + if targets[3][2] == False and "wsm" in f_lines[i]: answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] - if AVX == False and "snb" in f_lines[i]: + if targets[2][2] == False and "snb" in f_lines[i]: answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] - if AVX11 == False and "ivb" in f_lines[i]: + if targets[1][2] == False and "ivb" in f_lines[i]: answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"], ["-ivb", "avx1.1-i64x4"]] - if AVX2 == False and "hsw" in f_lines[i]: + if targets[0][2] == False and "hsw" in f_lines[i]: answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"], ["-hsw", "avx2-i64x4"]] return [answer, answer_sde] def build_ispc(version_LLVM, make): current_path = os.getcwd() os.chdir(os.environ["ISPC_HOME"]) - p_temp = os.getenv("PATH") - os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] - try_do_LLVM("clean ISPC for building", "make clean", True) - try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", make, True) - os.environ["PATH"] = p_temp + if current_OS != "Windows": + p_temp = os.getenv("PATH") + os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] + try_do_LLVM("clean ISPC for building", "make clean", True) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", make, True) + os.environ["PATH"] = p_temp + else: + p_temp = os.getenv("LLVM_INSTALL_DIR") + v_temp = os.getenv("LLVM_VERSION") + os.environ["LLVM_INSTALL_DIR"] = os.environ["LLVM_HOME"] + "\\bin-" + version_LLVM + if version_LLVM == "3.3": + temp = "3_3" + if version_LLVM == "trunk": + temp = "3_4" + os.environ["LLVM_VERSION"] = "LLVM_" + temp + try_do_LLVM("clean ISPC for building", "msbuild ispc.vcxproj /t:clean", True) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", "msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release 
/t:rebuild", True) + os.environ["LLVM_INSTALL_DIR"] = p_temp + os.environ["LLVM_VERSION"] = v_temp os.chdir(current_path) def execute_stability(stability, R, print_version): @@ -305,9 +315,13 @@ def execute_stability(stability, R, print_version): def run_special_tests(): i = 5 +class options_for_drivers: + pass + def validation_run(only, only_targets, reference_branch, number, notify, update, speed_number, make, perf_llvm, time): os.chdir(os.environ["ISPC_HOME"]) - os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] + if current_OS != "Windows": + os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "notify_log.log") smtp_server = os.environ["SMTP_ISPC"] @@ -319,8 +333,6 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, print_debug("Folder: " + os.environ["ISPC_HOME"] + "\n", False, "") date = datetime.datetime.now() print_debug("Date: " + date.strftime('%H:%M %d/%m/%Y') + "\n", False, "") - class options_for_drivers: - pass # *** *** *** # Stability validation run # *** *** *** @@ -475,12 +487,14 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, performance = options_for_drivers() # performance constant options performance.number = number - performance.config = "./perf.ini" - performance.path = "./" + performance.config = "." + os.sep + "perf.ini" + performance.path = "." + os.sep performance.silent = True performance.output = "" performance.compiler = "" performance.ref = "ispc_ref" + if current_OS == "Windows": + performance.ref = "ispc_ref.exe" performance.in_file = "." + os.sep + f_date + os.sep + "performance.log" # prepare LLVM 3.3 as newest LLVM need_LLVM = check_LLVM(["3.3"]) @@ -502,7 +516,11 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, sys.stdout.write(".\n") build_ispc("3.3", make) sys.stdout.write(".\n") - os.rename("ispc", "ispc_ref") + if current_OS != "Windows": + os.rename("ispc", "ispc_ref") + else: + common.remove_if_exists("Release\\ispc_ref.exe") + os.rename("Release\\ispc.exe", "Release\\ispc_ref.exe") try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) if stashing: try_do_LLVM("return current branch ", "git stash pop", True) @@ -541,7 +559,6 @@ def Main(): global current_OS if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: current_OS = "Windows" - error("Windows isn't supported now", 1) else: if (platform.system() == 'Darwin'): current_OS = "MacOS" @@ -566,7 +583,9 @@ def Main(): for iterator in test_only: if not (" " + iterator + " " in test_only_r): error("unknow option for only: " + iterator, 1) - + if current_OS == "Windows": + if options.debug == True or options.selfbuild == True or options.tarball != "": + error("Debug, selfbuild and tarball options are unsupported on windows", 1) global f_date f_date = "logs" common.remove_if_exists(f_date) @@ -626,79 +645,80 @@ import common error = common.error take_lines = common.take_lines print_debug = common.print_debug -# parsing options -class MyParser(OptionParser): - def format_epilog(self, formatter): - return self.epilog -examples = ("Examples:\n" + -"Load and build LLVM from trunk\n\talloy.py -b\n" + -"Load and build LLVM 3.3. 
Rewrite LLVM folders\n\talloy.py -b --version=3.3 --force\n" + -"Untar files llvm.tgz clang.tgz, build LLVM from them in folder bin-from_tar\n\talloy.py -b --tarball='llvm.tgz clang.tgz' --folder=from_tar\n" + -"Load LLVM from trunk, revision r172870. Build it. Do selfbuild\n\talloy.py -b --revision=r172870 --selfbuild\n" + -"Validation run with LLVM 3.3, trunk; x86, x86-64; -O2;\nall supported targets; performance\n\talloy.py -r\n" + -"Validation run with all avx targets and sse4-i8x16 without performance\n\talloy.py -r --only=stability --only-targets='avx sse4-i8x16'\n" + -"Validation run with avx2-i32x8, all sse4 and sse2 targets\nand all targets with i32x16\n\talloy.py -r --only-targets='avx2-i32x8 sse4 i32x16 sse2'\n" + -"Stability validation run with LLVM 3.2, 3.3; -O0; x86,\nupdate fail_db.txt with passes and fails\n\talloy.py -r --only='3.2 -O0 stability 3.3 x86' --update-errors=FP\n" + -"Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" + -"Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" + -"Validation run. Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") -parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) -parser.add_option('-b', '--build-llvm', dest='build_llvm', - help='ask to build LLVM', default=False, action="store_true") -parser.add_option('-r', '--run', dest='validation_run', - help='ask for validation run', default=False, action="store_true") -parser.add_option('-j', dest='speed', - help='set -j for make', default="8") -# options for activity "build LLVM" -llvm_group = OptionGroup(parser, "Options for building LLVM", +if __name__ == '__main__': + # parsing options + class MyParser(OptionParser): + def format_epilog(self, formatter): + return self.epilog + examples = ("Examples:\n" + + "Load and build LLVM from trunk\n\talloy.py -b\n" + + "Load and build LLVM 3.3. Rewrite LLVM folders\n\talloy.py -b --version=3.3 --force\n" + + "Untar files llvm.tgz clang.tgz, build LLVM from them in folder bin-from_tar\n\talloy.py -b --tarball='llvm.tgz clang.tgz' --folder=from_tar\n" + + "Load LLVM from trunk, revision r172870. Build it. Do selfbuild\n\talloy.py -b --revision=r172870 --selfbuild\n" + + "Validation run with LLVM 3.3, trunk; x86, x86-64; -O2;\nall supported targets; performance\n\talloy.py -r\n" + + "Validation run with all avx targets and sse4-i8x16 without performance\n\talloy.py -r --only=stability --only-targets='avx sse4-i8x16'\n" + + "Validation run with avx2-i32x8, all sse4 and sse2 targets\nand all targets with i32x16\n\talloy.py -r --only-targets='avx2-i32x8 sse4 i32x16 sse2'\n" + + "Stability validation run with LLVM 3.2, 3.3; -O0; x86,\nupdate fail_db.txt with passes and fails\n\talloy.py -r --only='3.2 -O0 stability 3.3 x86' --update-errors=FP\n" + + "Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" + + "Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" + + "Validation run. 
Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") + parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) + parser.add_option('-b', '--build-llvm', dest='build_llvm', + help='ask to build LLVM', default=False, action="store_true") + parser.add_option('-r', '--run', dest='validation_run', + help='ask for validation run', default=False, action="store_true") + parser.add_option('-j', dest='speed', + help='set -j for make', default="8") + # options for activity "build LLVM" + llvm_group = OptionGroup(parser, "Options for building LLVM", "These options must be used with -b option.") -llvm_group.add_option('--version', dest='version', - help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk") -llvm_group.add_option('--revision', dest='revision', - help='revision of llvm to build in format r172870', default="") -llvm_group.add_option('--debug', dest='debug', - help='debug build of LLVM?', default=False, action="store_true") -llvm_group.add_option('--folder', dest='folder', - help='folder to build LLVM in', default="") -llvm_group.add_option('--tarball', dest='tarball', - help='"llvm_tarball clang_tarball"', default="") -llvm_group.add_option('--selfbuild', dest='selfbuild', - help='make selfbuild of LLVM and clang', default=False, action="store_true") -llvm_group.add_option('--force', dest='force', - help='rebuild LLVM', default=False, action='store_true') -llvm_group.add_option('--extra', dest='extra', - help='load extra clang tools', default=False, action='store_true') -parser.add_option_group(llvm_group) -# options for activity "validation run" -run_group = OptionGroup(parser, "Options for validation run", + llvm_group.add_option('--version', dest='version', + help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk") + llvm_group.add_option('--revision', dest='revision', + help='revision of llvm to build in format r172870', default="") + llvm_group.add_option('--debug', dest='debug', + help='debug build of LLVM?', default=False, action="store_true") + llvm_group.add_option('--folder', dest='folder', + help='folder to build LLVM in', default="") + llvm_group.add_option('--tarball', dest='tarball', + help='"llvm_tarball clang_tarball"', default="") + llvm_group.add_option('--selfbuild', dest='selfbuild', + help='make selfbuild of LLVM and clang', default=False, action="store_true") + llvm_group.add_option('--force', dest='force', + help='rebuild LLVM', default=False, action='store_true') + llvm_group.add_option('--extra', dest='extra', + help='load extra clang tools', default=False, action='store_true') + parser.add_option_group(llvm_group) + # options for activity "validation run" + run_group = OptionGroup(parser, "Options for validation run", "These options must be used with -r option.") -run_group.add_option('--compare-with', dest='branch', - help='set performance reference point. Dafault: master', default="master") -run_group.add_option('--number', dest='number_for_performance', - help='number of performance runs for each test. Default: 5', default=5) -run_group.add_option('--notify', dest='notify', - help='email to sent results to', default="") -run_group.add_option('--update-errors', dest='update', - help='rewrite fail_db.txt file according to received results (F or FP)', default="") -run_group.add_option('--only-targets', dest='only_targets', - help='set list of targets to test. 
Possible values - all subnames of targets.', - default="") -run_group.add_option('--time', dest='time', - help='display time of testing', default=False, action='store_true') -run_group.add_option('--only', dest='only', - help='set types of tests. Possible values:\n' + - '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + - 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', + run_group.add_option('--compare-with', dest='branch', + help='set performance reference point. Dafault: master', default="master") + run_group.add_option('--number', dest='number_for_performance', + help='number of performance runs for each test. Default: 5', default=5) + run_group.add_option('--notify', dest='notify', + help='email to sent results to', default="") + run_group.add_option('--update-errors', dest='update', + help='rewrite fail_db.txt file according to received results (F or FP)', default="") + run_group.add_option('--only-targets', dest='only_targets', + help='set list of targets to test. Possible values - all subnames of targets.', default="") -run_group.add_option('--perf_LLVM', dest='perf_llvm', - help='compare LLVM 3.3 with "--compare-with", default trunk', default=False, action='store_true') -parser.add_option_group(run_group) -# options for activity "setup PATHS" -setup_group = OptionGroup(parser, "Options for setup", + run_group.add_option('--time', dest='time', + help='display time of testing', default=False, action='store_true') + run_group.add_option('--only', dest='only', + help='set types of tests. Possible values:\n' + + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + + 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', + default="") + run_group.add_option('--perf_LLVM', dest='perf_llvm', + help='compare LLVM 3.3 with "--compare-with", default trunk', default=False, action='store_true') + parser.add_option_group(run_group) + # options for activity "setup PATHS" + setup_group = OptionGroup(parser, "Options for setup", "These options must be use with -r or -b to setup environment variables") -setup_group.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") -setup_group.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") -setup_group.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") -parser.add_option_group(setup_group) -(options, args) = parser.parse_args() -Main() + setup_group.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") + setup_group.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") + setup_group.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") + parser.add_option_group(setup_group) + (options, args) = parser.parse_args() + Main() diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj index 48e26e40..a5b354ce 100644 --- a/examples/aobench/aobench.vcxproj +++ b/examples/aobench/aobench.vcxproj @@ -26,15 +26,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h 
--target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h @@ -44,6 +44,7 @@ {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} Win32Proj aobench + ispc diff --git a/examples/aobench_instrumented/aobench_instrumented.vcxproj b/examples/aobench_instrumented/aobench_instrumented.vcxproj index d54332b6..5247762c 100644 --- a/examples/aobench_instrumented/aobench_instrumented.vcxproj +++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj @@ -26,15 +26,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2 $(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h $(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2 $(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h $(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h @@ -44,6 +44,7 @@ {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958} Win32Proj aobench_instrumented + ispc @@ -171,4 +172,4 @@ - \ No newline at end of file + diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj index 
9a2a64bf..94e38540 100755 --- a/examples/deferred/deferred_shading.vcxproj +++ b/examples/deferred/deferred_shading.vcxproj @@ -22,6 +22,7 @@ {87f53c53-957e-4e91-878a-bc27828fb9eb} Win32Proj mandelbrot + ispc @@ -158,15 +159,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/examples/examples.sln b/examples/examples.sln index e9992f76..2285f6a6 100755 --- a/examples/examples.sln +++ b/examples/examples.sln @@ -25,6 +25,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferre EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sort", "sort\sort.vcxproj", "{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 @@ -129,6 +131,14 @@ Global {D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32 {D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64 {D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Debug|Win32.ActiveCfg = Debug|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Debug|Win32.Build.0 = Debug|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Debug|x64.ActiveCfg = Debug|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Debug|x64.Build.0 = Debug|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Release|Win32.ActiveCfg = Release|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Release|Win32.Build.0 = Release|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Release|x64.ActiveCfg = Release|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git 
a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj index 7b78d1dd..1b6f1281 100644 --- a/examples/mandelbrot/mandelbrot.vcxproj +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -22,6 +22,7 @@ {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} Win32Proj mandelbrot + ispc @@ -155,15 +156,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index 3a8fca79..fbebdc32 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -22,6 +22,7 @@ {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj mandelbrot_tasks + ispc @@ -160,15 +161,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - ispc -O2 %(Filename).ispc 
-o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj index 4e983759..01456625 100644 --- a/examples/noise/noise.vcxproj +++ b/examples/noise/noise.vcxproj @@ -22,6 +22,7 @@ {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} Win32Proj noise + ispc @@ -155,15 +156,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj index b029b598..77fb9353 100644 --- a/examples/options/options.vcxproj +++ b/examples/options/options.vcxproj @@ -22,6 +22,7 @@ {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} Win32Proj options + ispc @@ -160,15 +161,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h 
$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/examples/perfbench/perfbench.vcxproj b/examples/perfbench/perfbench.vcxproj index 31974ac7..d94b753c 100644 --- a/examples/perfbench/perfbench.vcxproj +++ b/examples/perfbench/perfbench.vcxproj @@ -22,6 +22,7 @@ {d923bb7e-a7c8-4850-8fcf-0eb9ce35b4e8} Win32Proj perfbench + ispc @@ -155,15 +156,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj index 4cfefb81..19d40192 100644 --- a/examples/rt/rt.vcxproj +++ b/examples/rt/rt.vcxproj @@ -22,6 +22,7 @@ {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} Win32Proj rt + ispc @@ -152,18 +153,18 @@ Document -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx -ispc -O2 %(Filename).ispc -o 
$(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/examples/simple/simple.vcxproj b/examples/simple/simple.vcxproj index 65af97bb..34908223 100644 --- a/examples/simple/simple.vcxproj +++ b/examples/simple/simple.vcxproj @@ -25,18 +25,18 @@ Document -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2 +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2 -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2 +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2 +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2 -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2 +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h @@ -46,6 +46,7 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena {947C5311-8B78-4D05-BEE4-BCF342D4B367} Win32Proj simple + ispc diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index f5e4264a..20221d90 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -78,7 +78,7 @@ int main (int argc, char *argv[]) for (i = 0; i < m; i ++) { - for (j = 0; j < n; j ++) code [j] = random() % l; + for (j = 0; j < n; j ++) code [j] = rand() % l; reset_and_start_timer(); @@ -96,7 +96,7 @@ int main (int argc, char *argv[]) for (i = 0; i < m; i ++) { - for (j = 0; j < n; j ++) code [j] = random() % l; + for (j = 0; j < n; j ++) code [j] = rand() % l; reset_and_start_timer(); @@ -114,7 +114,7 @@ int main (int argc, char *argv[]) for (i = 0; i < m; i 
++) { - for (j = 0; j < n; j ++) code [j] = random() % l; + for (j = 0; j < n; j ++) code [j] = rand() % l; reset_and_start_timer(); diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj new file mode 100644 index 00000000..b37eab1c --- /dev/null +++ b/examples/sort/sort.vcxproj @@ -0,0 +1,177 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2} + Win32Proj + sort + ispc + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + $(ProjectDir)..\..;$(ExecutablePath) + + + true + $(ProjectDir)..\..;$(ExecutablePath) + + + false + $(ProjectDir)..\..;$(ExecutablePath) + + + false + $(ProjectDir)..\..;$(ExecutablePath) + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + + + + + + Document + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + + + + + + diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj index ce5d7979..a96a187d 100644 --- a/examples/stencil/stencil.vcxproj +++ b/examples/stencil/stencil.vcxproj @@ -22,6 +22,7 @@ {2ef070a1-f62f-4e6a-944b-88d140945c3c} Win32Proj rt + ispc @@ -152,18 +153,18 @@ Document -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx 
$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index 908cf734..d3594b98 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -22,6 +22,7 @@ {dee5733a-e93e-449d-9114-9bffcaeb4df9} Win32Proj volume + ispc @@ -156,15 +157,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/perf.py b/perf.py index 2b5c6edd..7e8b3cff 100755 --- a/perf.py +++ b/perf.py @@ -42,6 +42,9 @@ def print_file(line): def build_test(commands): os.system(commands[4]) test = os.system(commands[1]) + if is_windows: + common.remove_if_exists(".\\X64\\Release1") + os.rename(".\\X64\\Release", ".\\X64\\Release1") if options.ref: ref = os.system(commands[3]) return (options.ref and ref) or 
test @@ -156,16 +159,16 @@ def cpu_check(): R = c_line.split(' ') cpu_percent = float(R[1]) * 3 else: - os.system("wmic cpu get loadpercentage /value > cpu_temp") - c = open("cpu_temp", 'r') + os.system("wmic cpu get loadpercentage /value > cpu_temp") + c = open("cpu_temp", 'r') c_lines = c.readlines() - c.close() - os.remove("cpu_temp") - t = "0" - for i in c_lines[2]: + c.close() + os.remove("cpu_temp") + t = "0" + for i in c_lines[2]: if i.isdigit(): t = t + i - cpu_percent = int(t) + cpu_percent = int(t) return cpu_percent #returns geomean of list @@ -345,9 +348,14 @@ def perf(options1, args): if options.ref != "": options.ref = True if os.environ.get("ISPC_HOME") != None: - if os.path.exists(os.environ["ISPC_HOME"] + os.sep + ispc_test): - ispc_test_exists = True - ispc_test = os.environ["ISPC_HOME"] + os.sep + ispc_test + if is_windows == False: + if os.path.exists(os.environ["ISPC_HOME"] + os.sep + ispc_test): + ispc_test_exists = True + ispc_test = os.environ["ISPC_HOME"] + os.sep + ispc_test + else: + if os.path.exists(os.environ["ISPC_HOME"] + "\\Release\\" + ispc_test): + ispc_test_exists = True + ispc_test = os.environ["ISPC_HOME"] + "\\Release\\" + ispc_test for counter in PATH_dir: if ispc_test_exists == False: if os.path.exists(counter + os.sep + ispc_test): @@ -419,9 +427,9 @@ def perf(options1, args): re_command = "make clean >> "+build_log else: ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" - ex_command = "x64\\Release\\test.exe " + command + " >> " + perf_temp + "_test" - bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /t:rebuild >> " + build_log - bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /t:rebuild >> " + build_log + ex_command = "x64\\Release1\\test.exe " + command + " >> " + perf_temp + "_test" + bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /p:ISPC_compiler=ispc_ref /t:rebuild >> " + build_log + bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc /t:rebuild >> " + build_log re_command = "msbuild /t:clean >> " + build_log commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] # parsing config parameters From 015af03bdcdba801e6e7873c33ae35d9b881e730 Mon Sep 17 00:00:00 2001 From: evghenii Date: Tue, 5 Nov 2013 15:41:14 +0100 Subject: [PATCH 125/159] changed back to #define ISPC_FORCE_ALIGNED_MEMORY aligned_ld/st #else unaligned ld/st #endif. However load<64>/store<64> will still be unaliged w/o this define because of fails related to the issue #632 --- examples/intrinsics/knc-i1x16.h | 65 +++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index d6bf6fd5..ef14d26e 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -451,6 +451,17 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ return ret; \ } \ +#define SHIFT(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int modIndex = i+index; \ + STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? 
v[modIndex] : 0; \ + ret[i] = val; \ + } \ + return ret; \ +} \ + /* knc::macro::used */ #define SHUFFLES(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ @@ -566,6 +577,7 @@ SETZERO(__vec16_i8, i8) UNDEF(__vec16_i8, i8) BROADCAST(__vec16_i8, i8, int8_t) ROTATE(__vec16_i8, i8, int8_t) +SHIFT(__vec16_i8, i8, int8_t) SHUFFLES(__vec16_i8, i8, int8_t) LOAD_STORE(__vec16_i8, int8_t) @@ -612,6 +624,7 @@ SETZERO(__vec16_i16, i16) UNDEF(__vec16_i16, i16) BROADCAST(__vec16_i16, i16, int16_t) ROTATE(__vec16_i16, i16, int16_t) +SHIFT(__vec16_i16, i16, int16_t) SHUFFLES(__vec16_i16, i16, int16_t) LOAD_STORE(__vec16_i16, int16_t) @@ -688,6 +701,8 @@ static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v); } +SHIFT(__vec16_i32, i32, int32_t) + static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v); @@ -704,8 +719,9 @@ static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __ template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED - return __load<64>(p); +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __load<64>(p); + return _mm512_load_epi32(p); #else __vec16_i32 v; v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); @@ -716,8 +732,9 @@ template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED - __store<64>(p,v); +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // __store<64>(p,v); + _mm512_store_epi32(p, v); #else _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); @@ -942,6 +959,8 @@ static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int inde const __vec16_i32 ret_lo = __rotate_i32(v_lo, index); return CASTI2L(ret_hi, ret_lo); } +SHIFT(__vec16_i64, i64, int64_t) + static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index) { CASTL2I(_v, v_hi, v_lo); @@ -962,8 +981,11 @@ static FORCEINLINE __vec16_i64 __shuffle2_double(__vec16_i64 _v0, __vec16_i64 _v template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED - return __load<128>(p); +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __load<128>(p); + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); #else __vec16_i32 v1; __vec16_i32 v2; @@ -978,8 +1000,12 @@ template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED - return __store<128>(p,v); +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // __store<128>(p,v); + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_store_epi64(p, v2); + _mm512_store_epi64(((uint8_t*)p)+64, v1); #else __m512i v1 = v.v2; __m512i v2 = v.v1; @@ -1063,6 +1089,7 @@ static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index) const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v)); } 
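+// Clarifying note (editorial, not from the original patch): unlike
+// __rotate_float() above, which wraps lane indices mod 16, the SHIFT-generated
+// __shift_* functions below drop lanes moved past either end and zero-fill.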
+SHIFT(__vec16_f, float, float) static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); @@ -1081,8 +1108,9 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __ve template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED - return __load<64>(p); +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __load<64>(p); + return _mm512_load_ps(p); #else __vec16_f v; v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); @@ -1093,8 +1121,9 @@ template static FORCEINLINE __vec16_f __load(const __vec16_f *p) template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED - __store<64>(p,v); +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // __store<64>(p,v); + _mm512_store_ps(p, v); #else _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); @@ -1333,6 +1362,7 @@ static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index const __vec16_f ret_lo = __rotate_float(v_lo, index); return CASTF2D(ret_hi, ret_lo); } +SHIFT(__vec16_d, double, double) static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index) { CASTD2F(_v, v_hi, v_lo); @@ -1353,8 +1383,9 @@ static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, con template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ { -#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED - return __load<128>(p); +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __load<128>(p); + return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); #else __vec16_d ret; ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); @@ -1367,8 +1398,10 @@ template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED - return __store<128>(p,v); +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __store<128>(p,v); + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); #else _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); From 017e7890f7748d4f5fb1794bef532ec7c51cf70c Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sun, 10 Nov 2013 02:58:48 +0400 Subject: [PATCH 126/159] Examples makefiles to support setting single target via ISPC_IA_TARGETS --- examples/aobench/Makefile | 2 +- examples/common.mk | 24 +++++++++++++++++++++--- examples/deferred/Makefile | 2 +- examples/gmres/Makefile | 2 +- examples/mandelbrot/Makefile | 2 +- examples/mandelbrot_tasks/Makefile | 2 +- examples/noise/Makefile | 2 +- examples/options/Makefile | 2 +- examples/perfbench/Makefile | 2 +- examples/rt/Makefile | 2 +- examples/sort/Makefile | 2 +- examples/stencil/Makefile | 2 +- examples/volume_rendering/Makefile | 2 +- 13 files changed, 33 insertions(+), 15 deletions(-) diff --git a/examples/aobench/Makefile b/examples/aobench/Makefile index 7aba4f01..c8122c07 100644 --- a/examples/aobench/Makefile +++ b/examples/aobench/Makefile @@ -2,7 +2,7 @@ EXAMPLE=ao CPP_SRC=ao.cpp ao_serial.cpp ISPC_SRC=ao.ispc -ISPC_IA_TARGETS=sse2,sse4,avx 
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/common.mk b/examples/common.mk index db7b8eee..0f375f29 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -16,8 +16,26 @@ ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) - ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ - $(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o) + ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o) + COMMA=, + ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS))) + #$(info multi-target detected: $(ISPC_IA_TARGETS)) + ifneq (,$(findstring sse2-,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o) + endif + ifneq (,$(findstring sse4-,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o) + endif + ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o) + endif + ifneq (,$(findstring avx1.1-,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o) + endif + ifneq (,$(findstring avx2-,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o) + endif + endif ISPC_TARGETS=$(ISPC_IA_TARGETS) ARCH_BIT:=$(shell getconf LONG_BIT) ifeq ($(ARCH_BIT),32) @@ -68,7 +86,7 @@ objs/%.o: ../%.cpp dirs objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h -objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc +objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) diff --git a/examples/deferred/Makefile b/examples/deferred/Makefile index 09fa56f0..be8ce7c4 100644 --- a/examples/deferred/Makefile +++ b/examples/deferred/Makefile @@ -2,7 +2,7 @@ EXAMPLE=deferred_shading CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp ISPC_SRC=kernels.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon ISPC_FLAGS=--opt=fast-math diff --git a/examples/gmres/Makefile b/examples/gmres/Makefile index 5b57cbf8..07765069 100644 --- a/examples/gmres/Makefile +++ b/examples/gmres/Makefile @@ -3,7 +3,7 @@ EXAMPLE=gmres CPP_SRC=algorithm.cpp main.cpp matrix.cpp CC_SRC=mmio.c ISPC_SRC=matrix.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/mandelbrot/Makefile b/examples/mandelbrot/Makefile index 7e83e618..d225037d 100644 --- a/examples/mandelbrot/Makefile +++ b/examples/mandelbrot/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp ISPC_SRC=mandelbrot.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 1a565ffd..51866b32 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/noise/Makefile b/examples/noise/Makefile index 58d1cf3b..6dd5ae42 100644 --- a/examples/noise/Makefile +++ b/examples/noise/Makefile @@ -2,7 +2,7 @@ EXAMPLE=noise CPP_SRC=noise.cpp noise_serial.cpp ISPC_SRC=noise.ispc -ISPC_IA_TARGETS=sse2,sse4,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/options/Makefile b/examples/options/Makefile index 11d3d790..2da7e01a 100644 --- a/examples/options/Makefile +++ b/examples/options/Makefile @@ -2,7 +2,7 @@ EXAMPLE=options CPP_SRC=options.cpp options_serial.cpp ISPC_SRC=options.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/perfbench/Makefile b/examples/perfbench/Makefile index 02507c84..cc2e681f 100644 --- a/examples/perfbench/Makefile +++ b/examples/perfbench/Makefile @@ -2,7 +2,7 @@ EXAMPLE=perbench CPP_SRC=perfbench.cpp perfbench_serial.cpp ISPC_SRC=perfbench.ispc -ISPC_IA_TARGETS=sse2,sse4,avx +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/rt/Makefile b/examples/rt/Makefile index 647086cb..e52b02e4 100644 --- a/examples/rt/Makefile +++ b/examples/rt/Makefile @@ -2,7 +2,7 @@ EXAMPLE=rt CPP_SRC=rt.cpp rt_serial.cpp ISPC_SRC=rt.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/sort/Makefile b/examples/sort/Makefile index cf6bffa4..6ae43fae 100644 --- a/examples/sort/Makefile +++ b/examples/sort/Makefile @@ -2,7 +2,7 @@ EXAMPLE=sort CPP_SRC=sort.cpp sort_serial.cpp ISPC_SRC=sort.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon #ISPC_FLAGS=-DDEBUG diff --git a/examples/stencil/Makefile b/examples/stencil/Makefile index 097cd597..1b9c2717 100644 --- a/examples/stencil/Makefile +++ b/examples/stencil/Makefile @@ -2,7 +2,7 @@ EXAMPLE=stencil CPP_SRC=stencil.cpp stencil_serial.cpp ISPC_SRC=stencil.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/volume_rendering/Makefile b/examples/volume_rendering/Makefile index 7bb86e10..1bc81e4e 100644 --- a/examples/volume_rendering/Makefile +++ b/examples/volume_rendering/Makefile @@ -2,7 +2,7 @@ EXAMPLE=volume CPP_SRC=volume.cpp volume_serial.cpp ISPC_SRC=volume.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk From fbab9874f638a1c11e5bb1a4b2343a8846b154e3 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sun, 10 Nov 2013 23:47:19 +0400 Subject: [PATCH 127/159] perf.py - target switch was added --- perf.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/perf.py b/perf.py index 7e8b3cff..d7482fab 100755 --- a/perf.py +++ b/perf.py @@ -391,6 +391,9 @@ def perf(options1, args): # end of preparations print_debug("Okey go go go!\n\n", s, perf_log) + # report command line + print_debug("Command line: %s\n" % " ".join(map(str, sys.argv)), s, perf_log) + # report used ispc print_debug("Testing ispc: " + ispc_test + "\n", s, perf_log) #print compilers versions @@ -419,11 +422,15 @@ def 
perf(options1, args): # read parameters of test command = lines[i+2] command = command[:-1] + # handle conditional target argument + target_str = "" + if options.target != "": + target_str = " ISPC_IA_TARGETS="+options.target if is_windows == False: ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" ex_command = "./test " + command + " >> " + perf_temp + "_test" - bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+" >> "+build_log+" 2>> "+build_log - bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+" >> "+build_log+" 2>> "+build_log + bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+target_str+" >> "+build_log+" 2>> "+build_log + bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+target_str+" >> "+build_log+" 2>> "+build_log re_command = "make clean >> "+build_log else: ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" @@ -503,5 +510,7 @@ if __name__ == "__main__": help='set reference compiler for compare', default="") parser.add_option('-f', '--file', dest='in_file', help='file to save perf output', default="") + parser.add_option('-t', '--target', dest='target', + help='set ispc target for building benchmarks (both test and ref)', default="") (options, args) = parser.parse_args() perf(options, args) From ffc9a33933987b71d245adb208d180805b64cb9e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sun, 10 Nov 2013 23:48:49 +0400 Subject: [PATCH 128/159] avx1-i32x4 implementation as sse4-i32x4 with avx target-feature flag --- builtins.cpp | 21 ++++++++++++++++----- ispc.cpp | 42 +++++++++++++++++++++++++++++++++++------- ispc.h | 8 +++++++- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index 730e315c..2c9703c6 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -942,11 +942,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod case Target::AVX: { switch (g->target->getVectorWidth()) { case 4: - if (runtime32) { - EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); - } - else { - EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); + if (g->target->getDataTypeWidth() == 32) { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_64bit); + } + } else if (g->target->getDataTypeWidth() == 64) { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); + } + } else { + FATAL("logic error in DefineStdlib"); } break; case 8: diff --git a/ispc.cpp b/ispc.cpp index 859865a5..cb70b879 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -169,7 +169,7 @@ static const char *supportedCPUs[] = { , "core-avx-i", "core-avx2" #endif // LLVM 3.2+ #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) - , "slm" + , "slm" #endif // LLVM 3.4+ }; @@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), + m_dataTypeWidth(-1), m_vectorWidth(-1), m_generatePIC(pic), m_maskingIsFree(false), @@ -308,6 +309,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" #if 
defined(LLVM_3_4) @@ -323,6 +325,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" #if defined(LLVM_3_4) @@ -338,11 +341,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - ",+sse4.1,-sse4.2" + ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" #endif @@ -355,10 +359,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - ",+sse4.1,-sse4.2" + ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" #endif @@ -369,10 +374,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "sse4-i8x16")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; + this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - ",+sse4.1,-sse4.2" + ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" #endif @@ -383,10 +389,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "sse4-i16x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - ",+sse4.1,-sse4.2" + ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" #endif @@ -457,11 +464,21 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } + else if (!strcasecmp(isa, "avx1-i32x4")) { + this->m_isa = Target::AVX; + this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; + this->m_vectorWidth = 4; + this->m_attributes = "+avx,+popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 32; + } else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1") || !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; @@ -471,6 +488,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1-i64x4")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; @@ -481,6 +499,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; @@ -490,6 +509,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : 
!strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) @@ -510,6 +530,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) @@ -517,7 +538,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #else ",+rdrand" #endif - ; + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -529,6 +550,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "avx1.1-i64x4")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) @@ -536,7 +558,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #else ",+rdrand" #endif - ; + ; this->m_maskingIsFree = false; this->m_maskBitCount = 64; this->m_hasHalf = true; @@ -549,6 +571,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) @@ -573,6 +596,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) @@ -596,6 +620,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "avx2-i64x4")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) @@ -620,6 +645,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "neon-i8x16")) { this->m_isa = Target::NEON8; this->m_nativeVectorWidth = 16; + this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+neon,+fp16"; this->m_hasHalf = true; // ?? @@ -629,6 +655,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "neon-i16x8")) { this->m_isa = Target::NEON16; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+neon,+fp16"; this->m_hasHalf = true; // ?? @@ -639,6 +666,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "neon-i32x4")) { this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; this->m_hasHalf = true; // ?? 
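A quick sketch of the naming convention these targets follow (illustrative only; TargetWidths and the variable names below are hypothetical, not from the patch): an ISA string of the form <isa>-i<N>x<M> selects N-bit mask/data elements and M program instances, and the Target constructor above stores that split separately from the hardware's native register width:

    // Illustrative C++ sketch of the width triples set in Target::Target()
    struct TargetWidths { int nativeVectorWidth, dataTypeWidth, vectorWidth; };
    // avx1-i32x8: 8 floats per AVX register, 32-bit elements, 8-wide program
    const TargetWidths avx1_i32x8 = { 8, 32, 8 };
    // avx1-i32x4: same native width, but only a 4-wide program
    const TargetWidths avx1_i32x4 = { 8, 32, 4 };
    // avx1-i64x4: 64-bit elements, so 4 programs fill the full register
    const TargetWidths avx1_i64x4 = { 8, 64, 4 };
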
diff --git a/ispc.h b/ispc.h index 82cb9050..c74ff347 100644 --- a/ispc.h +++ b/ispc.h @@ -253,6 +253,8 @@ public: int getNativeVectorWidth() const {return m_nativeVectorWidth;} + int getDataTypeWidth() const {return m_dataTypeWidth;} + int getVectorWidth() const {return m_vectorWidth;} bool getGeneratePIC() const {return m_generatePIC;} @@ -319,10 +321,14 @@ private: #endif /** Native vector width of the vector instruction set. Note that this - value is directly derived from the ISA Being used (e.g. it's 4 for + value is directly derived from the ISA being used (e.g. it's 4 for SSE, 8 for AVX, etc.) */ int m_nativeVectorWidth; + /** Data type width in bits. Typically it's 32, but could be 8, 16 or 64. + For generic it's -1, which means undefined. */ + int m_dataTypeWidth; + /** Actual vector width currently being compiled to. This may be an integer multiple of the native vector width, for example if we're "doubling up" and compiling 8-wide on a 4-wide SSE system. */ From af5895514020e14a84676c55c5755d87d53129f6 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 12 Nov 2013 10:00:42 +0400 Subject: [PATCH 129/159] target-[sse4|avx]-common.ll are twin brothers that differ only cosmetically. This commit makes them diffable. No real changes, except adding alwaysinline to the SSE version of __max_uniform_int32/__max_uniform_uint32 --- builtins/target-avx-common.ll | 102 +++++++++++++++++---------------- builtins/target-sse4-common.ll | 53 +++++++++-------- 2 files changed, 80 insertions(+), 75 deletions(-) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index 1d317713..41692823 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -37,24 +37,6 @@ define_prefetches() define_shuffles() aossoa() -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; rcp - -declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone - -define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { -; uniform float iv = extract(__rcp_u(v), 0); -; return iv * (2. - v * iv); - %vecval = insertelement <4 x float> undef, float %0, i32 0 - %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) - %scall = extractelement <4 x float> %call, i32 0 - - ; do one N-R iteration - %v_iv = fmul float %0, %scall - %two_minus = fsub float 2., %v_iv - %iv_mul = fmul float %scall, %two_minus - ret float %iv_mul -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats @@ -77,7 +59,8 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline { ; r3 = a3 ; ; It doesn't matter what we pass as a, since we only need the r0 value - ; here. So we pass the same register for both. + ; here. So we pass the same register for both. Further, only the 0th + ; element of the b parameter matters %xi = insertelement <4 x float> undef, float %0, i32 0 %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) %rs = extractelement <4 x float> %xr, i32 0 @@ -117,7 +100,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline { define double @__floor_uniform_double(double) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... 
%xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) %rs = extractelement <2 x double> %xr, i32 0 ret double %rs @@ -126,12 +109,31 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline { define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) %rs = extractelement <2 x double> %xr, i32 0 ret double %rs } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + +define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { + ; do the rcpss call + ; uniform float iv = extract(__rcp_u(v), 0); + ; return iv * (2. - v * iv); + %vecval = insertelement <4 x float> undef, float %0, i32 0 + %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) + %scall = extractelement <4 x float> %call, i32 0 + + ; do one N-R iteration to improve precision, as above + %v_iv = fmul float %0, %scall + %two_minus = fsub float 2., %v_iv + %iv_mul = fmul float %scall, %two_minus + ret float %iv_mul +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rsqrt @@ -144,6 +146,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v) %is = extractelement <4 x float> %vis, i32 0 + ; Newton-Raphson iteration to improve precision ; return 0.5 * is * (3. 
- (v * is) * is); %v_is = fmul float %0, %is %v_is_is = fmul float %v_is, %is @@ -164,9 +167,18 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { ret float %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +define double @__sqrt_uniform_double(double) nounwind alwaysinline { + sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) + ret double %ret +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; fastmath +;; fast math mode declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind @@ -200,6 +212,22 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline { ret float %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone + +define double @__min_uniform_double(double, double) nounwind readnone alwaysinline { + sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) + ret double %ret +} + +define double @__max_uniform_double(double, double) nounwind readnone alwaysinline { + sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) + ret double %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -235,7 +263,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; horizontal ops +;; horizontal ops / reductions declare i32 @llvm.ctpop.i32(i32) nounwind readnone @@ -251,32 +279,6 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision sqrt - -declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone - -define double @__sqrt_uniform_double(double) nounwind alwaysinline { - sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) - ret double %ret -} - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision min/max - -declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone - -define double @__min_uniform_double(double, double) nounwind readnone alwaysinline { - sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) - ret double %ret -} - -define double @__max_uniform_double(double, double) nounwind readnone alwaysinline { - sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) - ret double %ret -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int8/int16 builtins diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 4b8751b5..50dd0582 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2013, Intel Corporation ;; All rights reserved. 
;; ;; Redistribution and use in source and binary forms, with or without @@ -29,6 +29,9 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; SSE4 target implementation. + ctlztz() define_prefetches() define_shuffles() @@ -67,7 +70,7 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline { define float @__floor_uniform_float(float) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... %xi = insertelement <4 x float> undef, float %0, i32 0 - ; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9 + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) %rs = extractelement <4 x float> %xr, i32 0 ret float %rs @@ -97,7 +100,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline { define double @__floor_uniform_double(double) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) %rs = extractelement <2 x double> %xr, i32 0 ret double %rs @@ -106,7 +109,7 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline { define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) %rs = extractelement <2 x double> %xr, i32 0 ret double %rs @@ -119,6 +122,8 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { ; do the rcpss call + ; uniform float iv = extract(__rcp_u(v), 0); + ; return iv * (2. 
- v * iv); %vecval = insertelement <4 x float> undef, float %0, i32 0 %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) %scall = extractelement <4 x float> %call, i32 0 @@ -130,9 +135,8 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { ret float %iv_mul } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; rsqrt +;; rsqrt declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone @@ -154,7 +158,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; sqrt +;; sqrt declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone @@ -163,6 +167,16 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { ret float %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +define double @__sqrt_uniform_double(double) nounwind alwaysinline { + sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) + ret double %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; fast math mode @@ -198,36 +212,25 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline { ret float %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision sqrt - -declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone - -define double @__sqrt_uniform_double(double) nounwind alwaysinline { - sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) - ret double %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; double precision min/max declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone -define double @__min_uniform_double(double, double) nounwind readnone { +define double @__min_uniform_double(double, double) nounwind readnone alwaysinline { sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) ret double %ret } - -define double @__max_uniform_double(double, double) nounwind readnone { +define double @__max_uniform_double(double, double) nounwind readnone alwaysinline { sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; int32 min/max +;; int min/max declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone @@ -242,8 +245,9 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { ret i32 %ret } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; unsigned int min/max +;; unsigned int min/max declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone @@ -258,9 +262,8 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { ret i32 %ret } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; horizontal ops / reductions +;; horizontal ops / reductions declare i32 @llvm.ctpop.i32(i32) nounwind readnone From d2c7b356cc852af50780f513c6a09c946cc257ff Mon Sep 17 00:00:00 
2001 From: Dmitry Babokin Date: Tue, 12 Nov 2013 14:56:52 +0400 Subject: [PATCH 130/159] Reordering functions in target-[avx|sse4].ll to be in the same order. No real changes, except adding a few alwaysinline in the SSE4 target --- builtins/target-avx-common.ll | 4 + builtins/target-avx.ll | 221 +++++++++++++++++----------------- builtins/target-sse4.ll | 206 +++++++++++++++++-------------- 3 files changed, 231 insertions(+), 200 deletions(-) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index 41692823..1c467476 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -31,6 +31,10 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; AVX target implementation. +;; +;; Please note that this file uses SSE intrinsics, but LLVM generates AVX +;; instructions, so it doesn't make sense to change this implementation. + ctlztz() define_prefetches() diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 196e5ea4..e98a3843 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -49,11 +49,10 @@ include(`target-avx-common.ll') declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; do one N-R iteration to improve precision ; float iv = __rcp_v(v); ; return iv * (2. - v * iv); - %call = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %0) - ; do one N-R iteration %v_iv = fmul <8 x float> %0, %call %two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2., float 2., float 2., float 2., float 2.>, %v_iv @@ -61,6 +60,46 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinl ret <8 x float> %iv_mul } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rsqrt + +declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone + +define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + %is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sqrt + +declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline { + unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0) + ret <8 x double> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats @@ -94,58 +133,15 @@ define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwa } define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { - ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9 + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 round4to8double(%0, 9) } - define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { - ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10 + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 round4to8double(%0, 10) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; rsqrt - -declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone - -define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline { - ; float is = __rsqrt_v(v); - %is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v) - ; return 0.5 * is * (3. 
- (v * is) * is); - %v_is = fmul <8 x float> %v, %is - %v_is_is = fmul <8 x float> %v_is, %is - %three_sub = fsub <8 x float> , %v_is_is - %is_mul = fmul <8 x float> %is, %three_sub - %half_scale = fmul <8 x float> , %is_mul - ret <8 x float> %half_scale -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; sqrt - -declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone - -define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { - %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0) - ret <8 x float> %call -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -include(`svml.m4') -;; single precision -svml_declare(float,f8,8) -svml_define(float,f8,8,f) - -;; double precision -svml_declare(double,4,4) -svml_define_x(double,4,4,d,8) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -166,7 +162,37 @@ define <8 x float> @__min_varying_float(<8 x float>, ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; horizontal ops +;; double precision min/max + +declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone +declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { + binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { + binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1) + ret <8 x double> %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +include(`svml.m4') +;; single precision +svml_declare(float,f8,8) +svml_define(float,f8,8,f) + +;; double precision +svml_declare(double,4,4) +svml_define_x(double,4,4,d,8) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; mask handling declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone @@ -198,6 +224,9 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal ops / reductions + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal float ops @@ -216,12 +245,36 @@ define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } - define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__max_varying_float, @__max_uniform_float) } -reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal double ops + +declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline { + %v0 = shufflevector <8 x double> %0, <8 x double> undef, + <4 x i32> + %v1 = shufflevector <8 x double> %0, <8 x double> undef, + <4 x i32> + %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) + %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) + %final0 = extractelement <4 x double> %sum1, i32 0 + %final1 = extractelement <4 x double> %sum1, i32 2 + %sum = fadd double %final0, 
%final1 + + ret double %sum +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int8 ops @@ -262,6 +315,7 @@ define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +;; helper functions define <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) nounwind readnone alwaysinline { %s = add <8 x i32> %0, %1 @@ -273,16 +327,15 @@ define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline { ret i32 %s } +;; reduction functions define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__add_varying_int32, @__add_uniform_int32) } - define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_int32, @__min_uniform_int32) } - define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } @@ -295,38 +348,11 @@ define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; horizontal double ops - -declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone - -define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline { - %v0 = shufflevector <8 x double> %0, <8 x double> undef, - <4 x i32> - %v1 = shufflevector <8 x double> %0, <8 x double> undef, - <4 x i32> - %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) - %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) - %final0 = extractelement <4 x double> %sum1, i32 0 - %final1 = extractelement <4 x double> %sum1, i32 2 - %sum = fadd double %final0, %final1 - - ret double %sum -} - -define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline { - reduce8(double, @__min_varying_double, @__min_uniform_double) -} - - -define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline { - reduce8(double, @__max_varying_double, @__max_uniform_double) -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int64 ops +;; helper functions define <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) nounwind readnone alwaysinline { %s = add <8 x i64> %0, %1 @@ -338,6 +364,7 @@ define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline { ret i64 %s } +;; reduction functions define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__add_varying_int64, @__add_uniform_int64) } @@ -362,6 +389,7 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) } +reduce_equal(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts @@ -446,6 +474,10 @@ define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>, ret void } +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 
masked store blend masked_store_blend_8_16_by_8() @@ -517,8 +549,6 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new, ret void } -masked_store_float_double() - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; scatter @@ -529,30 +559,3 @@ gen_scatter(float) gen_scatter(i64) gen_scatter(double) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision sqrt - -declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone - -define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline { - unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0) - ret <8 x double> %ret -} - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision min/max - -declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone -declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone - -define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { - binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1) - ret <8 x double> %ret -} - -define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { - binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1) - ret <8 x double> %ret -} - diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 88be6c59..16177b47 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -58,10 +58,10 @@ declare @__float_to_half_varying( %v) nounwind read declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline { - %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) ; do one N-R iteration to improve precision ; float iv = __rcp_v(v); ; return iv * (2. 
- v * iv); + %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) %v_iv = fmul <4 x float> %0, %call %two_minus = fsub <4 x float> , %v_iv %iv_mul = fmul <4 x float> %call, %two_minus @@ -87,7 +87,7 @@ define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwa } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; sqrt +;; sqrt declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone @@ -154,16 +154,34 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alway declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone -define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { +define <4 x float> @__max_varying_float(<4 x float>, + <4 x float>) nounwind readonly alwaysinline { %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1) ret <4 x float> %call } -define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { +define <4 x float> @__min_varying_float(<4 x float>, + <4 x float>) nounwind readonly alwaysinline { %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1) ret <4 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone { + binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <4 x double> %ret +} + +define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone { + binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <4 x double> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int32 min/max @@ -191,23 +209,7 @@ define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly a } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision min/max - -declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone - -define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone { - binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1) - ret <4 x double> %ret -} - -define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone { - binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1) - ret <4 x double> %ret -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff +;; svml stuff include(`svml.m4') ;; single precision @@ -219,7 +221,7 @@ svml_declare(double,2,2) svml_define_x(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; horizontal ops / reductions +;; mask handling declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone @@ -251,6 +253,55 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal ops / reductions + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal float ops + 
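+;; Editorial sketch (not from the original patch): hadd.ps(v, v) produces
+;; <a0+a1, a2+a3, a0+a1, a2+a3>, so applying it twice in the reduction below
+;; leaves the sum of all four lanes in element 0.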
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone + +define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { + %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) + %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1) + %scalar = extractelement <4 x float> %v2, i32 0 + ret float %scalar +} + +define float @__reduce_min_float(<4 x float>) nounwind readnone alwaysinline { + reduce4(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<4 x float>) nounwind readnone alwaysinline { + reduce4(float, @__max_varying_float, @__max_uniform_float) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal double ops + +define double @__reduce_add_double(<4 x double>) nounwind readnone alwaysinline { + %v0 = shufflevector <4 x double> %0, <4 x double> undef, + <2 x i32> + %v1 = shufflevector <4 x double> %0, <4 x double> undef, + <2 x i32> + %sum = fadd <2 x double> %v0, %v1 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__max_varying_double, @__max_uniform_double) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { @@ -266,6 +317,9 @@ define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { ret i16 %r16 } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + define internal <4 x i16> @__add_varying_i16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline { %r = add <4 x i16> %0, %1 @@ -281,24 +335,11 @@ define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { reduce4(i16, @__add_varying_i16, @__add_uniform_i16) } -declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int32 ops -define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { - %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) - %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1) - %scalar = extractelement <4 x float> %v2, i32 0 - ret float %scalar -} - -define float @__reduce_min_float(<4 x float>) nounwind readnone { - reduce4(float, @__min_varying_float, @__min_uniform_float) -} - -define float @__reduce_max_float(<4 x float>) nounwind readnone { - reduce4(float, @__max_varying_float, @__max_uniform_float) -} - -define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone { +;; reduction functions +define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone alwaysinline { %v1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %m1 = add <4 x i32> %v1, %v @@ -308,44 +349,27 @@ define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone { ret i32 %sum } -define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone { +define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline { reduce4(i32, 
@__min_varying_int32, @__min_uniform_int32)
}

-define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
+define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}

-define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
+define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

-define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
+define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops

-define double @__reduce_add_double(<4 x double>) nounwind readnone {
-  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
-                      <2 x i32> <i32 0, i32 1>
-  %v1 = shufflevector <4 x double> %0, <4 x double> undef,
-                      <2 x i32> <i32 2, i32 3>
-  %sum = fadd <2 x double> %v0, %v1
-  %e0 = extractelement <2 x double> %sum, i32 0
-  %e1 = extractelement <2 x double> %sum, i32 1
-  %m = fadd double %e0, %e1
-  ret double %m
-}
-
-define double @__reduce_min_double(<4 x double>) nounwind readnone {
-  reduce4(double, @__min_varying_double, @__min_uniform_double)
-}
-
-define double @__reduce_max_double(<4 x double>) nounwind readnone {
-  reduce4(double, @__max_varying_double, @__max_uniform_double)
-}
-
-define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
+;; reduction functions
+define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
  %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
                      <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
                      <2 x i32> <i32 2, i32 3>
@@ -357,27 +381,50 @@ define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
  ret i64 %m
}

-define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}

-define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}

-define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

-define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(4)

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+
+masked_load(i8,  1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
+
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store blend
+
+masked_store_blend_8_16_by_4()
+
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone

@@ -444,29 +491,6 @@ define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
  ret void
}

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; masked store
-
-masked_store_blend_8_16_by_4()
-
-gen_masked_store(i8)
-gen_masked_store(i16)
-gen_masked_store(i32)
-gen_masked_store(i64)
-
-masked_store_float_double()
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unaligned loads/loads+broadcasts
-
-
-masked_load(i8,  1)
-masked_load(i16, 2)
-masked_load(i32, 4)
-masked_load(float, 4)
-masked_load(i64, 8)
-masked_load(double, 8)
-
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

From 65ea6fd48af95fb62c56fea9cb2414bd2eaaa8a0 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Wed, 13 Nov 2013 13:15:01 +0400
Subject: [PATCH 131/159] Reasoning to use sse4 bitcode file

---
 builtins.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/builtins.cpp b/builtins.cpp
index 2c9703c6..2afd92d9 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -943,6 +943,15 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        switch (g->target->getVectorWidth()) {
        case 4:
            if (g->target->getDataTypeWidth() == 32) {
+                // Note that for avx1-i32x4 we are using the bitcode file for
+                // sse4-i32x4. This is intentional and good enough: the AVX
+                // target implies the appropriate target-feature attribute,
+                // which forces LLVM to generate AVX code even for SSE4
+                // intrinsics. The only "missing" piece in the sse4 target is
+                // the implementation of __masked_[store|load]_[i32|i64]
+                // using the maskmov instruction, but these intrinsics are
+                // not widely used, so we assume the implementation is good
+                // enough at the moment.
                if (runtime32) {
                    EXPORT_MODULE(builtins_bitcode_sse4_32bit);
                }

From 8f768633ad1f48aeefe948abc3e50fd7c2cd097e Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Wed, 13 Nov 2013 15:07:21 +0400
Subject: [PATCH 132/159] Make perf.py changes work as part of alloy.py

---
 alloy.py | 1 +
 perf.py  | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/alloy.py b/alloy.py
index 51aec82b..478895b7 100755
--- a/alloy.py
+++ b/alloy.py
@@ -495,6 +495,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
            performance.ref = "ispc_ref"
            if current_OS == "Windows":
                performance.ref = "ispc_ref.exe"
+            performance.perf_target = ""
            performance.in_file = "."
+ os.sep + f_date + os.sep + "performance.log" # prepare LLVM 3.3 as newest LLVM need_LLVM = check_LLVM(["3.3"]) diff --git a/perf.py b/perf.py index d7482fab..9875fbdb 100755 --- a/perf.py +++ b/perf.py @@ -392,7 +392,8 @@ def perf(options1, args): print_debug("Okey go go go!\n\n", s, perf_log) # report command line - print_debug("Command line: %s\n" % " ".join(map(str, sys.argv)), s, perf_log) + if __name__ == "__main__": + print_debug("Command line: %s\n" % " ".join(map(str, sys.argv)), s, perf_log) # report used ispc print_debug("Testing ispc: " + ispc_test + "\n", s, perf_log) @@ -424,8 +425,8 @@ def perf(options1, args): command = command[:-1] # handle conditional target argument target_str = "" - if options.target != "": - target_str = " ISPC_IA_TARGETS="+options.target + if options.perf_target != "": + target_str = " ISPC_IA_TARGETS="+options.perf_target if is_windows == False: ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" ex_command = "./test " + command + " >> " + perf_temp + "_test" @@ -510,7 +511,7 @@ if __name__ == "__main__": help='set reference compiler for compare', default="") parser.add_option('-f', '--file', dest='in_file', help='file to save perf output', default="") - parser.add_option('-t', '--target', dest='target', + parser.add_option('-t', '--target', dest='perf_target', help='set ispc target for building benchmarks (both test and ref)', default="") (options, args) = parser.parse_args() perf(options, args) From b8a39a1b26745029540fce48146e7f75a3a5e798 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 13 Nov 2013 16:34:10 +0400 Subject: [PATCH 133/159] minor improvements in examples/common.mk --- examples/common.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/common.mk b/examples/common.mk index 0f375f29..6541fa05 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -20,19 +20,19 @@ ifeq ($(ARCH),x86) COMMA=, ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS))) #$(info multi-target detected: $(ISPC_IA_TARGETS)) - ifneq (,$(findstring sse2-,$(ISPC_IA_TARGETS))) + ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o) endif - ifneq (,$(findstring sse4-,$(ISPC_IA_TARGETS))) + ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o) endif ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o) endif - ifneq (,$(findstring avx1.1-,$(ISPC_IA_TARGETS))) + ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o) endif - ifneq (,$(findstring avx2-,$(ISPC_IA_TARGETS))) + ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o) endif endif From e100040f2809ecd1917b337528217e1548373b7f Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 13 Nov 2013 22:35:37 +0400 Subject: [PATCH 134/159] Fix bug with fail when --target=avx1.1-i32x8,avx2-i32x8 - avx11 is not a valid target anymore, need more complete string --- ispc.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++ ispc.h | 9 ++++++++- module.cpp | 2 +- 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index cb70b879..87c7793e 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -838,6 +838,9 @@ Target::GetTripleString() const { return triple.str(); } +// This function returns string representation of ISA for the purpose of +// mangling. 
It may return any unique string, preferably short, like
+// sse4 or avx.
const char *
Target::ISAToString(ISA isa) {
    switch (isa) {
@@ -873,6 +876,45 @@ Target::GetISAString() const {
}

+// This function returns the string representation of the default target
+// corresponding to an ISA, e.g. sse4-i32x4 for SSE4 and avx1.1-i32x8 for
+// AVX11. This string may be used to initialize a Target.
+const char *
+Target::ISAToTargetString(ISA isa) {
+    switch (isa) {
+#ifdef ISPC_ARM_ENABLED
+    case Target::NEON8:
+        return "neon-8";
+    case Target::NEON16:
+        return "neon-16";
+    case Target::NEON32:
+        return "neon-32";
+#endif
+    case Target::SSE2:
+        return "sse2-i32x4";
+    case Target::SSE4:
+        return "sse4-i32x4";
+    case Target::AVX:
+        return "avx1-i32x8";
+    case Target::AVX11:
+        return "avx1.1-i32x8";
+    case Target::AVX2:
+        return "avx2-i32x8";
+    case Target::GENERIC:
+        return "generic-4";
+    default:
+        FATAL("Unhandled target in ISAToTargetString()");
+    }
+    return "";
+}
+
+
+const char *
+Target::GetISATargetString() const {
+    return ISAToTargetString(m_isa);
+}
+
+
static bool
lGenericTypeLayoutIndeterminate(llvm::Type *type) {
    if (type->isPrimitiveType() || type->isIntegerTy())
diff --git a/ispc.h b/ispc.h
index c74ff347..2207cdde 100644
--- a/ispc.h
+++ b/ispc.h
@@ -214,9 +214,16 @@ public:
    /** Convert ISA enum to string */
    static const char *ISAToString(Target::ISA isa);

-    /** Returns a string like "avx" encoding the target. */
+    /** Returns a string like "avx" encoding the target. Good for mangling. */
    const char *GetISAString() const;

+    /** Convert ISA enum to the default target string for that ISA */
+    static const char *ISAToTargetString(Target::ISA isa);
+
+    /** Returns a string like "avx1.1-i32x8" encoding the target.
+        This may be used for Target initialization. */
+    const char *GetISATargetString() const;
+
    /** Returns the size of the given type */
    llvm::Value *SizeOf(llvm::Type *type,
                        llvm::BasicBlock *insertAtEnd);

diff --git a/module.cpp b/module.cpp
index 41861a2d..1afc5a0b 100644
--- a/module.cpp
+++ b/module.cpp
@@ -2443,7 +2443,7 @@ Module::CompileAndOutput(const char *srcFile,
        int i = 0;
        const char *firstISA;
        while (i < Target::NUM_ISAS && firstTargetMachine == NULL) {
-            firstISA = Target::ISAToString((Target::ISA) i);
+            firstISA = Target::ISAToTargetString((Target::ISA) i);
            firstTargetMachine = targetMachines[i++];
        }
        Assert(firstTargetMachine != NULL);

From 801f78f8a8c079526863313d420c19060612b33f Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Wed, 13 Nov 2013 22:48:14 +0400
Subject: [PATCH 135/159] Rebuild *.ispc when necessary

---
 examples/common.mk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/common.mk b/examples/common.mk
index 6541fa05..04a566bb 100644
--- a/examples/common.mk
+++ b/examples/common.mk
@@ -84,9 +84,9 @@ objs/%.o: %.c dirs $(ISPC_HEADER)
objs/%.o: ../%.cpp dirs
	$(CXX) $< $(CXXFLAGS) -c -o $@

-objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h
+objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc dirs
	$(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h

objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)

From 42e181112af77301b263f38eaa0cec160fcfe8ce Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Thu, 14 Nov 2013 16:21:30 +0400
Subject: [PATCH 136/159] Add avx1-i32x4 to the list of
supported targets --- ispc.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ispc.cpp b/ispc.cpp index 87c7793e..de01fdfb 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -801,6 +801,7 @@ Target::SupportedTargets() { #endif "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " + "avx1-i32x4, " "avx1-i32x8, avx1-i32x16, avx1-i64x4, " "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 " "avx2-i32x8, avx2-i32x16, avx2-i64x4, " From 131ff503339b1bac713109bd0acced8d526aef72 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 15 Nov 2013 22:09:13 +0400 Subject: [PATCH 137/159] Adding avx1-i32x4 to alloy.py testing --- alloy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alloy.py b/alloy.py index 478895b7..21e428de 100755 --- a/alloy.py +++ b/alloy.py @@ -214,7 +214,7 @@ def check_targets(): try_do_LLVM("build check_ISA", "cl check_isa.cpp", True) SSE2 = ["sse2-i32x4", "sse2-i32x8"] SSE4 = ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] - AVX = ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] + AVX = ["avx1-i32x4", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] AVX11 = ["avx1.1-i32x8","avx1.1-i32x16","avx1.1-i64x4"] AVX2 = ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] targets = [["AVX2", AVX2, False], ["AVX1.1", AVX11, False], ["AVX", AVX, False], ["SSE4", SSE4, False], ["SSE2", SSE2, False]] @@ -251,7 +251,7 @@ def check_targets(): if targets[3][2] == False and "wsm" in f_lines[i]: answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] if targets[2][2] == False and "snb" in f_lines[i]: - answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] + answer_sde = answer_sde + [["-snb", "avx1-i32x4"], ["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] if targets[1][2] == False and "ivb" in f_lines[i]: answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"], ["-ivb", "avx1.1-i64x4"]] if targets[0][2] == False and "hsw" in f_lines[i]: From 953e467a85b43d682326df149e3bf629d180f42e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 18 Nov 2013 03:39:09 +0400 Subject: [PATCH 138/159] fail_db.txt update on Linux --- fail_db.txt | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 160 insertions(+), 6 deletions(-) diff --git a/fail_db.txt b/fail_db.txt index da77cac3..43f3bdad 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -292,12 +292,6 @@ ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * @@ -462,3 +456,163 @@ .\tests\switch-8.ispc 
compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * .\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * .\tests\reduce-equal-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * 
+./tests/atomics-13.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/avg-down-int16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/avg-down-uint16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/avg-up-int16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/avg-up-uint16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/broadcast-2.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/half-2.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/half.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/idiv.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/load-int16-1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-add-int16-1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-add-int16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/rotate-6.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/shuffle-4.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/shuffle2-11.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/shuffle2-7.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/shuffle2-9.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/avg-down-int16.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/avg-down-uint16.ispc runfail 
x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/avg-up-int16.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/avg-up-uint16.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/broadcast-2.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/half-2.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/half.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/idiv.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/load-int16-1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-add-int16-1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-add-int16.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/rotate-6.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/shuffle-4.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/shuffle2-11.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/shuffle2-7.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/shuffle2-9.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 
* +./tests/reduce-equal-5.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O0 * From 4579d339ea5898e8ec7dc807cec0d7ab1fa11093 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Mon, 18 Nov 2013 13:44:59 +0400 Subject: [PATCH 139/159] patch for LLVM 3.3 and test correction at avx2 --- .../3_3_PR17764_reverse_operands_avx2.patch | 51 
+++++++++++++++++++ tests/test-141.ispc | 5 +- 2 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 llvm_patches/3_3_PR17764_reverse_operands_avx2.patch diff --git a/llvm_patches/3_3_PR17764_reverse_operands_avx2.patch b/llvm_patches/3_3_PR17764_reverse_operands_avx2.patch new file mode 100644 index 00000000..2719633a --- /dev/null +++ b/llvm_patches/3_3_PR17764_reverse_operands_avx2.patch @@ -0,0 +1,51 @@ +From 13c33dd2931ae9d9c5c9f142677f025281fbefca Mon Sep 17 00:00:00 2001 +From: Michael Liao +Date: Fri, 1 Nov 2013 11:08:08 -0700 +Subject: [PATCH] Fix PR17764 + +- %ret = select %mask, %v1, %v2 is equivalent to + + %ret = %mask ? %v1 : %v2 + + but VPBLENDVB %mask, %v1, %v2, %ret (operands are in Intel assembly + order) is equivalent to + + %ret = %mask ? %v2 : %v1 +--- + lib/Target/X86/X86InstrSSE.td | 2 +- + test/CodeGen/X86/pr17764.ll | 10 ++++++++++ + 2 files changed, 11 insertions(+), 1 deletion(-) + create mode 100644 test/CodeGen/X86/pr17764.ll + +diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td +index 7cae485..bac88f9 100644 +--- lib/Target/X86/X86InstrSSE.td ++++ lib/Target/X86/X86InstrSSE.td +@@ -6965,7 +6965,7 @@ let Predicates = [HasAVX] in { + let Predicates = [HasAVX2] in { + def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), + (v32i8 VR256:$src2))), +- (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>; ++ (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2), + (imm:$mask))), + (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; +diff --git a/test/CodeGen/X86/pr17764.ll b/test/CodeGen/X86/pr17764.ll +new file mode 100644 +index 0000000..7a3fd6d +--- /dev/null ++++ test/CodeGen/X86/pr17764.ll +@@ -0,0 +1,10 @@ ++; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s ++ ++define <16 x i16> @foo(<16 x i1> %mask, <16 x i16> %x, <16 x i16> %y) { ++ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y ++ ret <16 x i16> %ret ++} ++ ++; CHECK: foo ++; CHECK: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ++; CHECK: ret +-- +1.8.1.2 + diff --git a/tests/test-141.ispc b/tests/test-141.ispc index a533b605..b69be1fa 100644 --- a/tests/test-141.ispc +++ b/tests/test-141.ispc @@ -3,8 +3,9 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - float a = aFOO[programIndex]; - RET[programIndex] = (exp(-log(1/a)) - a) < 1e-7 ? 1 : 0; + float a = aFOO[programIndex]; + // calculation error 1e-6 is the same as in icc + RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0; } export void result(uniform float RET[4]) { From 97298eb1121195fba432f0c552fc8cc8b5e92718 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 19 Nov 2013 17:37:52 +0400 Subject: [PATCH 140/159] multiple targets in perf.py --- perf.py | 143 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 94 insertions(+), 49 deletions(-) diff --git a/perf.py b/perf.py index 9875fbdb..bb1f5b91 100755 --- a/perf.py +++ b/perf.py @@ -177,7 +177,10 @@ def geomean(par): l = len(par) for i in range(l): temp = temp * par[i] - temp = temp ** (1.0/l) + if l != 0: + temp = temp ** (1.0/l) + else: + temp = 0 return round(temp, 2) #takes an answer struct and print it. 
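# A quick standalone illustration (not part of the patch) of the guarded
# geomean() above: the l != 0 check avoids a ZeroDivisionError in the
# exponent 1.0/l when a target produced no results.
#
#     def geomean(par):
#         temp = 1
#         l = len(par)
#         for i in range(l):
#             temp = temp * par[i]
#         if l != 0:
#             temp = temp ** (1.0 / l)
#         else:
#             temp = 0
#         return round(temp, 2)
#
#     geomean([2.0, 8.0])   # -> 4.0 (geometric mean)
#     geomean([])           # -> 0 instead of raising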
@@ -189,18 +192,30 @@ def geomean(par): #test[4] - list of absolute results with tasks #test[5] - list of absolute time without ISPC (serial) #test[1..4] may be empty -def print_answer(answer): +def print_answer(answer, target_number): filelist = [] print_debug("--------------------------------------------------------------------------\n", s, perf_log) print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + " ISPC time: ISPC + tasks time: serial:\n", s, perf_log) - filelist.append("test name,ISPC speedup,diff," + - "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + if target_number > 1: + if options.output == "": + options.output = "targets.csv" + filelist.append("test name,ISPC speedup" + "," * target_number + "ISPC + tasks speedup\n") + filelist.append("," + options.perf_target + "," + options.perf_target + "\n") + else: + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") max_t = [0,0,0,0,0] diff_t = [0,0,0,0,0] - geomean_t = [0,0,0,0,0] - list_of_max = [[],[],[],[],[]] + geomean_t = [] + list_of_max = [] + for i1 in range(target_number): + geomean_t.append([0,0,0,0,0]) + list_of_max.append([[],[],[],[],[]]) list_of_compare = [[],[],[],[],[],[]] + target_k = 0 + temp_str_1 = "" + temp_str_2 = "" for i in range(len(answer)): list_of_compare[0].append(answer[i][0]) for t in range(1,6): @@ -215,7 +230,7 @@ def print_answer(answer): mm = min(answer[i][t]) list_of_compare[t].append(mm) max_t[t-1] = '%.2f' % mm - list_of_max[t-1].append(mm) + list_of_max[i % target_number][t-1].append(mm) diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) print_debug("%s:\n" % answer[i][0], s, perf_log) print_debug("\t\tmax:\t%5s\t\t%10s\t|min:%10s\t%10s\t%10s\n" % @@ -227,17 +242,37 @@ def print_answer(answer): max_t[t] = "" if diff_t[t] == "n/a": diff_t[t] = "" - filelist.append(answer[i][0] + "," + + if target_number > 1: + if target_k == 0: + temp_str_1 = answer[i][0] + "," + temp_str_2 = "" + temp_str_1 += max_t[0] + "," + temp_str_2 += max_t[1] + "," + target_k = target_k + 1 + if target_k == target_number: + filelist.append(temp_str_1 + temp_str_2[:-1] + "\n") + target_k = 0 + else: + filelist.append(answer[i][0] + "," + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + max_t[4] + "," + diff_t[4] + "\n") for i in range(0,5): - geomean_t[i] = geomean(list_of_max[i]) + for i1 in range(target_number): + geomean_t[i1][i] = geomean(list_of_max[i1][i]) print_debug("---------------------------------------------------------------------------------\n", s, perf_log) print_debug("Geomean:\t\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % - (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) - filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) - + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") + (geomean_t[0][0], geomean_t[0][1], geomean_t[0][2], geomean_t[0][3], geomean_t[0][4]), s, perf_log) + if target_number > 1: + temp_str_1 = "Geomean," + temp_str_2 = "" + for i in range(target_number): + temp_str_1 += str(geomean_t[i][0]) + "," + temp_str_2 += str(geomean_t[i][1]) + "," + filelist.append(temp_str_1 + temp_str_2[:-1] + "\n") + else: + filelist.append("Geomean," + str(geomean_t[0][0]) + ",," + str(geomean_t[0][1]) + + ",," + str(geomean_t[0][2]) + ",," + str(geomean_t[0][3]) + ",," + 
str(geomean_t[0][4]) + "\n") print_file(filelist) return list_of_compare @@ -409,8 +444,6 @@ def perf(options1, args): while i < length-2: # we read name of test print_debug("%s" % lines[i], s, perf_log) - test = [lines[i][:-1],[],[],[],[],[]] - test_ref = [lines[i][:-1],[],[],[],[],[]] # read location of test folder = lines[i+1] folder = folder[:-1] @@ -424,41 +457,51 @@ def perf(options1, args): command = lines[i+2] command = command[:-1] # handle conditional target argument - target_str = "" + target_str_temp = "" + perf_targets = [""] + target_number = 1 if options.perf_target != "": - target_str = " ISPC_IA_TARGETS="+options.perf_target - if is_windows == False: - ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" - ex_command = "./test " + command + " >> " + perf_temp + "_test" - bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+target_str+" >> "+build_log+" 2>> "+build_log - bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+target_str+" >> "+build_log+" 2>> "+build_log - re_command = "make clean >> "+build_log - else: - ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" - ex_command = "x64\\Release1\\test.exe " + command + " >> " + perf_temp + "_test" - bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /p:ISPC_compiler=ispc_ref /t:rebuild >> " + build_log - bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc /t:rebuild >> " + build_log - re_command = "msbuild /t:clean >> " + build_log - commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] - # parsing config parameters - next_line = lines[i+3] - if next_line[0] == "!": # we should take only one part of test output - R = next_line.split(' ') - c1 = int(R[1]) #c1 is a number of string which we want to use in test output - c2 = int(R[2]) #c2 is total number of strings in test output - i = i+1 - else: - c1 = 1 - c2 = 1 - next_line = lines[i+3] - if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False) - i = i+1 - else: #we run this test and append it's result to answer structure - run_test(commands, c1, c2, test, test_ref, True) - answer.append(test) - answer_ref.append(test_ref) - + perf_targets = options.perf_target.split(',') + target_str_temp = " ISPC_IA_TARGETS=" + target_number = len(perf_targets) + temp = 0 + for target_i in range(target_number): + test = [lines[i][:-1],[],[],[],[],[]] + test_ref = [lines[i][:-1],[],[],[],[],[]] + target_str = target_str_temp + perf_targets[target_i] + if is_windows == False: + ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" + ex_command = "./test " + command + " >> " + perf_temp + "_test" + bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+target_str+" >> "+build_log+" 2>> "+build_log + bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+target_str+" >> "+build_log+" 2>> "+build_log + re_command = "make clean >> "+build_log + else: + ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" + ex_command = "x64\\Release1\\test.exe " + command + " >> " + perf_temp + "_test" + bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref 
/p:ISPC_compiler=ispc_ref /t:rebuild >> " + build_log + bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc /t:rebuild >> " + build_log + re_command = "msbuild /t:clean >> " + build_log + commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] + # parsing config parameters + next_line = lines[i+3] + if next_line[0] == "!": # we should take only one part of test output + R = next_line.split(' ') + c1 = int(R[1]) #c1 is a number of string which we want to use in test output + c2 = int(R[2]) #c2 is total number of strings in test output + temp = 1 + else: + c1 = 1 + c2 = 1 + next_line = lines[i+3] + if next_line[0] == "^": + temp = 1 + if next_line[0] == "^" and target_number == 1: #we should concatenate result of this test with previous one + run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False) + else: #we run this test and append it's result to answer structure + run_test(commands, c1, c2, test, test_ref, True) + answer.append(test) + answer_ref.append(test_ref) + i = i + temp # preparing next loop iteration os.chdir(pwd1) i+=4 @@ -468,8 +511,10 @@ def perf(options1, args): common.remove_if_exists(perf_temp+"_ref") #print collected answer + if target_number > 1: + s = True print_debug("\n\nTEST COMPILER:\n", s, perf_log) - A = print_answer(answer) + A = print_answer(answer, target_number) if options.ref != "": print_debug("\n\nREFERENCE COMPILER:\n", s, perf_log) B = print_answer(answer_ref) From 5722d17924cae2f5c972f211312f5bb7787794bc Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 19 Nov 2013 21:17:54 +0400 Subject: [PATCH 141/159] fail_db.txt update on Linux with new passes --- fail_db.txt | 44 -------------------------------------------- 1 file changed, 44 deletions(-) diff --git a/fail_db.txt b/fail_db.txt index 43f3bdad..32917815 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -287,18 +287,12 @@ ./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * -./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 * ./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * @@ -489,7 +483,6 @@ ./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 
3.3 clang++3.3 -O0 * ./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * ./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * ./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * ./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * ./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * @@ -518,40 +511,6 @@ ./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * ./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * ./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/avg-down-int16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/avg-down-uint16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/avg-up-int16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/avg-up-uint16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/broadcast-2.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/half-2.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/half.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/idiv.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/load-int16-1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/reduce-add-int16-1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/reduce-add-int16.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/rotate-6.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/shuffle-4.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/shuffle2-11.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/shuffle2-7.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/shuffle2-9.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/avg-down-int16.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/avg-down-uint16.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/avg-up-int16.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/avg-up-uint16.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/broadcast-2.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/half-2.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/half.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/idiv.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/load-int16-1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/reduce-add-int16-1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/reduce-add-int16.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/rotate-6.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/shuffle-4.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/shuffle2-11.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/shuffle2-7.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/shuffle2-9.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 * -./tests/test-141.ispc 
runfail x86-64 avx2-i32x16 Linux LLVM 3.3 clang++3.3 -O0 *
./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 *
@@ -585,7 +544,6 @@
./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
-./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
@@ -614,5 +572,3 @@
./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
-./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O0 *
-./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 clang++3.3 -O0 *

From 40da411fa5f55e40ad0c92afce5c8c725f68fb41 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Wed, 20 Nov 2013 17:22:50 +0400
Subject: [PATCH 142/159] Fix task system diagnostic to report the real reason
 of the semaphore allocation failure

---
 examples/tasksys.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp
index c9c2fa7b..e0149952 100644
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -696,7 +696,7 @@ InitTaskSystem() {
    sprintf(name, "ispc_task.%d", (int)getpid());
    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
    if (!workerSemaphore) {
-        fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
+        fprintf(stderr, "Error creating semaphore (%s): %s\n", name, strerror(errno));
        exit(1);
    }

From 5531586c356962bf2c53fc0dae932db52ad356c0 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Wed, 20 Nov 2013 19:19:15 +0400
Subject: [PATCH 143/159] Fix for existing semaphore problem

---
 examples/tasksys.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp
index e0149952..b97c4bba 100644
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -693,9 +693,19 @@ InitTaskSystem() {
    }

    char name[32];
-    sprintf(name, "ispc_task.%d", (int)getpid());
-    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
-    if (!workerSemaphore) {
+    bool success = false;
+    srand(time(NULL));
+    for (int i = 0; i < 10; i++) {
+        sprintf(name, "ispc_task.%d.%d", (int)getpid(), (int)rand());
+        workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
+        if (workerSemaphore != SEM_FAILED) {
+            success = true;
+            break;
+        }
+        fprintf(stderr, "Failed to create %s\n", name);
+    }
+
+    if (!success) {
        fprintf(stderr, "Error creating semaphore (%s): %s\n", name, strerror(errno));
        exit(1);
    }

From 924858509d5e1b29e18a8f429e05ac30da4af910 Mon Sep 17 00:00:00 2001
From: Ilia Filippov
Date: Thu, 21 Nov 2013 19:05:35 +0400
Subject: [PATCH 144/159] checking targets in perf.py

---
 perf.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/perf.py b/perf.py
index bb1f5b91..1b9de452 100755
--- a/perf.py
+++ b/perf.py
@@ -332,6 +332,15 @@ def perf(options1, args):
    if
is_windows:
        pwd1 = "..\\..\\"

+    if options.perf_target != "":
+        test_only_r = " sse2-i32x4 sse2-i32x8 sse4-i32x4 sse4-i32x8 sse4-i16x8 \
+          sse4-i8x16 avx1-i32x8 avx1-i32x16 avx1-i64x4 avx1.1-i32x8 \
+          avx1.1-i32x16 avx1.1-i64x4 avx2-i32x8 avx2-i32x16 avx2-i64x4 "
+        test_only = options.perf_target.split(" ")
+        for iterator in test_only:
+            if not (" " + iterator + " " in test_only_r):
+                error("unknown option for target: " + iterator, 1)
+
    # check if cpu usage is low now
    cpu_percent = cpu_check()
    if cpu_percent > 20:

From 3fd9d5a025714448f09bc897c21ef36178b465df Mon Sep 17 00:00:00 2001
From: Ilia Filippov
Date: Thu, 21 Nov 2013 19:09:43 +0400
Subject: [PATCH 145/159] support of LLVM 3.5

---
 Makefile     |  4 ++++
 cbackend.cpp |  2 +-
 ctx.cpp      |  2 +-
 ispc.cpp     | 24 ++++++++++++------------
 ispc.h       |  4 ++--
 main.cpp     |  6 ++++--
 opt.cpp      |  4 ++--
 type.cpp     |  2 +-
 8 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/Makefile b/Makefile
index 9d39baa4..f6d7af38 100644
--- a/Makefile
+++ b/Makefile
@@ -83,6 +83,10 @@ ifeq ($(LLVM_VERSION),LLVM_3_4)
  ISPC_LIBS += -lcurses
endif

+ifeq ($(LLVM_VERSION),LLVM_3_5)
+  ISPC_LIBS += -lcurses
+endif
+
ifeq ($(ARCH_OS),Linux)
  ISPC_LIBS += -ldl
endif
diff --git a/cbackend.cpp b/cbackend.cpp
index 5a1ef705..8535653f 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -2196,7 +2196,7 @@ bool CWriter::doInitialization(llvm::Module &M) {
#endif
  TAsm = new CBEMCAsmInfo();
  MRI = new llvm::MCRegisterInfo();
-#if defined(LLVM_3_4)
+#if defined(LLVM_3_4) || defined(LLVM_3_5)
  TCtx = new llvm::MCContext(TAsm, MRI, NULL);
#else
  TCtx = new llvm::MCContext(*TAsm, *MRI, NULL);
diff --git a/ctx.cpp b/ctx.cpp
index c50d22f9..c1a7e61a 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -348,7 +348,7 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
        AssertPos(currentPos, diSubprogramType.Verify());
    }

-#if defined(LLVM_3_4)
+#if defined(LLVM_3_4) || defined(LLVM_3_5)
    Assert(diSubprogramType.isCompositeType());
    llvm::DICompositeType diSubprogramType_n =
        static_cast<llvm::DICompositeType>(diSubprogramType);
diff --git a/ispc.cpp b/ispc.cpp
index de01fdfb..36d31580 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -312,7 +312,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 4;
        this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
-#if defined(LLVM_3_4)
+#if defined(LLVM_3_4) || defined(LLVM_3_5)
            ",-sse4.1,-sse4.2"
#else
            ",-sse41,-sse42"
@@ -328,7 +328,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 8;
        this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
-#if defined(LLVM_3_4)
+#if defined(LLVM_3_4) || defined(LLVM_3_5)
            ",-sse4.1,-sse4.2"
#else
            ",-sse41,-sse42"
@@ -345,7 +345,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 4;
        // TODO: why not sse42 and popcnt?
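        // The version checks in this constructor exist because LLVM 3.4
        // renamed several x86 subtarget features: "+sse41"/"+sse42" became
        // "+sse4.1"/"+sse4.2", and "+rdrand" became "+rdrnd". LLVM 3.5 keeps
        // the 3.4 spellings, so each #if simply adds LLVM_3_5 to the
        // LLVM_3_4 branch.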
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" @@ -362,7 +362,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" @@ -377,7 +377,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" @@ -392,7 +392,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" @@ -512,7 +512,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" @@ -533,7 +533,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" @@ -553,7 +553,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" @@ -574,7 +574,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" @@ -599,7 +599,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" @@ -623,7 +623,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" diff --git a/ispc.h b/ispc.h index 2207cdde..b319d656 100644 --- a/ispc.h +++ b/ispc.h @@ -40,8 +40,8 @@ #define ISPC_VERSION "1.5.1dev" -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) -#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) +#error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) diff --git a/main.cpp b/main.cpp index 21a47de8..99497af5 100644 --- 
a/main.cpp +++ b/main.cpp @@ -70,6 +70,8 @@ lPrintVersion() { "3.3" #elif defined(LLVM_3_4) "3.4" +#elif defined(LLVM_3_5) + "3.5" #else #error "Unhandled LLVM version" #endif @@ -164,7 +166,7 @@ devUsage(int ret) { printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n"); printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n"); printf(" [--debug-phase=]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n"); -#ifdef LLVM_3_4 +#if defined(LLVM_3_4) || defined(LLVM_3_5) printf(" [--debug-ir=]\t\tSet optimization phase to generate debugIR after it\n"); #endif printf(" [--off-phase=]\t\tSwitch off optimization phases. --off-phase=first,210:220,300,305,310:last\n"); @@ -547,7 +549,7 @@ int main(int Argc, char *Argv[]) { "away or introduce the new ones.\n"); g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase=")); } -#ifdef LLVM_3_4 +#if defined(LLVM_3_4) || defined(LLVM_3_5) else if (strncmp(argv[i], "--debug-ir=", 11) == 0) { g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir=")); } diff --git a/opt.cpp b/opt.cpp index ce84744a..8df0f4fe 100644 --- a/opt.cpp +++ b/opt.cpp @@ -63,7 +63,7 @@ #include #include #endif -#if defined (LLVM_3_4) +#if defined (LLVM_3_4) || defined(LLVM_3_5) #include #endif #include @@ -441,7 +441,7 @@ DebugPassManager::add(llvm::Pass * P, int stage = -1) { number, P->getPassName()); PM.add(CreateDebugPass(buf)); } -#ifdef LLVM_3_4 +#if defined(LLVM_3_4) || defined(LLVM_3_5) if (g->debugIR == number) { // adding generating of LLVM IR debug after optimization char buf[100]; diff --git a/type.cpp b/type.cpp index 5fa1845b..08013af0 100644 --- a/type.cpp +++ b/type.cpp @@ -2879,7 +2879,7 @@ FunctionType::GetDIType(llvm::DIDescriptor scope) const { for (int i = 0; i < GetNumParameters(); ++i) { const Type *t = GetParameterType(i); if (t == NULL) -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) return llvm::DICompositeType(); #else return llvm::DIType(); From 18f90e63391101d414b1ecca7a5ee7430e621e62 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 22 Nov 2013 17:06:19 +0400 Subject: [PATCH 146/159] fix of perf.py --- perf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf.py b/perf.py index 1b9de452..2d83475f 100755 --- a/perf.py +++ b/perf.py @@ -336,7 +336,7 @@ def perf(options1, args): test_only_r = " sse2-i32x4 sse2-i32x8 sse4-i32x4 sse4-i32x8 sse4-i16x8 \ sse4-i8x16 avx1-i32x8 avx1-i32x16 avx1-i64x4 avx1.1-i32x8 \ avx1.1-i32x16 avx1.1-i64x4 avx2-i32x8 avx2-i32x16 avx2-i64x4 " - test_only = options.perf_target.split(" ") + test_only = options.perf_target.split(",") for iterator in test_only: if not (" " + iterator + " " in test_only_r): error("unknow option for target: " + iterator, 1) From 8b972f2ed6b3c3f4a775f72bcc512f337fb60084 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 26 Nov 2013 17:08:06 +0400 Subject: [PATCH 147/159] Changing error to warning: mismatch in size/layout of global variable --- module.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module.cpp b/module.cpp index 1afc5a0b..6006ea34 100644 --- a/module.cpp +++ b/module.cpp @@ -2104,7 +2104,7 @@ lAddExtractedGlobals(llvm::Module *module, // example, this happens with varying globals if we compile // to different vector widths. if (gv2->getType() != gv->getType()) - Error(rgi.pos, "Mismatch in size/layout of global " + Warning(rgi.pos, "Mismatch in size/layout of global " "variable \"%s\" with different targets. 
" "Globals must not include \"varying\" types or arrays " "with size based on programCount when compiling to " From 935800d7f69de46c1cda1a0055d85f01a0709ada Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Mon, 25 Nov 2013 13:31:26 +0400 Subject: [PATCH 148/159] making common.props --- examples/aobench/aobench.vcxproj | 173 ++-------------- examples/common.props | 149 ++++++++++++++ examples/deferred/deferred_shading.vcxproj | 163 +-------------- examples/mandelbrot/mandelbrot.vcxproj | 163 +-------------- .../mandelbrot_tasks/mandelbrot_tasks.vcxproj | 161 +-------------- examples/noise/noise.vcxproj | 193 +++--------------- examples/options/options.vcxproj | 159 +-------------- examples/rt/rt.vcxproj | 163 +-------------- examples/sort/sort.vcxproj | 157 +------------- examples/stencil/stencil.vcxproj | 147 +------------ examples/volume_rendering/volume.vcxproj | 157 +------------- 11 files changed, 231 insertions(+), 1554 deletions(-) create mode 100644 examples/common.props diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj index a5b354ce..270af3b1 100644 --- a/examples/aobench/aobench.vcxproj +++ b/examples/aobench/aobench.vcxproj @@ -1,28 +1,12 @@  - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - - - - + + + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} + Win32Proj + aobench + ispc + Document @@ -40,142 +24,9 @@ $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} - Win32Proj - aobench - ispc - - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - ao - - - true - $(ExecutablePath);$(ProjectDir)..\.. 
- ao - - - false - $(ProjectDir)..\..;$(ExecutablePath) - ao - - - false - $(ProjectDir)..\..;$(ExecutablePath) - ao - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - + + + + + diff --git a/examples/common.props b/examples/common.props new file mode 100644 index 00000000..6c1e9596 --- /dev/null +++ b/examples/common.props @@ -0,0 +1,149 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + $(ProjectDir)..\..;$(ExecutablePath) + + + true + $(ProjectDir)..\..;$(ExecutablePath) + + + false + $(ProjectDir)..\..;$(ExecutablePath) + + + false + $(ProjectDir)..\..;$(ExecutablePath) + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + + diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj index 94e38540..dc90cec5 100755 --- a/examples/deferred/deferred_shading.vcxproj +++ b/examples/deferred/deferred_shading.vcxproj @@ -1,161 +1,12 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + {87f53c53-957e-4e91-878a-bc27828fb9eb} Win32Proj - mandelbrot + deferred ispc - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - - - - Document @@ -173,7 +24,11 @@ $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - + + + + + + + diff --git 
a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj index 1b6f1281..6db650a1 100644 --- a/examples/mandelbrot/mandelbrot.vcxproj +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -1,176 +1,31 @@  - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} Win32Proj mandelbrot ispc - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - + + + + diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index fbebdc32..41c9c75d 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -1,163 +1,12 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj mandelbrot_tasks ispc - - - 
Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - - Document @@ -175,7 +24,9 @@ $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - + + + + + diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj index 01456625..dbb309dd 100644 --- a/examples/noise/noise.vcxproj +++ b/examples/noise/noise.vcxproj @@ -1,176 +1,31 @@ - - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} - Win32Proj - noise + + + + {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} + Win32Proj + noise ispc - - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - - - - Document + + + + Document $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + 
$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - + + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + + + + + + diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj index 77fb9353..e61f2d80 100644 --- a/examples/options/options.vcxproj +++ b/examples/options/options.vcxproj @@ -1,163 +1,12 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} Win32Proj options ispc - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - Fast - - - Console - true - true - true - - - - - - - Document @@ -176,9 +25,9 @@ + + + - - - diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj index 19d40192..3fbc6874 100644 --- a/examples/rt/rt.vcxproj +++ b/examples/rt/rt.vcxproj @@ -1,171 +1,21 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} Win32Proj rt ispc - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - 
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - Document - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h @@ -175,7 +25,4 @@ $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(Target - - - diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj index b37eab1c..7bf90aa6 100644 --- a/examples/sort/sort.vcxproj +++ b/examples/sort/sort.vcxproj @@ -1,159 +1,12 @@  - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2} Win32Proj sort ispc - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - - Document @@ -171,7 +24,9 @@ 
$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - + + + + + diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj index a96a187d..2814d5a1 100644 --- a/examples/stencil/stencil.vcxproj +++ b/examples/stencil/stencil.vcxproj @@ -1,154 +1,12 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + {2ef070a1-f62f-4e6a-944b-88d140945c3c} Win32Proj rt ispc - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - Document @@ -175,7 +33,4 @@ $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(Target - - - diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index d3594b98..78f5ba86 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -1,159 +1,12 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + {dee5733a-e93e-449d-9114-9bffcaeb4df9} Win32Proj volume ispc - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - - Document @@ -171,7 +24,9 @@ $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - + + + + + From f3ff1fcbebd9f8c8f6ba573d5a1f140c744d3e41 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Mon, 25 Nov 2013 23:37:42 +0400 Subject: [PATCH 149/159] supporting targets in perf windows --- examples/aobench/aobench.vcxproj | 22 +++------------- examples/common.props | 23 ++++++++++++++++ examples/deferred/deferred_shading.vcxproj | 22 +++------------- 
examples/mandelbrot/mandelbrot.vcxproj | 22 +++------------- .../mandelbrot_tasks/mandelbrot_tasks.vcxproj | 22 +++------------- examples/noise/noise.vcxproj | 22 +++------------- examples/options/options.vcxproj | 22 +++------------- examples/rt/rt.vcxproj | 18 +++---------- examples/sort/sort.vcxproj | 22 +++------------- examples/stencil/stencil.vcxproj | 26 +++---------------- examples/volume_rendering/volume.vcxproj | 22 +++------------- perf.py | 14 ++++++---- run_tests.py | 2 +- 13 files changed, 63 insertions(+), 196 deletions(-) diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj index 270af3b1..c46ee41a 100644 --- a/examples/aobench/aobench.vcxproj +++ b/examples/aobench/aobench.vcxproj @@ -1,29 +1,13 @@  - {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} Win32Proj aobench - ispc + ao + sse2,sse4,avx1-i32x8 - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + diff --git a/examples/common.props b/examples/common.props index 6c1e9596..7bf37005 100644 --- a/examples/common.props +++ b/examples/common.props @@ -143,6 +143,29 @@ true + + ispc + $(default_targets) + $(TargetDir)$(ISPC_file).obj + $(Target_out);$(TargetDir)$(ISPC_file)_sse2.obj + $(Target_out);$(TargetDir)$(ISPC_file)_sse4.obj + $(Target_out);$(TargetDir)$(ISPC_file)_avx.obj + $(Target_out);$(TargetDir)$(ISPC_file)_avx11.obj + $(Target_out);$(TargetDir)$(ISPC_file)_avx2.obj + + + + Document + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) + $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) + $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(Target_out);$(TargetDir)%(Filename)_ispc.h + + diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj index dc90cec5..cd361b26 100755 --- a/examples/deferred/deferred_shading.vcxproj +++ b/examples/deferred/deferred_shading.vcxproj @@ -1,29 +1,13 @@ - 
{87f53c53-957e-4e91-878a-bc27828fb9eb} Win32Proj deferred - ispc + kernels + sse2,sse4-x2,avx1-x2 - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj index 6db650a1..e7703ad0 100644 --- a/examples/mandelbrot/mandelbrot.vcxproj +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -1,29 +1,13 @@  - {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} Win32Proj mandelbrot - ispc + mandelbrot + sse2,sse4-x2,avx1-x2 - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index 41c9c75d..f8b8cfcb 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -1,30 +1,14 @@ - {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj mandelbrot_tasks - ispc + mandelbrot_tasks + sse2,sse4-x2,avx1-x2 + - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h 
--target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj index dbb309dd..7adc57f3 100644 --- a/examples/noise/noise.vcxproj +++ b/examples/noise/noise.vcxproj @@ -1,30 +1,14 @@ - {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} Win32Proj noise - ispc + noise + sse2,sse4,avx1-x2 + - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj index e61f2d80..af336aa1 100644 --- a/examples/options/options.vcxproj +++ b/examples/options/options.vcxproj @@ -1,29 +1,13 @@ - {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} Win32Proj options - ispc + options + sse2,sse4-x2,avx1-x2 - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 
%(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj index 3fbc6874..ea34de56 100644 --- a/examples/rt/rt.vcxproj +++ b/examples/rt/rt.vcxproj @@ -1,25 +1,13 @@ - {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} Win32Proj rt - ispc + rt + sse2,sse4-x2,avx1-i32x8 - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj index 7bf90aa6..43f2b439 100644 --- a/examples/sort/sort.vcxproj +++ b/examples/sort/sort.vcxproj @@ -1,29 +1,13 @@  - {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2} Win32Proj sort - ispc + sort + sse2,sse4-x2,avx1-x2 - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj index 2814d5a1..b5f5bb22 100644 --- a/examples/stencil/stencil.vcxproj 
+++ b/examples/stencil/stencil.vcxproj @@ -1,33 +1,13 @@ - {2ef070a1-f62f-4e6a-944b-88d140945c3c} Win32Proj rt - ispc + stencil + sse2,sse4-x2,avx1-i32x8 - - - Document - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index 78f5ba86..cc738a7e 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -1,29 +1,13 @@ - {dee5733a-e93e-449d-9114-9bffcaeb4df9} Win32Proj volume - ispc + volume + sse2,sse4-x2,avx1-i32x8 - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + diff --git a/perf.py b/perf.py index 2d83475f..d1134990 100755 --- a/perf.py +++ b/perf.py @@ -44,7 +44,8 @@ def build_test(commands): test = os.system(commands[1]) if is_windows: common.remove_if_exists(".\\X64\\Release1") - os.rename(".\\X64\\Release", ".\\X64\\Release1") + if (test == 0): + os.rename(".\\X64\\Release", ".\\X64\\Release1") if options.ref: ref = os.system(commands[3]) return (options.ref and ref) or test @@ -334,7 +335,7 @@ def perf(options1, args): if options.perf_target != "": test_only_r = " sse2-i32x4 sse2-i32x8 sse4-i32x4 sse4-i32x8 sse4-i16x8 \ - sse4-i8x16 avx1-i32x8 avx1-i32x16 avx1-i64x4 
avx1.1-i32x8 \
+                       sse4-i8x16 avx1-i32x4 avx1-i32x8 avx1-i32x16 avx1-i64x4 avx1.1-i32x8 \
                        avx1.1-i32x16 avx1.1-i64x4 avx2-i32x8 avx2-i32x16 avx2-i64x4 "
         test_only = options.perf_target.split(",")
         for iterator in test_only:
@@ -467,17 +468,20 @@ def perf(options1, args):
             command = command[:-1]
             # handle conditional target argument
             target_str_temp = ""
+            target_out_temp = ""
             perf_targets = [""]
             target_number = 1
             if options.perf_target != "":
                 perf_targets = options.perf_target.split(',')
                 target_str_temp = " ISPC_IA_TARGETS="
+                target_out_temp = " /p:Target_str="
                 target_number = len(perf_targets)
             temp = 0
             for target_i in range(target_number):
                 test = [lines[i][:-1],[],[],[],[],[]]
                 test_ref = [lines[i][:-1],[],[],[],[],[]]
-                target_str = target_str_temp + perf_targets[target_i]
+                target_str = target_str_temp + perf_targets[target_i]
+                Target_out = target_out_temp + perf_targets[target_i]
                 if is_windows == False:
                     ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref"
                     ex_command = "./test " + command + " >> " + perf_temp + "_test"
@@ -487,8 +491,8 @@
                 else:
                     ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref"
                     ex_command = "x64\\Release1\\test.exe " + command + " >> " + perf_temp + "_test"
-                    bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /p:ISPC_compiler=ispc_ref /t:rebuild >> " + build_log
-                    bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc /t:rebuild >> " + build_log
+                    bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /p:ISPC_compiler=ispc_ref " + Target_out + " /t:rebuild >> " + build_log
+                    bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc " + Target_out + " /t:rebuild >> " + build_log
                     re_command = "msbuild /t:clean >> " + build_log
                 commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command]
                 # parsing config parameters
diff --git a/run_tests.py b/run_tests.py
index 506d37a5..3f03cc9b 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -454,7 +454,7 @@ def verify():
     check = [["g++", "clang++", "cl"],["-O0", "-O2"],["x86","x86-64"],
             ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"],
             ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8",
-            "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8",
+            "sse4-i8x16", "avx1-i32x4", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8",
             "avx1.1-i32x16", "avx1.1-i64x4", "avx2-i32x8", "avx2-i32x16", "avx2-i64x4",
             "generic-1", "generic-4", "generic-8", "generic-16", "generic-32",
             "generic-64"]]

From 218d2892e8eb8abbc8c1cb480336ecece89e0911 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Wed, 27 Nov 2013 03:24:17 +0400
Subject: [PATCH 150/159] fail_db.txt update with LLVM 3.5 (trunk) results on Linux

---
 fail_db.txt | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/fail_db.txt b/fail_db.txt
index 32917815..6351fa8f 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -572,3 +572,68 @@
 ./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
 ./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
 ./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 *
+./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5
clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail 
x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * From 672d43a6cfa135eeeb0b9640cf691526e7eac72d Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 27 Nov 2013 23:22:50 +0400 Subject: [PATCH 151/159] Adding patch for sse4-i16x8 and sse4-i8x16 targets --- .../3_3_r195476_r195779_i16_sext.patch | 57 +++++++++++++++++++ .../3_4_r195476_r195779_i16_sext.patch | 57 +++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 llvm_patches/3_3_r195476_r195779_i16_sext.patch create mode 100644 llvm_patches/3_4_r195476_r195779_i16_sext.patch diff --git a/llvm_patches/3_3_r195476_r195779_i16_sext.patch b/llvm_patches/3_3_r195476_r195779_i16_sext.patch new file mode 100644 index 00000000..a49325c9 --- /dev/null +++ b/llvm_patches/3_3_r195476_r195779_i16_sext.patch @@ -0,0 +1,57 @@ +Two stability patches affecting sse4-i16x8 and sse4-i8x16 targets. See PR18014 and PR18054 for more details. + +Index: lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- lib/Target/X86/X86ISelLowering.cpp (revision 195862) ++++ lib/Target/X86/X86ISelLowering.cpp (working copy) +@@ -12099,19 +12099,27 @@ + // fall through + case MVT::v4i32: + case MVT::v8i16: { +- // (sext (vzext x)) -> (vsext x) + SDValue Op0 = Op.getOperand(0); + SDValue Op00 = Op0.getOperand(0); + SDValue Tmp1; + // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. 
+ if (Op0.getOpcode() == ISD::BITCAST && +- Op00.getOpcode() == ISD::VECTOR_SHUFFLE) ++ Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { ++ // (sext (vzext x)) -> (vsext x) + Tmp1 = LowerVectorIntExtend(Op00, DAG); +- if (Tmp1.getNode()) { +- SDValue Tmp1Op0 = Tmp1.getOperand(0); +- assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && +- "This optimization is invalid without a VZEXT."); +- return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ if (Tmp1.getNode()) { ++ EVT ExtraEltVT = ExtraVT.getVectorElementType(); ++ // This folding is only valid when the in-reg type is a vector of i8, ++ // i16, or i32. ++ if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || ++ ExtraEltVT == MVT::i32) { ++ SDValue Tmp1Op0 = Tmp1.getOperand(0); ++ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && ++ "This optimization is invalid without a VZEXT."); ++ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ } ++ Op0 = Tmp1; ++ } + } + + // If the above didn't work, then just use Shift-Left + Shift-Right. +@@ -15826,6 +15834,15 @@ + if (BitWidth == 1) + return SDValue(); + ++ // Check all uses of that condition operand to check whether it will be ++ // consumed by non-BLEND instructions, which may depend on all bits are set ++ // properly. ++ for (SDNode::use_iterator I = Cond->use_begin(), ++ E = Cond->use_end(); I != E; ++I) ++ if (I->getOpcode() != ISD::VSELECT) ++ // TODO: Add other opcodes eventually lowered into BLEND. ++ return SDValue(); ++ + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + diff --git a/llvm_patches/3_4_r195476_r195779_i16_sext.patch b/llvm_patches/3_4_r195476_r195779_i16_sext.patch new file mode 100644 index 00000000..4e2c0f6b --- /dev/null +++ b/llvm_patches/3_4_r195476_r195779_i16_sext.patch @@ -0,0 +1,57 @@ +Two stability patches affecting sse4-i16x8 and sse4-i8x16 targets. See PR18014 and PR18054 for more details. + +Index: lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- lib/Target/X86/X86ISelLowering.cpp (revision 195863) ++++ lib/Target/X86/X86ISelLowering.cpp (working copy) +@@ -13120,19 +13120,27 @@ + // fall through + case MVT::v4i32: + case MVT::v8i16: { +- // (sext (vzext x)) -> (vsext x) + SDValue Op0 = Op.getOperand(0); + SDValue Op00 = Op0.getOperand(0); + SDValue Tmp1; + // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. + if (Op0.getOpcode() == ISD::BITCAST && +- Op00.getOpcode() == ISD::VECTOR_SHUFFLE) ++ Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { ++ // (sext (vzext x)) -> (vsext x) + Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); +- if (Tmp1.getNode()) { +- SDValue Tmp1Op0 = Tmp1.getOperand(0); +- assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && +- "This optimization is invalid without a VZEXT."); +- return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ if (Tmp1.getNode()) { ++ EVT ExtraEltVT = ExtraVT.getVectorElementType(); ++ // This folding is only valid when the in-reg type is a vector of i8, ++ // i16, or i32. ++ if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || ++ ExtraEltVT == MVT::i32) { ++ SDValue Tmp1Op0 = Tmp1.getOperand(0); ++ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && ++ "This optimization is invalid without a VZEXT."); ++ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ } ++ Op0 = Tmp1; ++ } + } + + // If the above didn't work, then just use Shift-Left + Shift-Right. 
+@@ -17007,6 +17015,15 @@
+   if (BitWidth == 1)
+     return SDValue();
+ 
++  // Check all uses of that condition operand to check whether it will be
++  // consumed by non-BLEND instructions, which may depend on all bits are set
++  // properly.
++  for (SDNode::use_iterator I = Cond->use_begin(),
++                            E = Cond->use_end(); I != E; ++I)
++    if (I->getOpcode() != ISD::VSELECT)
++      // TODO: Add other opcodes eventually lowered into BLEND.
++      return SDValue();
++
+   assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+   APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+ 

From eaa483d6e4e08addc311c7b533f1b22184782186 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Thu, 28 Nov 2013 13:51:20 +0400
Subject: [PATCH 152/159] fail_db update (Linux)

---
 fail_db.txt | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fail_db.txt b/fail_db.txt
index 6351fa8f..ff119d5a 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -277,13 +277,7 @@
 .\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 *
 ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 *

From be813ea0a239928bc0bd174870c939d10de2c2d3 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Thu, 14 Nov 2013 15:32:47 +0400
Subject: [PATCH 153/159] Select optimization for LLVM 3.3

---
 opt.cpp | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 195 insertions(+), 7 deletions(-)

diff --git a/opt.cpp b/opt.cpp
index 8df0f4fe..3e320b4b 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -127,6 +127,8 @@
 static llvm::Pass *CreateDebugPass(char * output);
 
 static llvm::Pass *CreateReplaceStdlibShiftPass();
 
+static llvm::Pass *CreateFixBooleanSelectPass();
+
 #define DEBUG_START_PASS(NAME) \
     if (g->debugPrint && \
         (getenv("FUNC") == NULL || \
@@ -659,6 +661,9 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(CreateMakeInternalFuncsStaticPass());
         optPM.add(llvm::createGlobalDCEPass());
         optPM.add(llvm::createConstantMergePass());
+
+        // Should be the last
+        optPM.add(CreateFixBooleanSelectPass(), 400);
     }
 
     // Finish up by making sure we didn't mess anything up in the IR along
@@ -670,6 +675,7 @@ Optimize(llvm::Module *module, int optLevel) {
         printf("\n*****\nFINAL OUTPUT\n*****\n");
         module->dump();
     }
+
 }
 
 
@@ -1022,12 +1028,12 @@ InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) {
     if (trunc != NULL) {
         // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector)
         llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(value);
-        if (sext && 
+        if (sext &&
            sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
            return sext->getOperand(0);
 
         llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(value);
-        if (zext && 
+        if (zext &&
            zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
            return zext->getOperand(0);
     }
@@ -1853,7 +1859,7 @@ lIs32BitSafeHelper(llvm::Value *v) {
     // handle Adds, SExts, Constant Vectors
     if (llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v)) {
         if (bop->getOpcode() == llvm::Instruction::Add) {
-            return lIs32BitSafeHelper(bop->getOperand(0)) 
+            return lIs32BitSafeHelper(bop->getOperand(0))
                 && lIs32BitSafeHelper(bop->getOperand(1));
         }
         return false;
@@ -4961,7 +4967,7 @@ bool
 ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
     DEBUG_START_PASS("ReplaceStdlibShiftPass");
     bool modifiedAny = false;
-    
+
     llvm::Function *shifts[6];
     shifts[0] = m->module->getFunction("__shift_i8");
     shifts[1] = m->module->getFunction("__shift_i16");
@@ -4992,19 +4998,19 @@ ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
             }
             llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
             llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
-            llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec, 
+            llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec,
                                                                shuffleIdxs, "vecShift", ci);
             ci->replaceAllUsesWith(shuffle);
             modifiedAny = true;
             delete [] shuffleVals;
         } else {
-            PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount."); 
+            PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount.");
         }
     }
     }
     }
-    
+
     DEBUG_END_PASS("ReplaceStdlibShiftPass");
 
     return modifiedAny;
@@ -5015,3 +5021,185 @@ static llvm::Pass *
 CreateReplaceStdlibShiftPass() {
     return new ReplaceStdlibShiftPass();
 }
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+// FixBooleanSelect
+//
+// The problem is that in LLVM 3.3 the optimizer doesn't like
+// the following instruction sequence:
+//     %cmp = fcmp olt <8 x float> %a, %b
+//     %sext_cmp = sext <8 x i1> %cmp to <8 x i32>
+//     %new_mask = and <8 x i32> %sext_cmp, %mask
+// and optimizes it to the following:
+//     %cmp = fcmp olt <8 x float> %a, %b
+//     %cond = select <8 x i1> %cmp, <8 x i32> %mask, <8 x i32> zeroinitializer
+//
+// It wouldn't be a problem if codegen produced good code for it. But it
+// doesn't, especially for vectors larger than native vectors.
+//
+// This optimization reverts this pattern and should be the last one before
+// code gen.
+//
+// Note that this problem was introduced in LLVM 3.3. But in LLVM 3.4 it was
+// fixed. See commit r194542.
+//
+// After LLVM 3.3 this optimization should probably stay for experimental
+// purposes and the code should be compared with and without this optimization
+// from time to time to make sure that LLVM does the right thing.
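+//
+// As an illustrative sketch of what fixSelect() below produces (the value
+// names match the ones used in the code; the IR itself is hypothetical):
+// a floating-point select with a zero false-operand, such as
+//     %cond = select <8 x i1> %cmp, <8 x float> %val, <8 x float> zeroinitializer
+// becomes the masked form
+//     %sext_cmp = sext <8 x i1> %cmp to <8 x i32>
+//     %bitcast_mask_in = bitcast <8 x float> %val to <8 x i32>
+//     %and_mask = and <8 x i32> %sext_cmp, %bitcast_mask_in
+//     %bitcast_mask_out = bitcast <8 x i32> %and_mask to <8 x float>
+// and all uses of %cond are replaced with %bitcast_mask_out.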
+///////////////////////////////////////////////////////////////////////////////
+
+class FixBooleanSelectPass : public llvm::FunctionPass {
+public:
+    static char ID;
+    FixBooleanSelectPass() : FunctionPass(ID) {}
+
+    const char *getPassName() const { return "Fix boolean select instructions"; }
+    bool runOnFunction(llvm::Function &F);
+
+private:
+    llvm::Instruction* fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext);
+};
+
+char FixBooleanSelectPass::ID = 0;
+
+llvm::Instruction* FixBooleanSelectPass::fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext) {
+    // Select instruction result type and its integer equivalent
+    llvm::VectorType *orig_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
+    llvm::VectorType *int_type = llvm::VectorType::getInteger(orig_type);
+
+    // Result value and optional pointer to instruction to delete
+    llvm::Instruction *result = 0, *optional_to_delete = 0;
+
+    // It can be a vector of integers or a vector of floating-point values.
+    if (orig_type->getElementType()->isIntegerTy()) {
+        // Generate sext+and, remove select.
+        result = llvm::BinaryOperator::CreateAnd(sext, sel->getTrueValue(), "and_mask", sel);
+    } else {
+        llvm::BitCastInst* bc = llvm::dyn_cast<llvm::BitCastInst>(sel->getTrueValue());
+
+        if (bc && bc->hasOneUse() && bc->getSrcTy()->isIntOrIntVectorTy() && bc->getSrcTy()->isVectorTy() &&
+            llvm::isa<llvm::Instruction>(bc->getOperand(0)) &&
+            llvm::dyn_cast<llvm::Instruction>(bc->getOperand(0))->getParent() == sel->getParent()) {
+            // The bitcast is casting from an integer type, and its operand is an
+            // instruction located in the same basic block (otherwise it's unsafe
+            // to use it).
+            // bitcast+select => sext+and+bitcast
+            // Create the and
+            llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc->getOperand(0), "and_mask", sel);
+            // Bitcast back to original type
+            result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
+            // Original bitcast will be removed
+            optional_to_delete = bc;
+        } else {
+            // General case: select => bitcast+sext+and+bitcast
+            // Bitcast
+            llvm::BitCastInst* bc_in = new llvm::BitCastInst(sel->getTrueValue(), int_type, "bitcast_mask_in", sel);
+            // And
+            llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc_in, "and_mask", sel);
+            // Bitcast back to original type
+            result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
+        }
+    }
+
+    // Done, finalize.
+    sel->replaceAllUsesWith(result);
+    sel->eraseFromParent();
+    if (optional_to_delete) {
+        optional_to_delete->eraseFromParent();
+    }
+
+    return result;
+}
+
+bool
+FixBooleanSelectPass::runOnFunction(llvm::Function &F) {
+    bool modifiedAny = false;
+
+    // LLVM 3.3 only
+#if defined(LLVM_3_3)
+
+    for (llvm::Function::iterator I = F.begin(), E = F.end();
+         I != E; ++I) {
+        llvm::BasicBlock* bb = &*I;
+        for (llvm::BasicBlock::iterator iter = bb->begin(), e = bb->end(); iter != e; ++iter) {
+            llvm::Instruction *inst = &*iter;
+
+            llvm::CmpInst *cmp = llvm::dyn_cast<llvm::CmpInst>(inst);
+
+            if (cmp &&
+                cmp->getType()->isVectorTy() &&
+                cmp->getType()->getVectorElementType()->isIntegerTy(1)) {
+
+                // Search for select instruction uses.
+                int selects = 0;
+                llvm::VectorType* sext_type = 0;
+                for (llvm::Instruction::use_iterator it = cmp->use_begin(); it != cmp->use_end(); ++it) {
+                    llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it);
+                    if (sel &&
+                        sel->getType()->isVectorTy() &&
+                        sel->getType()->getScalarSizeInBits() > 1) {
+                        selects++;
+                        // We pick the first one; in the typical case all select
+                        // types are the same.
+                        sext_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
+                        break;
+                    }
+                }
+                if (selects == 0) {
+                    continue;
+                }
+                // Get an integer equivalent, if it's not yet an integer.
+                sext_type = llvm::VectorType::getInteger(sext_type);
+
+                // Do transformation
+                llvm::BasicBlock::iterator iter_copy = iter;
+                llvm::Instruction* next_inst = &*(++iter_copy);
+                // Create or reuse sext
+                llvm::SExtInst* sext = llvm::dyn_cast<llvm::SExtInst>(next_inst);
+                if (sext &&
+                    sext->getOperand(0) == cmp &&
+                    sext->getDestTy() == sext_type) {
+                    // This sext can be reused
+                } else {
+                    if (next_inst) {
+                        sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", next_inst);
+                    } else {
+                        sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", bb);
+                    }
+                }
+
+                // Walk and fix selects
+                std::vector<llvm::SelectInst*> sel_uses;
+                for (llvm::Instruction::use_iterator it = cmp->use_begin(); it != cmp->use_end(); ++it) {
+                    llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it);
+                    if (sel &&
+                        sel->getType()->getScalarSizeInBits() == sext_type->getScalarSizeInBits()) {
+
+                        // Check that the false operand is zero.
+                        llvm::Constant* false_cond = llvm::dyn_cast<llvm::Constant>(sel->getFalseValue());
+                        if (false_cond &&
+                            false_cond->isZeroValue()) {
+                            sel_uses.push_back(sel);
+                            modifiedAny = true;
+                        }
+                    }
+                }
+
+                // Apply the rewrite to each select collected above.
+                for (int i = 0; i < (int)sel_uses.size(); ++i) {
+                    fixSelect(sel_uses[i], sext);
+                }
+            }
+        }
+    }
+
+#endif // LLVM_3_3
+
+    return modifiedAny;
+}
+
+static llvm::Pass *
+CreateFixBooleanSelectPass() {
+    return new FixBooleanSelectPass();
+}

Date: Thu, 28 Nov 2013 21:44:12 +0400
Subject: [PATCH 154/159] Run alloy -j by default

---
 alloy.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/alloy.py b/alloy.py
index 21e428de..525f90d0 100755
--- a/alloy.py
+++ b/alloy.py
@@ -635,6 +635,7 @@ import platform
 import smtplib
 import datetime
 import copy
+import multiprocessing
 from email.MIMEMultipart import MIMEMultipart
 from email.MIMEBase import MIMEBase
 from email.mime.text import MIMEText
@@ -663,13 +664,14 @@ if __name__ == '__main__':
         "Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" +
         "Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" +
         "Validation run.
Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") + num_threads="%s" % multiprocessing.cpu_count() parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") parser.add_option('-r', '--run', dest='validation_run', help='ask for validation run', default=False, action="store_true") parser.add_option('-j', dest='speed', - help='set -j for make', default="8") + help='set -j for make', default=num_threads) # options for activity "build LLVM" llvm_group = OptionGroup(parser, "Options for building LLVM", "These options must be used with -b option.") From b94b89ba68e6da9c3c7b8a2eccbb2475d3a04765 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 29 Nov 2013 14:24:21 +0400 Subject: [PATCH 155/159] support of LLVM trunk --- cbackend.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cbackend.cpp b/cbackend.cpp index 8535653f..40f87074 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -241,7 +241,11 @@ namespace { class CBEMCAsmInfo : public llvm::MCAsmInfo { public: CBEMCAsmInfo() { +#if defined(LLVM_3_5) + GlobalPrefix = '\0'; +#else GlobalPrefix = ""; +#endif PrivateGlobalPrefix = ""; } }; From 3bc4788acb31ae1387389bbb77c9b60f0dd4f73a Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sun, 1 Dec 2013 03:45:00 +0400 Subject: [PATCH 156/159] Fix errors with VS2013 --- ispc.vcxproj | 336 +++++++++++++-------------------------------------- 1 file changed, 84 insertions(+), 252 deletions(-) diff --git a/ispc.vcxproj b/ispc.vcxproj index b9a3b6c5..218bfd5c 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -57,17 +57,17 @@ - - - - + + + + - - - - - + + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -132,383 +132,215 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp - $(Configuration)/gen-bitcode-sse4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp + $(Configuration)/gen-bitcode-sse4-32bit.cpp; $(Configuration)/gen-bitcode-sse4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-32bit.cpp + Building gen-bitcode-sse4-32bit.cpp and gen-bitcode-sse4-64bit.cpp - + Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp - $(Configuration)/gen-bitcode-sse4-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; $(Configuration)/gen-bitcode-sse4-8-64bit.cpp 
builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-64bit.cpp + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp - $(Configuration)/gen-bitcode-sse4-8-32bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-8-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp - $(Configuration)/gen-bitcode-sse4-8-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-8-64bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp - $(Configuration)/gen-bitcode-sse4-16-32bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-16-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp - $(Configuration)/gen-bitcode-sse4-16-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-16-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp - $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp + $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - 
builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-x2-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp - $(Configuration)/gen-bitcode-sse2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp + $(Configuration)/gen-bitcode-sse2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp - $(Configuration)/gen-bitcode-sse2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp - $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp + $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-x2-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp + $(Configuration)/gen-bitcode-avx1-32bit.cpp; $(Configuration)/gen-bitcode-avx1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-32bit.cpp - - - Document - m4 -Ibuiltins/ 
-DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp - $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp + $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx1-x2-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp - $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll Building gen-bitcode-avx1-i64x4-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp - $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx1-i64x4-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp - $(Configuration)/gen-bitcode-avx11-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py 
builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp + $(Configuration)/gen-bitcode-avx11-32bit.cpp; $(Configuration)/gen-bitcode-avx11-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp - $(Configuration)/gen-bitcode-avx11-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx11-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp - $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp + $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx11-x2-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp - $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll Building gen-bitcode-avx11-i64x4-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp - $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp - 
builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx11-i64x4-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp - $(Configuration)/gen-bitcode-avx2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp + $(Configuration)/gen-bitcode-avx2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp - $(Configuration)/gen-bitcode-avx2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx2-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp - $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp + $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx2-x2-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp - $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp + 
$(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll Building gen-bitcode-avx2-i64x4-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp - $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx2-i64x4-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp - $(Configuration)/gen-bitcode-generic-1-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp + $(Configuration)/gen-bitcode-generic-1-32bit.cpp; $(Configuration)/gen-bitcode-generic-1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp - $(Configuration)/gen-bitcode-generic-1-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-1-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp - $(Configuration)/gen-bitcode-generic-4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp + $(Configuration)/gen-bitcode-generic-4-32bit.cpp; $(Configuration)/gen-bitcode-generic-4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp - $(Configuration)/gen-bitcode-generic-4-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-4-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > 
$(Configuration)/gen-bitcode-generic-8-32bit.cpp - $(Configuration)/gen-bitcode-generic-8-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp + $(Configuration)/gen-bitcode-generic-8-32bit.cpp; $(Configuration)/gen-bitcode-generic-8-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp - $(Configuration)/gen-bitcode-generic-8-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-8-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp - $(Configuration)/gen-bitcode-generic-16-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp + $(Configuration)/gen-bitcode-generic-16-32bit.cpp; $(Configuration)/gen-bitcode-generic-16-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp - $(Configuration)/gen-bitcode-generic-16-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-16-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp - $(Configuration)/gen-bitcode-generic-32-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp + $(Configuration)/gen-bitcode-generic-32-32bit.cpp; $(Configuration)/gen-bitcode-generic-32-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-32bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll 
| python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp - $(Configuration)/gen-bitcode-generic-32-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-32-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp - $(Configuration)/gen-bitcode-generic-64-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp + $(Configuration)/gen-bitcode-generic-64-32bit.cpp; $(Configuration)/gen-bitcode-generic-64-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-32bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp - $(Configuration)/gen-bitcode-generic-64-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-64-64bit.cpp - - - - + Document flex -t lex.ll > $(Configuration)\lex.cc $(Configuration)\lex.cc @@ -597,4 +429,4 @@ - + From e172d7f1a95076c6ea9e70cac9d20c2edc2848d1 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sun, 1 Dec 2013 16:18:06 +0400 Subject: [PATCH 157/159] Update build messages (Windows) --- ispc.vcxproj | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/ispc.vcxproj b/ispc.vcxproj index 218bfd5c..8aee2988 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -146,7 +146,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; $(Configuration)/gen-bitcode-sse4-8-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-8-32bit.cpp + Building gen-bitcode-sse4-8-32bit.cpp and gen-bitcode-sse4-8-64bit.cpp @@ -156,7 +156,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; $(Configuration)/gen-bitcode-sse4-16-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-16-32bit.cpp + Building gen-bitcode-sse4-16-32bit.cpp and gen-bitcode-sse4-16-64bit.cpp @@ -166,7 +166,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building 
gen-bitcode-sse4-x2-32bit.cpp + Building gen-bitcode-sse4-x2-32bit.cpp and gen-bitcode-sse4-x2-64bit.cpp @@ -176,7 +176,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp $(Configuration)/gen-bitcode-sse2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-32bit.cpp + Building gen-bitcode-sse2-32bit.cpp and gen-bitcode-sse2-64bit.cpp @@ -186,7 +186,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-x2-32bit.cpp + Building gen-bitcode-sse2-x2-32bit.cpp and gen-bitcode-sse2-x2-64bit.cpp @@ -196,7 +196,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp $(Configuration)/gen-bitcode-avx1-32bit.cpp; $(Configuration)/gen-bitcode-avx1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp + Building gen-bitcode-avx1-32bit.cpp and gen-bitcode-avx1-64bit.cpp @@ -206,7 +206,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx1-x2-32bit.cpp + Building gen-bitcode-avx1-x2-32bit.cpp and gen-bitcode-avx1-x2-64bit.cpp @@ -216,7 +216,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx1-i64x4-32bit.cpp + Building gen-bitcode-avx1-i64x4-32bit.cpp and gen-bitcode-avx1-i64x4-64bit.cpp @@ -226,7 +226,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp $(Configuration)/gen-bitcode-avx11-32bit.cpp; $(Configuration)/gen-bitcode-avx11-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx11-32bit.cpp + Building gen-bitcode-avx11-32bit.cpp and gen-bitcode-avx11-64bit.cpp @@ -236,7 +236,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; 
$(Configuration)/gen-bitcode-avx11-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx11-x2-32bit.cpp + Building gen-bitcode-avx11-x2-32bit.cpp and gen-bitcode-avx11-x2-64bit.cpp @@ -246,7 +246,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx11-i64x4-32bit.cpp + Building gen-bitcode-avx11-i64x4-32bit.cpp and gen-bitcode-avx11-i64x4-64bit.cpp @@ -256,7 +256,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp $(Configuration)/gen-bitcode-avx2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx2-32bit.cpp + Building gen-bitcode-avx2-32bit.cpp and gen-bitcode-avx2-64bit.cpp @@ -266,7 +266,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx2-x2-32bit.cpp + Building gen-bitcode-avx2-x2-32bit.cpp and gen-bitcode-avx2-x2-64bit.cpp @@ -276,7 +276,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx2-i64x4-32bit.cpp + Building gen-bitcode-avx2-i64x4-32bit.cpp and gen-bitcode-avx2-i64x4-64bit.cpp @@ -286,7 +286,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp $(Configuration)/gen-bitcode-generic-1-32bit.cpp; $(Configuration)/gen-bitcode-generic-1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-1-32bit.cpp + Building gen-bitcode-generic-1-32bit.cpp and gen-bitcode-generic-1-64bit.cpp @@ -296,7 +296,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp $(Configuration)/gen-bitcode-generic-4-32bit.cpp; $(Configuration)/gen-bitcode-generic-4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-4-32bit.cpp + Building gen-bitcode-generic-4-32bit.cpp and gen-bitcode-generic-4-64bit.cpp @@ -306,7 +306,7 @@ m4 
-Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp $(Configuration)/gen-bitcode-generic-8-32bit.cpp; $(Configuration)/gen-bitcode-generic-8-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-8-32bit.cpp + Building gen-bitcode-generic-8-32bit.cpp and gen-bitcode-generic-8-64bit.cpp @@ -316,7 +316,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp $(Configuration)/gen-bitcode-generic-16-32bit.cpp; $(Configuration)/gen-bitcode-generic-16-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-16-32bit.cpp + Building gen-bitcode-generic-16-32bit.cpp and gen-bitcode-generic-16-64bit.cpp @@ -326,7 +326,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp $(Configuration)/gen-bitcode-generic-32-32bit.cpp; $(Configuration)/gen-bitcode-generic-32-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-32-32bit.cpp + Building gen-bitcode-generic-32-32bit.cpp and gen-bitcode-generic-32-64bit.cpp @@ -336,7 +336,7 @@ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp $(Configuration)/gen-bitcode-generic-64-32bit.cpp; $(Configuration)/gen-bitcode-generic-64-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-64-32bit.cpp + Building gen-bitcode-generic-64-32bit.cpp and gen-bitcode-generic-64-64bit.cpp From 31ee2951ce84cb1dc758ea4906df22cc9e9b6b01 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 3 Dec 2013 19:40:30 +0400 Subject: [PATCH 158/159] Adding LLVM 3.4 definition to alloy.py --- alloy.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/alloy.py b/alloy.py index 525f90d0..657e67bf 100755 --- a/alloy.py +++ b/alloy.py @@ -88,6 +88,9 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, FOLDER_NAME=version_LLVM if version_LLVM == "trunk": SVN_PATH="trunk" + if version_LLVM == "3.4": + SVN_PATH="tags/RELEASE_34/rc1" + version_LLVM = "3_4" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final" version_LLVM = "3_3" @@ -273,8 +276,10 @@ def build_ispc(version_LLVM, make): os.environ["LLVM_INSTALL_DIR"] = os.environ["LLVM_HOME"] + "\\bin-" + version_LLVM if version_LLVM == "3.3": temp = "3_3" - if version_LLVM == "trunk": + if version_LLVM == "3.4": temp = "3_4" + if version_LLVM == "trunk": + temp = "3_5" os.environ["LLVM_VERSION"] = "LLVM_" + temp try_do_LLVM("clean ISPC for building", "msbuild ispc.vcxproj /t:clean", True) try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", "msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild", True) @@ -376,7 +381,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, archs.append("x86-64") if "native" in only: sde_targets_t = [] - for i in 
["3.1", "3.2", "3.3", "trunk"]: + for i in ["3.1", "3.2", "3.3", "3.4", "trunk"]: if i in only: LLVM.append(i) if "current" in only: @@ -676,7 +681,7 @@ if __name__ == '__main__': llvm_group = OptionGroup(parser, "Options for building LLVM", "These options must be used with -b option.") llvm_group.add_option('--version', dest='version', - help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk") + help='version of llvm to build: 3.1 3.2 3.3 3.4 trunk. Default: trunk', default="trunk") llvm_group.add_option('--revision', dest='revision', help='revision of llvm to build in format r172870', default="") llvm_group.add_option('--debug', dest='debug', @@ -711,7 +716,7 @@ if __name__ == '__main__': run_group.add_option('--only', dest='only', help='set types of tests. Possible values:\n' + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + - 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', + 'build (only build with different LLVM), 3.1, 3.2, 3.3, 3.4, trunk, native (do not use SDE), current (do not rebuild ISPC).', default="") run_group.add_option('--perf_LLVM', dest='perf_llvm', help='compare LLVM 3.3 with "--compare-with", default trunk', default=False, action='store_true') From f61f1a20207eeefbde91359d279ea57a91c9e7f7 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 3 Dec 2013 19:52:11 +0400 Subject: [PATCH 159/159] Fixing run_tests.py to understand LLVM 3.4 --- run_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_tests.py b/run_tests.py index 3f03cc9b..e6429861 100755 --- a/run_tests.py +++ b/run_tests.py @@ -452,7 +452,7 @@ def verify(): f_lines = f.readlines() f.close() check = [["g++", "clang++", "cl"],["-O0", "-O2"],["x86","x86-64"], - ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], + ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM 3.4","LLVM trunk"], ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16", "avx1-i32x4" "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", "avx1.1-i64x4", "avx2-i32x8", "avx2-i32x16", "avx2-i64x4",