diff --git a/alloy.py b/alloy.py
index 0055842c..57d8df1e 100755
--- a/alloy.py
+++ b/alloy.py
@@ -89,7 +89,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
     if  version_LLVM == "trunk":
         SVN_PATH="trunk"
     if  version_LLVM == "3.4":
-        SVN_PATH="tags/RELEASE_34/rc2"
+        SVN_PATH="tags/RELEASE_34/final"
         version_LLVM = "3_4"
     if  version_LLVM == "3.3":
         SVN_PATH="tags/RELEASE_33/final"
@@ -129,8 +129,23 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
         try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ",
                     "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang",
                     from_validation)
+        os.chdir("..")
+        if current_OS == "MacOS" and int(current_OS_version.split(".")[0]) >= 13:
+            # Starting with MacOS 10.9 Maverics, the system doesn't contain headers for standard C++ library and
+            # the default library is libc++, bit libstdc++. The headers are part of XCode now. But we are checking out
+            # headers as part of LLVM source tree, so they will be installed in clang location and clang will be able
+            # to find them. Though they may not match to the library installed in the system, but seems that this should
+            # not happen.
+            # Note, that we can also build a libc++ library, but it must be on system default location or should be passed
+            # to the linker explicitly (either through command line or environment variables). So we are not doing it
+            # currently to make the build process easier.
+            os.chdir("projects")
+            try_do_LLVM("load libcxx http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " ",
+                    "svn co " + revision + " http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " libcxx",
+                    from_validation)
+            os.chdir("..")
         if extra == True:
-            os.chdir("./clang/tools")
+            os.chdir("tools/clang/tools")
             try_do_LLVM("load extra clang extra tools ",
                     "svn co " + revision + " http://llvm.org/svn/llvm-project/clang-tools-extra/" + SVN_PATH + " extra",
                     from_validation)
@@ -138,7 +153,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
             try_do_LLVM("load extra clang compiler-rt ",
                     "svn co " + revision + " http://llvm.org/svn/llvm-project/compiler-rt/" + SVN_PATH + " compiler-rt",
                     from_validation)
-        os.chdir("../")
+            os.chdir("..")
     else:
         tar = tarball.split(" ")
         os.makedirs(LLVM_SRC) 
@@ -563,6 +578,8 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
 
 def Main():
     global current_OS
+    global current_OS_version
+    current_OS_version = platform.release()
     if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True:
         current_OS = "Windows"
     else:
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index 43609b33..c43a12a7 100644
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -3,13 +3,13 @@
 define(`MASK',`i32')
 define(`WIDTH',`1')
 include(`util.m4')
+rdrand_decls()
 ; Define some basics for a 1-wide target
 stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 aossoa()
-rdrand_decls()
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
@@ -653,10 +653,121 @@ define  <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
 
-include(`svml.m4')
-svml_stubs(float,f,WIDTH)
-svml_stubs(double,d,WIDTH)
+declare  <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline 
+declare  <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline 
+declare  <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline 
+declare  void @__svml_sincosd(<1 x float>, <1 x double> *, <1 x double> *) nounwind readnone alwaysinline
+declare  <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline 
+declare  <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline 
+declare  <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline 
+declare  <1 x float> @__svml_expd(<1 x float>) nounwind readnone alwaysinline 
+declare  <1 x float> @__svml_logd(<1 x float>) nounwind readnone alwaysinline 
+declare  <1 x float> @__svml_powd(<1 x float>, <1 x float>) nounwind readnone alwaysinline 
 
+define  <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.sin.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float,@llvm.sin.f32)
+   
+}
+
+define  <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.asin.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float,@llvm.asin.f32)
+   
+}
+
+define  <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.cos.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float, @llvm.cos.f32)
+
+}
+
+define  void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
+;  %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
+;  store <1 x float> %s, <1 x float> * %1
+;  ret void
+   %sin = call <1 x float> @__svml_sinf(<1 x float> %0)
+   %cos = call <1 x float> @__svml_cosf(<1 x float> %0)
+   store <1 x float> %sin, <1 x float> * %1
+   store <1 x float> %cos, <1 x float> * %2
+   ret void
+}
+
+define  <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm_tan_f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  ;unasry1to1(float, @llvm.tan.f32)
+  ; UNSUPPORTED!
+  ret <1 x float > %0
+}
+
+define  <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline {
+;  %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
+;  ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm_atan_f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  ;unsary1to1(float,@llvm.atan.f32)
+  ;UNSUPPORTED!
+  ret <1 x float > %0
+
+}
+
+define  <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
+  ;ret <1 x float> %ret
+  ;%y = extractelement <1 x float> %0, i32 0
+  ;%x = extractelement <1 x float> %1, i32 0
+  ;%q = fdiv float %y, %x
+  ;%a = call float @llvm.atan.f32 (float %q)
+  ;%rv = insertelement <1 x float> undef, float %a, i32 0
+  ;ret <1 x float> %rv
+  ; UNSUPPORTED!
+  ret <1 x float > %0
+}
+
+define  <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  unary1to1(float, @llvm.exp.f32)
+}
+
+define  <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  unary1to1(float, @llvm.log.f32)
+}
+
+define  <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
+  ;ret <1 x float> %ret
+  %r = extractelement <1 x float> %0, i32 0
+  %e  = extractelement <1 x float> %1, i32 0
+  %s = call float @llvm.pow.f32(float %r,float %e)
+  %rv = insertelement <1 x float> undef, float %s, i32 0
+  ret <1 x float> %rv
+
+}
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
@@ -881,14 +992,3 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
 
 define_avgs()
 
-;;;;;;; nvptx64
-
-declare i32 @__tid_x()  nounwind readnone alwaysinline
-declare i32 @__warpsize()  nounwind readnone alwaysinline
-declare i32 @__ctaid_x()  nounwind readnone alwaysinline
-declare i32 @__ctaid_y()  nounwind readnone alwaysinline
-declare i32 @__ctaid_z()  nounwind readnone alwaysinline
-declare i32 @__nctaid_x()  nounwind readnone alwaysinline
-declare i32 @__nctaid_y()  nounwind readnone alwaysinline
-declare i32 @__nctaid_z()  nounwind readnone alwaysinline
-
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index e57b9036..0a51f44f 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -371,6 +371,8 @@ declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
                                   <WIDTH x i1>) nounwind
 declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
                                    <WIDTH x i1>) nounwind
+declare i32 @__packed_store_active2(i32 * nocapture, <WIDTH x i32> %vals,
+                                   <WIDTH x i1>) nounwind
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/contrib/ispc.vim b/contrib/ispc.vim
index f3cb413b..11808658 100644
--- a/contrib/ispc.vim
+++ b/contrib/ispc.vim
@@ -18,6 +18,7 @@ syn keyword	ispcConditional	cif
 syn keyword	ispcRepeat	cdo cfor cwhile
 syn keyword	ispcBuiltin	programCount programIndex	
 syn keyword	ispcType	export uniform varying int8 int16 int32 int64
+syn keyword	ispcOperator	operator
 
 "double precision floating point number, with dot, optional exponent
 syn match	cFloat		display contained "\d\+\.\d*d[-+]\=\d*\>"
@@ -33,6 +34,7 @@ HiLink ispcConditional	Conditional
 HiLink ispcRepeat	Repeat
 HiLink ispcBuiltin	Statement
 HiLink ispcType		Type
+HiLink ispcOperator	Operator
 delcommand HiLink
 
 let b:current_syntax = "ispc"
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index a8575ea0..b7d0bb17 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,47 @@
+=== v1.6.0 === (19 December 2013)
+
+A major new version of ISPC with major improvements in performance and
+stability. Linux and MacOS binaries are based on patched version of LLVM 3.3,
+while Windows version is based on LLVM 3.4rc3. LLVM 3.4 significantly improves
+stability on Win32 platform, so we've decided not to wait for official LLVM 3.4
+release.
+
+The list of the most significant changes is:
+
+* New avx1-i32x4 target was added. It may play well for you, if you are focused
+  on integer computations or FP unit in your hardware is 128 bit wide.
+
+* Support for calculations in double precision was extended with two new
+  targets avx1.1-i64x4 and avx2-i64x4.
+
+* Language support for overloaded operators was added.
+
+* New library shift() function was added, which is similar to rotate(), but is
+  non-circular.
+
+* The language was extended to accept 3 dimensional tasking - a syntactic sugar,
+  which may facilitate programming of some tasks.
+
+* Regression, which broke --opt=force-aligned-memory is fixed.
+
+If you are not using pre-built binaries, you may notice the following changes:
+
+* VS2012/VS2013 are supported.
+
+* alloy.py (with -b switch) can build LLVM for you on any platform now
+  (except MacOS 10.9, but we know about the problem and working on it).
+  This is a preferred way to build LLVM for ISPC, as all required patches for
+  better performance and stability will automatically apply.
+
+* LLVM 3.5 (current trunk) is supported.
+
+There are also multiple fixes for better performance and stability, most
+notable are:
+
+* Fixed performance problem for x2 targets.
+
+* Fixed a problem with incorrect vzeroupper insertion on AVX target on Win32.
+
 === v1.5.0 === (27 September 2013)
 
 A major new version of ISPC with several new targets and important bug fixes.
diff --git a/docs/ispc.rst b/docs/ispc.rst
index 7e76f433..9464dcde 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -48,6 +48,8 @@ Contents:
   + `Updating ISPC Programs For Changes In ISPC 1.1`_
   + `Updating ISPC Programs For Changes In ISPC 1.2`_
   + `Updating ISPC Programs For Changes In ISPC 1.3`_
+  + `Updating ISPC Programs For Changes In ISPC 1.5.0`_
+  + `Updating ISPC Programs For Changes In ISPC 1.6.0`_
 
 * `Getting Started with ISPC`_
 
@@ -97,6 +99,9 @@ Contents:
     * `Short Vector Types`_
     * `Array Types`_
     * `Struct Types`_
+
+      + `Operators Overloading`_
+
     * `Structure of Array Types`_
 
   + `Declarations and Initializers`_
@@ -279,6 +284,15 @@ Double precision floating point constants are floating point number with
 31.4d-1, 1.d, 1.0d, 1d-2. Note that floating point number without suffix is
 treated as single precision constant.
 
+Updating ISPC Programs For Changes In ISPC 1.6.0
+------------------------------------------------
+
+This release adds support for `Operators Overloading`_, so a word ``operator``
+becomes a keyword and it potentially creates a conflict with existing user 
+function. Also a new library function packed_store_active2() was introduced,
+which also may create a conflict with existing user functions.
+
+
 Getting Started with ISPC
 =========================
 
@@ -1325,6 +1339,7 @@ in C:
 * Function overloading by parameter type
 * Hexadecimal floating-point constants
 * Dynamic memory allocation with ``new`` and ``delete``.
+* Limited support for overloaded operators (`Operators Overloading`_).
 
 ``ispc`` also adds a number of new features that aren't in C89, C99, or
 C++:
@@ -2122,7 +2137,35 @@ above code, the value of ``f[index]`` needs to be able to store a different
 value of ``Foo::a`` for each program instance.  However, a ``varying Foo``
 still has only a single ``a`` member, since ``a`` was declared with
 ``uniform`` variability in the declaration of ``Foo``.  Therefore, the
-indexing operation in the last line results in an error.  
+indexing operation in the last line results in an error.
+
+
+Operators Overloading
+---------------------
+
+ISPC has limited support for overloaded operators for ``struct`` types. Only
+binary operators are supported currently, namely they are: ``*, /, %, +, -, >>
+and <<``. Operators overloading support is similar to the one in C++ language.
+To overload an operator for ``struct S``, you need to declare and implement a
+function using keyword ``operator``, which accepts two parameters of type
+``struct S`` or ``struct S&`` and returns either of these types. For example:
+
+::
+
+    struct S { float re, im;};
+    struct S operator*(struct S a, struct S b) {
+        struct S result;
+        result.re = a.re * b.re - a.im * b.im;
+        result.im = a.re * b.im + a.im * b.re;
+        return result;
+    }
+
+    void foo(struct S a, struct S b) {
+        struct S mul = a*b;
+        print("a.re:   %\na.im:   %\n", a.re, a.im);
+        print("b.re:   %\nb.im:   %\n", b.re, b.im);
+        print("mul.re: %\nmul.im: %\n", mul.re, mul.im);
+    }
 
 
 Structure of Array Types
@@ -4050,6 +4093,14 @@ They return the total number of values stored.
                                     unsigned int val)
 
 
+There are also ``packed_store_active2()`` functions with exactly the same
+signatures and the same semantic except that they may write one extra
+element to the output array (but still returning the same value as
+``packed_store_active()``). These functions suggest different branch free 
+implementation on most of supported targets, which usually (but not always)
+performs better than ``packed_store_active()``. It's advised to test function
+performance on user's scenarios on particular target hardware before using it.
+
 As an example of how these functions can be used, the following code shows
 the use of ``packed_store_active()``.
 
diff --git a/docs/news.rst b/docs/news.rst
index 7d78a662..6a805e48 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -2,6 +2,16 @@
 ispc News
 =========
 
+ispc 1.6.0 is Released
+----------------------
+
+A major update of ``ispc`` has been released. The main focus is on improved 
+performance and stability. Several new targets were added. There are also 
+a number of language and library extensions. Released binaries are based on
+patched LLVM 3.3 on Linux and MacOS and LLVM 3.4rc3 on Windows. Please refer
+to Release Notes for complete set of changes.
+
+
 ispc 1.5.0 is Released
 ----------------------
 
diff --git a/doxygen.cfg b/doxygen.cfg
index a0ad3176..9a8f88e5 100644
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-PROJECT_NUMBER         = 1.5.1dev
+PROJECT_NUMBER         = 1.6.1dev
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj
index c46ee41a..298be2cb 100644
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -1,5 +1,23 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/common.props b/examples/common.props
index 7bf37005..3769330b 100644
--- a/examples/common.props
+++ b/examples/common.props
@@ -146,24 +146,24 @@
   <PropertyGroup Label="User">
     <ISPC_compiler Condition=" '$(ISPC_compiler)' == '' ">ispc</ISPC_compiler>
     <Target_str Condition=" '$(Target_str)' == '' ">$(default_targets)</Target_str>
-    <Target_out>$(TargetDir)$(ISPC_file).obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse2')))">$(Target_out);$(TargetDir)$(ISPC_file)_sse2.obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse4')))">$(Target_out);$(TargetDir)$(ISPC_file)_sse4.obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1-')))">$(Target_out);$(TargetDir)$(ISPC_file)_avx.obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1.1')))">$(Target_out);$(TargetDir)$(ISPC_file)_avx11.obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx2')))">$(Target_out);$(TargetDir)$(ISPC_file)_avx2.obj</Target_out>
+    <Target_out>$(ISPC_file).obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse2')))">$(Target_out);$(ISPC_file)_sse2.obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse4')))">$(Target_out);$(ISPC_file)_sse4.obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1-')))">$(Target_out);$(ISPC_file)_avx.obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1.1')))">$(Target_out);$(ISPC_file)_avx11.obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx2')))">$(Target_out);$(ISPC_file)_avx2.obj</Target_out>
   </PropertyGroup>
   <ItemGroup>
     <CustomBuild Include='$(ISPC_file).ispc'>
       <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str)</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str)</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Target_out);$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Target_out);$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str)</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str)</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Target_out);$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Target_out);$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Target_out)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Target_out)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Target_out)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Target_out)</Outputs>
     </CustomBuild>
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj
index cd361b26..974e870b 100755
--- a/examples/deferred/deferred_shading.vcxproj
+++ b/examples/deferred/deferred_shading.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index fa794276..0aa8a3f6 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -1472,31 +1472,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec16_i32 val,
     return count;
 }
 
+
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec16_i32 val,
+                                                 __vec16_i1 mask) {
+    int count = 0;
+    int32_t *ptr_ = ptr;
+    for (int i = 0; i < 16; ++i) {
+        *ptr = val.v[i];
+        ptr += mask.v & 1;
+        mask.v = mask.v >> 1;
+    }
+    return ptr - ptr_;
+}
+
+
 static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr,
                                                 __vec16_i32 *val,
                                                 __vec16_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 16; ++i) {
-        if ((mask.v & (1 << i)) != 0) {
-            val->v[i] = *ptr++;
-            ++count;
-        }
-    }
-    return count;
+    return __packed_load_active((int32_t *)ptr, val, mask);
 }
 
 
 static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, 
                                                  __vec16_i32 val,
                                                  __vec16_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 16; ++i) {
-        if ((mask.v & (1 << i)) != 0) {
-            *ptr++ = val.v[i];
-            ++count;
-        }
-    }
-    return count;
+    return __packed_store_active((int32_t *)ptr, val, mask);
+}
+
+
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr,
+                                                 __vec16_i32 val,
+                                                 __vec16_i1 mask) {
+    return __packed_store_active2((int32_t *)ptr, val, mask);
 }
 
 
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index 531ed215..924b049d 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -1523,31 +1523,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec32_i32 val,
     return count;
 }
 
+
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec32_i32 val,
+                                                 __vec32_i1 mask) {
+    int count = 0;
+    int32_t *ptr_ = ptr;
+    for (int i = 0; i < 32; ++i) {
+        *ptr = val.v[i];
+        ptr += mask.v & 1;
+        mask.v = mask.v >> 1;
+    }
+    return ptr - ptr_;
+}
+
+
 static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr,
                                                 __vec32_i32 *val,
                                                 __vec32_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 32; ++i) {
-        if ((mask.v & (1 << i)) != 0) {
-            val->v[i] = *ptr++;
-            ++count;
-        }
-    }
-    return count;
+    return __packed_load_active((int32_t *)ptr, val, mask);
 }
 
 
 static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, 
                                                  __vec32_i32 val,
                                                  __vec32_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 32; ++i) {
-        if ((mask.v & (1 << i)) != 0) {
-            *ptr++ = val.v[i];
-            ++count;
-        }
-    }
-    return count;
+    return __packed_store_active((int32_t *)ptr, val, mask);
+}
+
+
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr,
+                                                 __vec32_i32 val,
+                                                 __vec32_i1 mask) {
+    return __packed_store_active2((int32_t *)ptr, val, mask);
 }
 
 
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index bbeb007a..b1451c96 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -1656,31 +1656,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec64_i32 val,
     return count;
 }
 
+
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec64_i32 val,
+                                                 __vec64_i1 mask) {
+    int count = 0;
+    int32_t *ptr_ = ptr;
+    for (int i = 0; i < 64; ++i) {
+        *ptr = val.v[i];
+        ptr += mask.v & 1;
+        mask.v = mask.v >> 1;
+    }
+    return ptr - ptr_;
+}
+
+
 static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr,
                                                 __vec64_i32 *val,
                                                 __vec64_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 64; ++i) {
-        if ((mask.v & (1ull << i)) != 0) {
-            val->v[i] = *ptr++;
-            ++count;
-        }
-    }
-    return count;
+    return __packed_load_active((int32_t *) ptr, val, mask);
 }
 
 
 static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, 
                                                  __vec64_i32 val,
                                                  __vec64_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 64; ++i) {
-        if ((mask.v & (1ull << i)) != 0) {
-            *ptr++ = val.v[i];
-            ++count;
-        }
-    }
-    return count;
+    return __packed_store_active((int32_t *) ptr, val, mask);
+}
+
+
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr,
+                                                 __vec64_i32 val,
+                                                 __vec64_i1 mask) {
+    return __packed_store_active2((int32_t *) ptr, val, mask);
 }
 
 
diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h
index ef14d26e..141c47bb 100644
--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -2451,20 +2451,24 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, _
   return _mm_countbits_32(uint32_t(mask));
 }
 
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) 
+{
+  return __packed_store_active(p, val, mask);
+}
+
 static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask)
 {
-  __vec16_i32 v = __load<64>(val);
-  v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-  v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-  __store<64>(val, v);
-  return _mm_countbits_32(uint32_t(mask));
+  return __packed_load_active((uint32_t *)p, val, mask);
 }
 
 static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) 
 {
-  _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-  _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-  return _mm_countbits_32(uint32_t(mask));
+  return __packed_store_active((uint32_t *)p, val, mask);
+}
+
+static FORCEINLINE int32_t __packed_store_active2(int32_t *p, __vec16_i32 val, __vec16_i1 mask) 
+{
+  return __packed_store_active(p, val, mask);
 }
 
 ///////////////////////////////////////////////////////////////////////////
diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h
index d7696117..32f39c4a 100644
--- a/examples/intrinsics/knc-i1x8.h
+++ b/examples/intrinsics/knc-i1x8.h
@@ -2496,20 +2496,23 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val,
     _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     return _mm_countbits_32(uint32_t(0xFF & mask));
 }
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
+    return __packed_store_active(ptr, val, mask);
+}
 static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val,
                                                 __vec8_i1 mask) {
-    __vec8_i32 v = __load<64>(val);
-    v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    __store<64>(val, v);
-    return _mm_countbits_32(uint32_t(0xFF & mask));
+    return __packed_load_active((uint32_t *)p, val, mask);
 }
 static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val,
                                                  __vec8_i1 mask) {
-    _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    return _mm_countbits_32(uint32_t(0xFF & mask));
+    return __packed_store_active((uint32_t *)p, val, mask);
 }
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
+    return __packed_store_active(ptr, val, mask);
+}
+
 #endif
 
 ///////////////////////////////////////////////////////////////////////////
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 8baef8cb..0077ad88 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -1260,6 +1260,13 @@ static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i3
     return __vec16_i64(val.v, _mm512_setzero_epi32());
 }
 
+static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val)
+{
+  __vec16_i32 ret = _mm512_setzero_epi32();
+  __vec16_i32 one = _mm512_set1_epi32(-1);
+  return _mm512_mask_mov_epi32(ret, val, one);
+}
+
 static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val)
 {
     __vec16_i32 ret = _mm512_setzero_epi32();
@@ -1878,6 +1885,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val,
     return _mm_countbits_32(uint32_t(mask));
 }
 
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask)
+{
+  return __packed_store_active(p, val, mask);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // prefetch
 ///////////////////////////////////////////////////////////////////////////
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 919716be..5dd424d9 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -3798,6 +3798,25 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec4_i32 val,
     return count;
 }
 
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
+    int count = 0;
+
+    ptr[count] = _mm_extract_epi32(val.v, 0);
+    count -= _mm_extract_ps(mask.v, 0);
+
+    ptr[count] = _mm_extract_epi32(val.v, 1);
+    count -= _mm_extract_ps(mask.v, 1);
+
+    ptr[count] = _mm_extract_epi32(val.v, 2);
+    count -= _mm_extract_ps(mask.v, 2);
+
+    ptr[count] = _mm_extract_epi32(val.v, 3);
+    count -= _mm_extract_ps(mask.v, 3);
+
+    return count;
+}
+
 static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec4_i32 *val,
                                                 __vec4_i1 mask) {
     return __packed_load_active((int32_t *)ptr, val, mask);
@@ -3808,6 +3827,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec4_i32 val,
     return __packed_store_active((int32_t *)ptr, val, mask);
 }
 
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
+    return __packed_store_active2((int32_t *)ptr, val, mask);
+}
+
 
 ///////////////////////////////////////////////////////////////////////////
 // aos/soa
diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj
index e7703ad0..7a5f6e03 100644
--- a/examples/mandelbrot/mandelbrot.vcxproj
+++ b/examples/mandelbrot/mandelbrot.vcxproj
@@ -1,5 +1,23 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
index f8b8cfcb..113fc4e8 100644
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{E80DA7D4-AB22-4648-A068-327307156BE6}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj
index 7adc57f3..ff3953ae 100644
--- a/examples/noise/noise.vcxproj
+++ b/examples/noise/noise.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj
index af336aa1..d48ac8bc 100644
--- a/examples/options/options.vcxproj
+++ b/examples/options/options.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj
index ea34de56..00b6dd3a 100644
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj
index 43f2b439..b0bdc63d 100644
--- a/examples/sort/sort.vcxproj
+++ b/examples/sort/sort.vcxproj
@@ -1,5 +1,23 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj
index b5f5bb22..fd8564aa 100644
--- a/examples/stencil/stencil.vcxproj
+++ b/examples/stencil/stencil.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp
index f8b2dfec..77269f9f 100644
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -170,17 +170,44 @@
 
 // Signature of ispc-generated 'task' functions
 typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
-                             int taskIndex, int taskCount);
+                             int taskIndex, int taskCount,
+                             int taskIndex0, int taskIndex1, int taskIndex2,
+                             int taskCount0, int taskCount1, int taskCount2);
 
 // Small structure used to hold the data for each task
+#ifdef _MSC_VER
+__declspec(align(16))
+#endif
 struct TaskInfo {
     TaskFuncType func;
     void *data;
-    int taskIndex, taskCount;
+    int taskIndex;
+    int taskCount3d[3];
 #if defined(ISPC_IS_WINDOWS)
     event taskEvent;
 #endif
-};
+    int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
+    int taskIndex0() const 
+    {
+      return taskIndex % taskCount3d[0];
+    }
+    int taskIndex1() const 
+    {
+      return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
+    }
+    int taskIndex2() const 
+    {
+      return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
+    }
+    int taskCount0() const { return taskCount3d[0]; }
+    int taskCount1() const { return taskCount3d[1]; }
+    int taskCount2() const { return taskCount3d[2]; }
+    TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); }
+}
+#ifndef _MSC_VER
+__attribute__((aligned(32)));
+#endif
+;
 
 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
@@ -518,7 +545,9 @@ lRunTask(void *ti) {
 
     // Actually run the task
     taskInfo->func(taskInfo->data, threadIndex, threadCount, 
-                   taskInfo->taskIndex, taskInfo->taskCount);
+                   taskInfo->taskIndex, taskInfo->taskCount(),
+            taskInfo->taskIndex0(), taskInfo->taskIndex1(), taskInfo->taskIndex2(),
+            taskInfo->taskCount0(), taskInfo->taskCount1(), taskInfo->taskCount2());
 }
 
 
@@ -559,7 +588,9 @@ lRunTask(LPVOID param) {
     // will cause bugs in code that uses those.
     int threadIndex = 0;
     int threadCount = 1;
-    ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+    ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
 
     // Signal the event that this task is done
     ti->taskEvent.set();
@@ -660,7 +691,9 @@ lTaskEntry(void *arg) {
         DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
         TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
         myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
-                     myTask->taskCount);
+                     myTask->taskCount(),
+            myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(),
+            myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2());
 
         //
         // Decrement the "number of unfinished tasks" counter in the task
@@ -871,7 +904,9 @@ TaskGroup::Sync() {
         // Do work for _myTask_
         //
         // FIXME: bogus values for thread index/thread count here as well..
-        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);
+        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(),
+            myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(),
+            myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2());
 
         //
         // Decrement the number of unfinished tasks counter
@@ -901,7 +936,9 @@ TaskGroup::Launch(int baseIndex, int count) {
 
         // Actually run the task. 
         // Cilk does not expose the task -> thread mapping so we pretend it's 1:1
-        ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount);
+        ti->func(ti->data, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
     }
 }
 
@@ -930,7 +967,9 @@ TaskGroup::Launch(int baseIndex, int count) {
         // Actually run the task. 
         int threadIndex = omp_get_thread_num();
         int threadCount = omp_get_num_threads();
-        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
     }
 }
 
@@ -961,7 +1000,9 @@ TaskGroup::Launch(int baseIndex, int count) {
         int threadIndex = ti->taskIndex;
         int threadCount = ti->taskCount;
 
-        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
     });
 }
 
@@ -988,7 +1029,9 @@ TaskGroup::Launch(int baseIndex, int count) {
             // TBB does not expose the task -> thread mapping so we pretend it's 1:1
             int threadIndex = ti->taskIndex;
             int threadCount = ti->taskCount;
-            ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+            ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
         });
     }
 }
@@ -1041,7 +1084,8 @@ FreeTaskGroup(TaskGroup *tg) {
 ///////////////////////////////////////////////////////////////////////////
 
 void
-ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
+ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) {
+    const int count = count0*count1*count2;
     TaskGroup *taskGroup;
     if (*taskGroupPtr == NULL) {
         InitTaskSystem();
@@ -1057,7 +1101,9 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
         ti->func = (TaskFuncType)func;
         ti->data = data;
         ti->taskIndex = i;
-        ti->taskCount = count;
+        ti->taskCount3d[0] = count0;
+        ti->taskCount3d[1] = count1;
+        ti->taskCount3d[2] = count2;
     }
     taskGroup->Launch(baseIndex, count);
 }
diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj
index cc738a7e..a1fea5f1 100644
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{dee5733a-e93e-449d-9114-9bffcaeb4df9}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/ispc.h b/ispc.h
index 652390d2..d649b6cd 100644
--- a/ispc.h
+++ b/ispc.h
@@ -38,7 +38,7 @@
 #ifndef ISPC_H
 #define ISPC_H
 
-#define ISPC_VERSION "1.5.1dev"
+#define ISPC_VERSION "1.6.1dev"
 
 #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)
 #error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported"
diff --git a/opt.cpp b/opt.cpp
index c75d4225..ff7ee410 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -5153,6 +5153,11 @@ FixBooleanSelectPass::runOnFunction(llvm::Function &F) {
     // LLVM 3.3 only
 #if defined(LLVM_3_3)
 
+    // Don't optimize generic targets.
+    if (g->target->getISA() == Target::GENERIC) {
+        return false;
+    }
+
     for (llvm::Function::iterator I = F.begin(), E = F.end();
          I != E; ++I) {
         llvm::BasicBlock* bb = &*I;