diff --git a/alloy.py b/alloy.py
index 657e67bf..57d8df1e 100755
--- a/alloy.py
+++ b/alloy.py
@@ -89,7 +89,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
     if  version_LLVM == "trunk":
         SVN_PATH="trunk"
     if  version_LLVM == "3.4":
-        SVN_PATH="tags/RELEASE_34/rc1"
+        SVN_PATH="tags/RELEASE_34/final"
         version_LLVM = "3_4"
     if  version_LLVM == "3.3":
         SVN_PATH="tags/RELEASE_33/final"
@@ -129,8 +129,23 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
         try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ",
                     "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang",
                     from_validation)
+        os.chdir("..")
+        if current_OS == "MacOS" and int(current_OS_version.split(".")[0]) >= 13:
+            # Starting with MacOS 10.9 Maverics, the system doesn't contain headers for standard C++ library and
+            # the default library is libc++, bit libstdc++. The headers are part of XCode now. But we are checking out
+            # headers as part of LLVM source tree, so they will be installed in clang location and clang will be able
+            # to find them. Though they may not match to the library installed in the system, but seems that this should
+            # not happen.
+            # Note, that we can also build a libc++ library, but it must be on system default location or should be passed
+            # to the linker explicitly (either through command line or environment variables). So we are not doing it
+            # currently to make the build process easier.
+            os.chdir("projects")
+            try_do_LLVM("load libcxx http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " ",
+                    "svn co " + revision + " http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " libcxx",
+                    from_validation)
+            os.chdir("..")
         if extra == True:
-            os.chdir("./clang/tools")
+            os.chdir("tools/clang/tools")
             try_do_LLVM("load extra clang extra tools ",
                     "svn co " + revision + " http://llvm.org/svn/llvm-project/clang-tools-extra/" + SVN_PATH + " extra",
                     from_validation)
@@ -138,7 +153,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
             try_do_LLVM("load extra clang compiler-rt ",
                     "svn co " + revision + " http://llvm.org/svn/llvm-project/compiler-rt/" + SVN_PATH + " compiler-rt",
                     from_validation)
-        os.chdir("../")
+            os.chdir("..")
     else:
         tar = tarball.split(" ")
         os.makedirs(LLVM_SRC) 
@@ -563,6 +578,8 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
 
 def Main():
     global current_OS
+    global current_OS_version
+    current_OS_version = platform.release()
     if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True:
         current_OS = "Windows"
     else:
@@ -584,7 +601,7 @@ def Main():
         if os.environ.get("SMTP_ISPC") == None:
             error("you have no SMTP_ISPC in your environment for option notify", 1)
     if options.only != "":
-        test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native "
+        test_only_r = " 3.1 3.2 3.3 3.4 trunk current build stability performance x86 x86-64 -O0 -O2 native "
         test_only = options.only.split(" ")
         for iterator in test_only:
             if not (" " + iterator + " " in test_only_r):
diff --git a/ast.cpp b/ast.cpp
index 83ee207d..60b20a80 100644
--- a/ast.cpp
+++ b/ast.cpp
@@ -223,7 +223,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         else if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
             fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data);
             fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data);
-            fce->launchCountExpr = (Expr *)WalkAST(fce->launchCountExpr, preFunc,
+            for (int k = 0; k < 3; k++)
+              fce->launchCountExpr[0] = (Expr *)WalkAST(fce->launchCountExpr[0], preFunc,
                                                    postFunc, data);
         }
         else if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL) {
diff --git a/builtins.cpp b/builtins.cpp
index 2afd92d9..6be41f13 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -488,6 +488,7 @@ lSetInternalFunctions(llvm::Module *module) {
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",
+        "__packed_store_active2",
         "__popcnt_int32",
         "__popcnt_int64",
         "__prefetch_read_uniform_1",
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index 910565dd..c43a12a7 100644
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -3,6 +3,7 @@
 define(`MASK',`i32')
 define(`WIDTH',`1')
 include(`util.m4')
+rdrand_decls()
 ; Define some basics for a 1-wide target
 stdlib_core()
 packed_load_and_store()
@@ -655,7 +656,7 @@ define  <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
 declare  <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline 
 declare  <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline 
 declare  <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline 
-declare  void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline 
+declare  void @__svml_sincosd(<1 x float>, <1 x double> *, <1 x double> *) nounwind readnone alwaysinline
 declare  <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline 
 declare  <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline 
 declare  <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline 
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 92b7a18e..2b2b21c9 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -371,6 +371,8 @@ declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
                                   <WIDTH x i1>) nounwind
 declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
                                    <WIDTH x i1>) nounwind
+declare i32 @__packed_store_active2(i32 * nocapture, <WIDTH x i32> %vals,
+                                   <WIDTH x i1>) nounwind
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/builtins/util.m4 b/builtins/util.m4
index e1c9bf97..f9ae7cd1 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1831,7 +1831,7 @@ define(`stdlib_core', `
 declare i32 @__fast_masked_vload()
 
 declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
-declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
+declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32) nounwind
 declare void @ISPCSync(i8*) nounwind
 declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind
 
@@ -3815,6 +3815,51 @@ loopend:
 done:
   ret i32 %nextoffset
 }
+
+define MASK @__packed_store_active2(i32 * %startptr, <WIDTH x i32> %vals,
+                                   <WIDTH x MASK> %full_mask) nounwind alwaysinline {
+entry:
+  %mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
+  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
+  br i1 %mask_known, label %known_mask, label %unknown_mask
+
+known_mask:
+  %allon = icmp eq i64 %mask, ALL_ON_MASK
+  br i1 %allon, label %all_on, label %unknown_mask
+ 
+all_on:
+  %vecptr = bitcast i32 *%startptr to <WIDTH x i32> *
+  store <WIDTH x i32> %vals, <WIDTH x i32> * %vecptr, align 4
+  ret MASK WIDTH
+ 
+unknown_mask:
+  br label %loop
+ 
+loop:
+  %offset = phi MASK [ 0, %unknown_mask ], [ %ch_offset, %loop ]
+  %i = phi i32 [ 0, %unknown_mask ], [ %ch_i, %loop ]
+  %storeval = extractelement <WIDTH x i32> %vals, i32 %i
+
+;; Offset has value in range from 0 to WIDTH-1. So it does not matter if we
+;; zero or sign extending it, while zero extend is free. Also do nothing for
+;; i64 MASK, as we need i64 value.
+ifelse(MASK, `i64',
+` %storeptr = getelementptr i32 *%startptr, MASK %offset',
+` %offset1 = zext MASK %offset to i64
+  %storeptr = getelementptr i32 *%startptr, i64 %offset1')
+  store i32 %storeval, i32 *%storeptr
+
+  %mull_mask = extractelement <WIDTH x MASK> %full_mask, i32 %i
+  %ch_offset = sub MASK %offset, %mull_mask
+ 
+  ; are we done yet?
+  %ch_i = add i32 %i, 1
+  %test = icmp ne i32 %ch_i, WIDTH
+  br i1 %test, label %loop, label %done
+ 
+done:
+  ret MASK %ch_offset
+}
 ')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/cbackend.cpp b/cbackend.cpp
index 40f87074..3db2d504 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -660,7 +660,7 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out,
 llvm::raw_ostream &
 CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned,
                          const std::string &NameSoFar) {
-  assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) &&
+  assert((Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || Ty->isVectorTy() || Ty->isVoidTy()) &&
          "Invalid type for printSimpleType");
   switch (Ty->getTypeID()) {
   case llvm::Type::VoidTyID:   return Out << "void " << NameSoFar;
@@ -756,7 +756,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty,
 #endif
                                       ) {
 
-  if (Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) {
+  if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || Ty->isVectorTy() || Ty->isVoidTy()) {
     printSimpleType(Out, Ty, isSigned, NameSoFar);
     return Out;
   }
@@ -2737,7 +2737,7 @@ void CWriter::printModuleTypes() {
 void CWriter::printContainedStructs(llvm::Type *Ty,
                                     llvm::SmallPtrSet<llvm::Type *, 16> &Printed) {
   // Don't walk through pointers.
-  if (Ty->isPointerTy() || Ty->isPrimitiveType() || Ty->isIntegerTy())
+  if (!(Ty->isStructTy() || Ty->isArrayTy()))
     return;
 
   // Print all contained types first.
diff --git a/common.py b/common.py
old mode 100644
new mode 100755
index be3e9526..2a788722
--- a/common.py
+++ b/common.py
@@ -121,4 +121,7 @@ def check_tools(m):
                         if int(t11[j])<input_tools[t][0][j]:
                             error(input_tools[t][2], m)
                             ret = 0
+                            break
+                        if int(t11[j])>input_tools[t][0][j]:
+                            break
     return ret
diff --git a/contrib/ispc.vim b/contrib/ispc.vim
index f3cb413b..11808658 100644
--- a/contrib/ispc.vim
+++ b/contrib/ispc.vim
@@ -18,6 +18,7 @@ syn keyword	ispcConditional	cif
 syn keyword	ispcRepeat	cdo cfor cwhile
 syn keyword	ispcBuiltin	programCount programIndex	
 syn keyword	ispcType	export uniform varying int8 int16 int32 int64
+syn keyword	ispcOperator	operator
 
 "double precision floating point number, with dot, optional exponent
 syn match	cFloat		display contained "\d\+\.\d*d[-+]\=\d*\>"
@@ -33,6 +34,7 @@ HiLink ispcConditional	Conditional
 HiLink ispcRepeat	Repeat
 HiLink ispcBuiltin	Statement
 HiLink ispcType		Type
+HiLink ispcOperator	Operator
 delcommand HiLink
 
 let b:current_syntax = "ispc"
diff --git a/ctx.cpp b/ctx.cpp
index c1a7e61a..6ff26c6a 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -316,7 +316,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
             llvm::BasicBlock *offBB =
                    llvm::BasicBlock::Create(*g->ctx, "entry",
                                             (llvm::Function *)offFunc, 0);
-            new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
+            llvm::StoreInst *inst =
+                new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
+            if (g->opt.forceAlignedMemory) {
+                inst->setAlignment(g->target->getNativeVectorAlignment());
+            }
             llvm::ReturnInst::Create(*g->ctx, offBB);
         }
 
@@ -2437,7 +2441,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
     if (name == NULL)
         name = LLVMGetName(ptr, "_load");
 
-    llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock);
+    llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
+
+    if (g->opt.forceAlignedMemory &&
+        llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
+        inst->setAlignment(g->target->getNativeVectorAlignment());
+    }
+
     AddDebugPos(inst);
     return inst;
 }
@@ -2719,7 +2729,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
         inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);
 
     // If no alignment was specified but we have an array of a uniform
-    // type, then align it to 4 * the native vector width; it's not
+    // type, then align it to the native vector alignment; it's not
     // unlikely that this array will be loaded into varying variables with
     // what will be aligned accesses if the uniform -> varying load is done
     // in regular chunks.
@@ -2727,7 +2737,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
         llvm::dyn_cast<llvm::ArrayType>(llvmType);
     if (align == 0 && arrayType != NULL &&
         !llvm::isa<llvm::VectorType>(arrayType->getElementType()))
-        align = 4 * g->target->getNativeVectorWidth();
+        align = g->target->getNativeVectorAlignment();
 
     if (align != 0)
         inst->setAlignment(align);
@@ -2986,7 +2996,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
         return;
     }
 
-    llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock);
+    llvm::PointerType *pt =
+        llvm::dyn_cast<llvm::PointerType>(ptr->getType());
+    AssertPos(currentPos, pt != NULL);
+
+    llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);
+
+    if (g->opt.forceAlignedMemory &&
+        llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
+        inst->setAlignment(g->target->getNativeVectorAlignment());
+    }
+
     AddDebugPos(inst);
 }
 
@@ -3502,7 +3522,7 @@ FunctionEmitContext::ReturnInst() {
 llvm::Value *
 FunctionEmitContext::LaunchInst(llvm::Value *callee,
                                 std::vector<llvm::Value *> &argVals,
-                                llvm::Value *launchCount) {
+                                llvm::Value *launchCount[3]){
     if (callee == NULL) {
         AssertPos(currentPos, m->errorCount > 0);
         return NULL;
@@ -3563,7 +3583,9 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     args.push_back(launchGroupHandlePtr);
     args.push_back(fptr);
     args.push_back(voidmem);
-    args.push_back(launchCount);
+    args.push_back(launchCount[0]);
+    args.push_back(launchCount[1]);
+    args.push_back(launchCount[2]);
     return CallInst(flaunch, NULL, args, "");
 }
 
diff --git a/ctx.h b/ctx.h
index 58f9aae3..4dd30053 100644
--- a/ctx.h
+++ b/ctx.h
@@ -542,7 +542,7 @@ public:
         he given argument values. */
     llvm::Value *LaunchInst(llvm::Value *callee,
                             std::vector<llvm::Value *> &argVals,
-                            llvm::Value *launchCount);
+                            llvm::Value *launchCount[3]);
 
     void SyncInst();
 
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index a8575ea0..b7d0bb17 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,47 @@
+=== v1.6.0 === (19 December 2013)
+
+A major new version of ISPC with major improvements in performance and
+stability. Linux and MacOS binaries are based on patched version of LLVM 3.3,
+while Windows version is based on LLVM 3.4rc3. LLVM 3.4 significantly improves
+stability on Win32 platform, so we've decided not to wait for official LLVM 3.4
+release.
+
+The list of the most significant changes is:
+
+* New avx1-i32x4 target was added. It may play well for you, if you are focused
+  on integer computations or FP unit in your hardware is 128 bit wide.
+
+* Support for calculations in double precision was extended with two new
+  targets avx1.1-i64x4 and avx2-i64x4.
+
+* Language support for overloaded operators was added.
+
+* New library shift() function was added, which is similar to rotate(), but is
+  non-circular.
+
+* The language was extended to accept 3 dimensional tasking - a syntactic sugar,
+  which may facilitate programming of some tasks.
+
+* Regression, which broke --opt=force-aligned-memory is fixed.
+
+If you are not using pre-built binaries, you may notice the following changes:
+
+* VS2012/VS2013 are supported.
+
+* alloy.py (with -b switch) can build LLVM for you on any platform now
+  (except MacOS 10.9, but we know about the problem and working on it).
+  This is a preferred way to build LLVM for ISPC, as all required patches for
+  better performance and stability will automatically apply.
+
+* LLVM 3.5 (current trunk) is supported.
+
+There are also multiple fixes for better performance and stability, most
+notable are:
+
+* Fixed performance problem for x2 targets.
+
+* Fixed a problem with incorrect vzeroupper insertion on AVX target on Win32.
+
 === v1.5.0 === (27 September 2013)
 
 A major new version of ISPC with several new targets and important bug fixes.
diff --git a/docs/ispc.rst b/docs/ispc.rst
index 93b6ac9b..9464dcde 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -48,6 +48,8 @@ Contents:
   + `Updating ISPC Programs For Changes In ISPC 1.1`_
   + `Updating ISPC Programs For Changes In ISPC 1.2`_
   + `Updating ISPC Programs For Changes In ISPC 1.3`_
+  + `Updating ISPC Programs For Changes In ISPC 1.5.0`_
+  + `Updating ISPC Programs For Changes In ISPC 1.6.0`_
 
 * `Getting Started with ISPC`_
 
@@ -97,6 +99,9 @@ Contents:
     * `Short Vector Types`_
     * `Array Types`_
     * `Struct Types`_
+
+      + `Operators Overloading`_
+
     * `Structure of Array Types`_
 
   + `Declarations and Initializers`_
@@ -279,6 +284,15 @@ Double precision floating point constants are floating point number with
 31.4d-1, 1.d, 1.0d, 1d-2. Note that floating point number without suffix is
 treated as single precision constant.
 
+Updating ISPC Programs For Changes In ISPC 1.6.0
+------------------------------------------------
+
+This release adds support for `Operators Overloading`_, so a word ``operator``
+becomes a keyword and it potentially creates a conflict with existing user 
+function. Also a new library function packed_store_active2() was introduced,
+which also may create a conflict with existing user functions.
+
+
 Getting Started with ISPC
 =========================
 
@@ -1325,6 +1339,7 @@ in C:
 * Function overloading by parameter type
 * Hexadecimal floating-point constants
 * Dynamic memory allocation with ``new`` and ``delete``.
+* Limited support for overloaded operators (`Operators Overloading`_).
 
 ``ispc`` also adds a number of new features that aren't in C89, C99, or
 C++:
@@ -2122,7 +2137,35 @@ above code, the value of ``f[index]`` needs to be able to store a different
 value of ``Foo::a`` for each program instance.  However, a ``varying Foo``
 still has only a single ``a`` member, since ``a`` was declared with
 ``uniform`` variability in the declaration of ``Foo``.  Therefore, the
-indexing operation in the last line results in an error.  
+indexing operation in the last line results in an error.
+
+
+Operators Overloading
+---------------------
+
+ISPC has limited support for overloaded operators for ``struct`` types. Only
+binary operators are supported currently, namely they are: ``*, /, %, +, -, >>
+and <<``. Operators overloading support is similar to the one in C++ language.
+To overload an operator for ``struct S``, you need to declare and implement a
+function using keyword ``operator``, which accepts two parameters of type
+``struct S`` or ``struct S&`` and returns either of these types. For example:
+
+::
+
+    struct S { float re, im;};
+    struct S operator*(struct S a, struct S b) {
+        struct S result;
+        result.re = a.re * b.re - a.im * b.im;
+        result.im = a.re * b.im + a.im * b.re;
+        return result;
+    }
+
+    void foo(struct S a, struct S b) {
+        struct S mul = a*b;
+        print("a.re:   %\na.im:   %\n", a.re, a.im);
+        print("b.re:   %\nb.im:   %\n", b.re, b.im);
+        print("mul.re: %\nmul.im: %\n", mul.re, mul.im);
+    }
 
 
 Structure of Array Types
@@ -3015,8 +3058,7 @@ Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and
 for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as
 appropriate.  Alternatively, ``ispc`` also has support for launching tasks
 from ``ispc`` code.  The approach is similar to Intel® Cilk's task launch
-feature.  (See the ``examples/mandelbrot_tasks`` example to see it used in
-a small example.)
+feature.  (Check the ``examples/mandelbrot_tasks`` example to see how it is used.)
 
 Any function that is launched as a task must be declared with the
 ``task`` qualifier:
@@ -3111,6 +3153,38 @@ executing the current task.  The ``threadIndex`` can be used for accessing
 data that is private to the current thread and thus doesn't require
 synchronization to access under parallel execution.
 
+The tasking system also supports multi-dimensional partitioning (currently up
+to three dimensions). To launch a 3D grid of tasks, for example with ``N0``,
+``N1``  and ``N2`` tasks in x-, y- and z-dimension respectively
+
+::
+   
+  float data[N2][N1][N0]
+  task void foo_task()
+  {
+     data[taskIndex2][taskIndex1][threadIndex0] = taskIndex;
+  }
+
+we use the following ``launch`` expressions:
+
+::
+
+  launch [N2][N1][N0] foo_task()
+
+or
+
+::
+
+  launch [N0,N1,N2] foo_task()
+
+Value of ``taskIndex`` is equal to ``taskIndex0 + taskCount0*(taskIndex1 +
+taskCount1*taskIndex2)`` and it ranges from ``0`` to  ``taskCount-1``, where 
+``taskCount = taskCount0*taskCount1*taskCount2``. If ``N1`` or/and ``N2`` are
+not specified in the ``launch`` expression, a value of ``1`` is assumed.
+Finally, for an one-dimensional grid of tasks,  ``taskIndex`` is equivalent to
+``taskIndex0`` and ``taskCount`` is equivalent to ``taskCount0``.
+
+
 Task Parallelism: Runtime Requirements
 --------------------------------------
 
@@ -3141,7 +3215,7 @@ manage tasks in ``ispc``:
 ::
 
     void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
-    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
+    void ISPCLaunch(void **handlePtr, void *f, void *data, int count0, int count1, int count2);
     void ISPCSync(void *handle);
 
 All three of these functions take an opaque handle (or a pointer to an
@@ -3178,16 +3252,20 @@ tasks.  Each ``launch`` statement in ``ispc`` code causes a call to
 after the handle pointer to the function are relatively straightforward;
 the ``void *f`` parameter holds a pointer to a function to call to run the
 work for this task, ``data`` holds a pointer to data to pass to this
-function, and ``count`` is the number of instances of this function to
-enqueue for asynchronous execution.  (In other words, ``count`` corresponds
-to the value ``n`` in a multiple-task launch statement like ``launch[n]``.)
+function, and ``count0``, ``count1`` and ``count2`` are the number of instances
+of this function to enqueue for asynchronous execution.  (In other words,
+``count0``, ``count1`` and ``count2`` correspond to the value ``n0``, ``n1``
+and ``n2`` in a multiple-task launch statement like ``launch[n2][n1][n0]`` or
+``launch [n0,n1,n2]`` respectively.)
 
 The signature of the provided function pointer ``f`` is
 
 ::
 
     void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount,
-                        int taskIndex, int taskCount)
+                        int taskIndex, int taskCount,
+                        int taskIndex0, int taskIndex1, int taskIndex2,
+                        int taskCount0, int taskCount1, int taskCount2);
 
 When this function pointer is called by one of the hardware threads managed
 by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should
@@ -3197,11 +3275,14 @@ number of hardware threads that have been spawned to run tasks and
 uniquely identifying the hardware thread that is running the task.  (These
 values can be used to index into thread-local storage.)
 
-The value of ``taskCount`` should be the number of tasks launched in the
-``launch`` statement that caused the call to ``ISPCLaunch()`` and each of
-the calls to this function should be given a unique value of ``taskIndex``
-between zero and ``taskCount``, to distinguish which of the instances
-of the set of launched tasks is running.
+The value of ``taskCount`` should be the total number of tasks launched in the
+``launch`` statement (it must be equal to ``taskCount0*taskCount1*taskCount2``)
+that caused the call to ``ISPCLaunch()`` and each of the calls to this function
+should be given a unique value of ``taskIndex``, ``taskIndex0``, ``taskIndex1``
+and ``taskIndex2`` between zero and ``taskCount``, ``taskCount0``,
+``taskCount1`` and ``taskCount2`` respectively,  with ``taskIndex = taskIndex0 
++ taskCount0*(taskIndex1 + taskCount1*taskIndex2)``, to distinguish which of
+the instances of the set of launched tasks is running.
 
 
 
@@ -4012,6 +4093,14 @@ They return the total number of values stored.
                                     unsigned int val)
 
 
+There are also ``packed_store_active2()`` functions with exactly the same
+signatures and the same semantic except that they may write one extra
+element to the output array (but still returning the same value as
+``packed_store_active()``). These functions suggest different branch free 
+implementation on most of supported targets, which usually (but not always)
+performs better than ``packed_store_active()``. It's advised to test function
+performance on user's scenarios on particular target hardware before using it.
+
 As an example of how these functions can be used, the following code shows
 the use of ``packed_store_active()``.
 
diff --git a/docs/news.rst b/docs/news.rst
index 7d78a662..6a805e48 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -2,6 +2,16 @@
 ispc News
 =========
 
+ispc 1.6.0 is Released
+----------------------
+
+A major update of ``ispc`` has been released. The main focus is on improved 
+performance and stability. Several new targets were added. There are also 
+a number of language and library extensions. Released binaries are based on
+patched LLVM 3.3 on Linux and MacOS and LLVM 3.4rc3 on Windows. Please refer
+to Release Notes for complete set of changes.
+
+
 ispc 1.5.0 is Released
 ----------------------
 
diff --git a/doxygen.cfg b/doxygen.cfg
index a0ad3176..9a8f88e5 100644
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-PROJECT_NUMBER         = 1.5.1dev
+PROJECT_NUMBER         = 1.6.1dev
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp
index 2286316d..b4e2833d 100644
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -60,7 +60,7 @@ using namespace ispc;
 
 extern void ao_serial(int w, int h, int nsubsamples, float image[]);
 
-static unsigned int test_iterations;
+static unsigned int test_iterations[] = {3, 7, 1};
 static unsigned int width, height;
 static unsigned char *img;
 static float *fimg;
@@ -106,16 +106,20 @@ savePPM(const char *fname, int w, int h)
 
 int main(int argc, char **argv)
 {
-    if (argc != 4) {
+    if (argc < 3) {
         printf ("%s\n", argv[0]);
-        printf ("Usage: ao [num test iterations] [width] [height]\n");
+        printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n");
         getchar();
         exit(-1);
     }
     else {
-        test_iterations = atoi(argv[1]);
-        width = atoi (argv[2]);
-        height = atoi (argv[3]);
+        if (argc == 6) {
+            for (int i = 0; i < 3; i++) {
+                test_iterations[i] = atoi(argv[3 + i]);
+            }
+        }
+        width = atoi (argv[1]);
+        height = atoi (argv[2]);
     }
 
     // Allocate space for output images
@@ -127,13 +131,14 @@ int main(int argc, char **argv)
     // time for any of them.
     //
     double minTimeISPC = 1e30;
-    for (unsigned int i = 0; i < test_iterations; i++) {
+    for (unsigned int i = 0; i < test_iterations[0]; i++) {
         memset((void *)fimg, 0, sizeof(float) * width * height * 3);
         assert(NSUBSAMPLES == 2);
 
         reset_and_start_timer();
         ao_ispc(width, height, NSUBSAMPLES, fimg);
         double t = get_elapsed_mcycles();
+        printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", t);
         minTimeISPC = std::min(minTimeISPC, t);
     }
 
@@ -147,13 +152,14 @@ int main(int argc, char **argv)
     // minimum time for any of them.
     //
     double minTimeISPCTasks = 1e30;
-    for (unsigned int i = 0; i < test_iterations; i++) {
+    for (unsigned int i = 0; i < test_iterations[1]; i++) {
         memset((void *)fimg, 0, sizeof(float) * width * height * 3);
         assert(NSUBSAMPLES == 2);
 
         reset_and_start_timer();
         ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
         double t = get_elapsed_mcycles();
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", t);
         minTimeISPCTasks = std::min(minTimeISPCTasks, t);
     }
 
@@ -167,11 +173,12 @@ int main(int argc, char **argv)
     // minimum time.
     //
     double minTimeSerial = 1e30;
-    for (unsigned int i = 0; i < test_iterations; i++) {
+    for (unsigned int i = 0; i < test_iterations[2]; i++) {
         memset((void *)fimg, 0, sizeof(float) * width * height * 3);
         reset_and_start_timer();
         ao_serial(width, height, NSUBSAMPLES, fimg);
         double t = get_elapsed_mcycles();
+        printf("@time of serial run:\t\t\t\t[%.3f] million cycles\n", t);
         minTimeSerial = std::min(minTimeSerial, t);
     }
 
diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj
index c46ee41a..298be2cb 100644
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -1,5 +1,23 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/common.props b/examples/common.props
index 7bf37005..3769330b 100644
--- a/examples/common.props
+++ b/examples/common.props
@@ -146,24 +146,24 @@
   <PropertyGroup Label="User">
     <ISPC_compiler Condition=" '$(ISPC_compiler)' == '' ">ispc</ISPC_compiler>
     <Target_str Condition=" '$(Target_str)' == '' ">$(default_targets)</Target_str>
-    <Target_out>$(TargetDir)$(ISPC_file).obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse2')))">$(Target_out);$(TargetDir)$(ISPC_file)_sse2.obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse4')))">$(Target_out);$(TargetDir)$(ISPC_file)_sse4.obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1-')))">$(Target_out);$(TargetDir)$(ISPC_file)_avx.obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1.1')))">$(Target_out);$(TargetDir)$(ISPC_file)_avx11.obj</Target_out>
-    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx2')))">$(Target_out);$(TargetDir)$(ISPC_file)_avx2.obj</Target_out>
+    <Target_out>$(ISPC_file).obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse2')))">$(Target_out);$(ISPC_file)_sse2.obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse4')))">$(Target_out);$(ISPC_file)_sse4.obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1-')))">$(Target_out);$(ISPC_file)_avx.obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1.1')))">$(Target_out);$(ISPC_file)_avx11.obj</Target_out>
+    <Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx2')))">$(Target_out);$(ISPC_file)_avx2.obj</Target_out>
   </PropertyGroup>
   <ItemGroup>
     <CustomBuild Include='$(ISPC_file).ispc'>
       <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str)</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str)</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Target_out);$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Target_out);$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str)</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str)</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Target_out);$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Target_out);$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Target_out)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Target_out)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Target_out)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Target_out)</Outputs>
     </CustomBuild>
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj
index cd361b26..974e870b 100755
--- a/examples/deferred/deferred_shading.vcxproj
+++ b/examples/deferred/deferred_shading.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp
index 4f2be879..d7f62f50 100644
--- a/examples/deferred/main.cpp
+++ b/examples/deferred/main.cpp
@@ -62,10 +62,16 @@
 ///////////////////////////////////////////////////////////////////////////
 
 int main(int argc, char** argv) {
-    if (argc != 2) {
-        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
+    if (argc < 2) {
+        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)> [tasks iterations] [serial iterations]\n");
         return 1;
     }
+    static unsigned int test_iterations[] = {5, 3, 500}; //last value is for nframes, it is scale.
+    if (argc == 5) {
+        for (int i = 0; i < 3; i++) {
+            test_iterations[i] = atoi(argv[2 + i]);
+        }
+    }
 
     InputData *input = CreateInputDataFromFile(argv[1]);
     if (!input) {
@@ -81,9 +87,9 @@ int main(int argc, char** argv) {
     InitDynamicCilk(input);
 #endif // __cilk
 
-    int nframes = 5;
+    int nframes = test_iterations[2];
     double ispcCycles = 1e30;
-    for (int i = 0; i < 5; ++i) {
+    for (int i = 0; i < test_iterations[0]; ++i) {
         framebuffer.clear();
         reset_and_start_timer();
         for (int j = 0; j < nframes; ++j)
@@ -91,6 +97,7 @@ int main(int argc, char** argv) {
                                VISUALIZE_LIGHT_COUNT,
                                framebuffer.r, framebuffer.g, framebuffer.b);
         double mcycles = get_elapsed_mcycles() / nframes;
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", mcycles);
         ispcCycles = std::min(ispcCycles, mcycles);
     }
     printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
@@ -98,14 +105,16 @@ int main(int argc, char** argv) {
            input->header.framebufferWidth, input->header.framebufferHeight);
     WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
 
+    nframes = 3;
 #ifdef __cilk
     double dynamicCilkCycles = 1e30;
-    for (int i = 0; i < 5; ++i) {
+    for (int i = 0; i < test_iterations[1]; ++i) {
         framebuffer.clear();
         reset_and_start_timer();
         for (int j = 0; j < nframes; ++j)
             DispatchDynamicCilk(input, &framebuffer);
         double mcycles = get_elapsed_mcycles() / nframes;
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles);
         dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
     }
     printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", 
@@ -114,12 +123,13 @@ int main(int argc, char** argv) {
 #endif // __cilk
 
     double serialCycles = 1e30;
-    for (int i = 0; i < 5; ++i) {
+    for (int i = 0; i < test_iterations[1]; ++i) {
         framebuffer.clear();
         reset_and_start_timer();
         for (int j = 0; j < nframes; ++j)
             DispatchDynamicC(input, &framebuffer);
         double mcycles = get_elapsed_mcycles() / nframes;
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles);
         serialCycles = std::min(serialCycles, mcycles);
     }
     printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", 
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index fa794276..0aa8a3f6 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -1472,31 +1472,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec16_i32 val,
     return count;
 }
 
+
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec16_i32 val,
+                                                 __vec16_i1 mask) {
+    int count = 0;
+    int32_t *ptr_ = ptr;
+    for (int i = 0; i < 16; ++i) {
+        *ptr = val.v[i];
+        ptr += mask.v & 1;
+        mask.v = mask.v >> 1;
+    }
+    return ptr - ptr_;
+}
+
+
 static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr,
                                                 __vec16_i32 *val,
                                                 __vec16_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 16; ++i) {
-        if ((mask.v & (1 << i)) != 0) {
-            val->v[i] = *ptr++;
-            ++count;
-        }
-    }
-    return count;
+    return __packed_load_active((int32_t *)ptr, val, mask);
 }
 
 
 static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, 
                                                  __vec16_i32 val,
                                                  __vec16_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 16; ++i) {
-        if ((mask.v & (1 << i)) != 0) {
-            *ptr++ = val.v[i];
-            ++count;
-        }
-    }
-    return count;
+    return __packed_store_active((int32_t *)ptr, val, mask);
+}
+
+
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr,
+                                                 __vec16_i32 val,
+                                                 __vec16_i1 mask) {
+    return __packed_store_active2((int32_t *)ptr, val, mask);
 }
 
 
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index 531ed215..924b049d 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -1523,31 +1523,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec32_i32 val,
     return count;
 }
 
+
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec32_i32 val,
+                                                 __vec32_i1 mask) {
+    int count = 0;
+    int32_t *ptr_ = ptr;
+    for (int i = 0; i < 32; ++i) {
+        *ptr = val.v[i];
+        ptr += mask.v & 1;
+        mask.v = mask.v >> 1;
+    }
+    return ptr - ptr_;
+}
+
+
 static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr,
                                                 __vec32_i32 *val,
                                                 __vec32_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 32; ++i) {
-        if ((mask.v & (1 << i)) != 0) {
-            val->v[i] = *ptr++;
-            ++count;
-        }
-    }
-    return count;
+    return __packed_load_active((int32_t *)ptr, val, mask);
 }
 
 
 static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, 
                                                  __vec32_i32 val,
                                                  __vec32_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 32; ++i) {
-        if ((mask.v & (1 << i)) != 0) {
-            *ptr++ = val.v[i];
-            ++count;
-        }
-    }
-    return count;
+    return __packed_store_active((int32_t *)ptr, val, mask);
+}
+
+
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr,
+                                                 __vec32_i32 val,
+                                                 __vec32_i1 mask) {
+    return __packed_store_active2((int32_t *)ptr, val, mask);
 }
 
 
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index bbeb007a..b1451c96 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -1656,31 +1656,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec64_i32 val,
     return count;
 }
 
+
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec64_i32 val,
+                                                 __vec64_i1 mask) {
+    int count = 0;
+    int32_t *ptr_ = ptr;
+    for (int i = 0; i < 64; ++i) {
+        *ptr = val.v[i];
+        ptr += mask.v & 1;
+        mask.v = mask.v >> 1;
+    }
+    return ptr - ptr_;
+}
+
+
 static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr,
                                                 __vec64_i32 *val,
                                                 __vec64_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 64; ++i) {
-        if ((mask.v & (1ull << i)) != 0) {
-            val->v[i] = *ptr++;
-            ++count;
-        }
-    }
-    return count;
+    return __packed_load_active((int32_t *) ptr, val, mask);
 }
 
 
 static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, 
                                                  __vec64_i32 val,
                                                  __vec64_i1 mask) {
-    int count = 0; 
-    for (int i = 0; i < 64; ++i) {
-        if ((mask.v & (1ull << i)) != 0) {
-            *ptr++ = val.v[i];
-            ++count;
-        }
-    }
-    return count;
+    return __packed_store_active((int32_t *) ptr, val, mask);
+}
+
+
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr,
+                                                 __vec64_i32 val,
+                                                 __vec64_i1 mask) {
+    return __packed_store_active2((int32_t *) ptr, val, mask);
 }
 
 
diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h
index ef14d26e..141c47bb 100644
--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -2451,20 +2451,24 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, _
   return _mm_countbits_32(uint32_t(mask));
 }
 
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) 
+{
+  return __packed_store_active(p, val, mask);
+}
+
 static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask)
 {
-  __vec16_i32 v = __load<64>(val);
-  v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-  v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-  __store<64>(val, v);
-  return _mm_countbits_32(uint32_t(mask));
+  return __packed_load_active((uint32_t *)p, val, mask);
 }
 
 static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) 
 {
-  _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-  _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-  return _mm_countbits_32(uint32_t(mask));
+  return __packed_store_active((uint32_t *)p, val, mask);
+}
+
+static FORCEINLINE int32_t __packed_store_active2(int32_t *p, __vec16_i32 val, __vec16_i1 mask) 
+{
+  return __packed_store_active(p, val, mask);
 }
 
 ///////////////////////////////////////////////////////////////////////////
diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h
index d7696117..32f39c4a 100644
--- a/examples/intrinsics/knc-i1x8.h
+++ b/examples/intrinsics/knc-i1x8.h
@@ -2496,20 +2496,23 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val,
     _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     return _mm_countbits_32(uint32_t(0xFF & mask));
 }
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
+    return __packed_store_active(ptr, val, mask);
+}
 static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val,
                                                 __vec8_i1 mask) {
-    __vec8_i32 v = __load<64>(val);
-    v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    __store<64>(val, v);
-    return _mm_countbits_32(uint32_t(0xFF & mask));
+    return __packed_load_active((uint32_t *)p, val, mask);
 }
 static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val,
                                                  __vec8_i1 mask) {
-    _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    return _mm_countbits_32(uint32_t(0xFF & mask));
+    return __packed_store_active((uint32_t *)p, val, mask);
 }
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
+    return __packed_store_active(ptr, val, mask);
+}
+
 #endif
 
 ///////////////////////////////////////////////////////////////////////////
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 8baef8cb..0077ad88 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -1260,6 +1260,13 @@ static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i3
     return __vec16_i64(val.v, _mm512_setzero_epi32());
 }
 
+static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val)
+{
+  __vec16_i32 ret = _mm512_setzero_epi32();
+  __vec16_i32 one = _mm512_set1_epi32(-1);
+  return _mm512_mask_mov_epi32(ret, val, one);
+}
+
 static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val)
 {
     __vec16_i32 ret = _mm512_setzero_epi32();
@@ -1878,6 +1885,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val,
     return _mm_countbits_32(uint32_t(mask));
 }
 
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask)
+{
+  return __packed_store_active(p, val, mask);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // prefetch
 ///////////////////////////////////////////////////////////////////////////
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 919716be..5dd424d9 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -3798,6 +3798,25 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec4_i32 val,
     return count;
 }
 
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
+    int count = 0;
+
+    ptr[count] = _mm_extract_epi32(val.v, 0);
+    count -= _mm_extract_ps(mask.v, 0);
+
+    ptr[count] = _mm_extract_epi32(val.v, 1);
+    count -= _mm_extract_ps(mask.v, 1);
+
+    ptr[count] = _mm_extract_epi32(val.v, 2);
+    count -= _mm_extract_ps(mask.v, 2);
+
+    ptr[count] = _mm_extract_epi32(val.v, 3);
+    count -= _mm_extract_ps(mask.v, 3);
+
+    return count;
+}
+
 static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec4_i32 *val,
                                                 __vec4_i1 mask) {
     return __packed_load_active((int32_t *)ptr, val, mask);
@@ -3808,6 +3827,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec4_i32 val,
     return __packed_store_active((int32_t *)ptr, val, mask);
 }
 
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
+    return __packed_store_active2((int32_t *)ptr, val, mask);
+}
+
 
 ///////////////////////////////////////////////////////////////////////////
 // aos/soa
diff --git a/examples/mandelbrot/mandelbrot.cpp b/examples/mandelbrot/mandelbrot.cpp
index d2bebb96..fafc00d0 100644
--- a/examples/mandelbrot/mandelbrot.cpp
+++ b/examples/mandelbrot/mandelbrot.cpp
@@ -42,6 +42,7 @@
 #include <algorithm>
 #include "../timing.h"
 #include "mandelbrot_ispc.h"
+#include <string.h>
 using namespace ispc;
 
 extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
@@ -67,7 +68,8 @@ writePPM(int *buf, int width, int height, const char *fn) {
 }
 
 
-int main() {
+int main(int argc, char *argv[]) {
+    static unsigned int test_iterations[] = {3, 3};
     unsigned int width = 768;
     unsigned int height = 512;
     float x0 = -2;
@@ -75,6 +77,19 @@ int main() {
     float y0 = -1;
     float y1 = 1;
 
+    if (argc > 1) {
+        if (strncmp(argv[1], "--scale=", 8) == 0) {
+            float scale = atof(argv[1] + 8);
+            width *= scale;
+            height *= scale;
+        }
+    }
+    if ((argc == 3) || (argc == 4)) {
+        for (int i = 0; i < 2; i++) {
+            test_iterations[i] = atoi(argv[argc - 2 + i]);
+        }
+    }
+
     int maxIterations = 256;
     int *buf = new int[width*height];
 
@@ -83,10 +98,11 @@ int main() {
     // time of three runs.
     //
     double minISPC = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[0]; ++i) {
         reset_and_start_timer();
         mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
         double dt = get_elapsed_mcycles();
+        printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt);
         minISPC = std::min(minISPC, dt);
     }
 
@@ -102,10 +118,11 @@ int main() {
     // minimum time.
     //
     double minSerial = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[1]; ++i) {
         reset_and_start_timer();
         mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
         double dt = get_elapsed_mcycles();
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt);
         minSerial = std::min(minSerial, dt);
     }
 
diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj
index e7703ad0..7a5f6e03 100644
--- a/examples/mandelbrot/mandelbrot.vcxproj
+++ b/examples/mandelbrot/mandelbrot.vcxproj
@@ -1,5 +1,23 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
index 698daf0f..682987ae 100644
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
@@ -38,7 +38,8 @@
 #pragma warning (disable: 4305)
 #endif
 
-#include <stdio.h>
+#include <cstdio>
+#include <cstdlib>
 #include <algorithm>
 #include <string.h>
 #include "../timing.h"
@@ -69,11 +70,12 @@ writePPM(int *buf, int width, int height, const char *fn) {
 
 
 static void usage() {
-    fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
+    fprintf(stderr, "usage: mandelbrot [--scale=<factor>] [tasks iterations] [serial iterations]\n");
     exit(1);
 }
 
 int main(int argc, char *argv[]) {
+    static unsigned int test_iterations[] = {7, 1};
     unsigned int width = 1536;
     unsigned int height = 1024;
     float x0 = -2;
@@ -81,9 +83,7 @@ int main(int argc, char *argv[]) {
     float y0 = -1;
     float y1 = 1;
 
-    if (argc == 1)
-        ;
-    else if (argc == 2) {
+    if (argc > 1) {
         if (strncmp(argv[1], "--scale=", 8) == 0) {
             float scale = atof(argv[1] + 8);
             if (scale == 0.f)
@@ -94,11 +94,13 @@ int main(int argc, char *argv[]) {
             width = (width + 0xf) & ~0xf;
             height = (height + 0xf) & ~0xf;
         }
-        else 
-            usage();
     }
-    else
-        usage();
+    if ((argc == 3) || (argc == 4)) {
+        for (int i = 0; i < 2; i++) {
+            test_iterations[i] = atoi(argv[argc - 2 + i]);
+        }
+    }
+
 
     int maxIterations = 512;
     int *buf = new int[width*height];
@@ -108,13 +110,14 @@ int main(int argc, char *argv[]) {
     // time of three runs.
     //
     double minISPC = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[0]; ++i) {
         // Clear out the buffer
         for (unsigned int i = 0; i < width * height; ++i)
             buf[i] = 0;
         reset_and_start_timer();
         mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
         double dt = get_elapsed_mcycles();
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt);
         minISPC = std::min(minISPC, dt);
     }
 
@@ -127,13 +130,14 @@ int main(int argc, char *argv[]) {
     // minimum time.
     //
     double minSerial = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[1]; ++i) {
         // Clear out the buffer
         for (unsigned int i = 0; i < width * height; ++i)
             buf[i] = 0;
         reset_and_start_timer();
         mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
         double dt = get_elapsed_mcycles();
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt);
         minSerial = std::min(minSerial, dt);
     }
 
diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.ispc b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc
index 84d4ccd4..f9b0be4c 100644
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc
@@ -57,21 +57,26 @@ task void
 mandelbrot_scanline(uniform float x0, uniform float dx, 
                     uniform float y0, uniform float dy,
                     uniform int width, uniform int height, 
-                    uniform int span,
+                    uniform int xspan, uniform int yspan,
                     uniform int maxIterations, uniform int output[]) {
-    uniform int ystart = taskIndex * span;
-    uniform int yend = min((taskIndex+1) * span, (unsigned int)height);
+    const uniform int xstart = taskIndex0 * xspan;
+    const uniform int xend   = min(xstart  + xspan, width);
 
-    foreach (yi = ystart ... yend, xi = 0 ... width) {
+    const uniform int ystart = taskIndex1 * yspan;
+    const uniform int yend   = min(ystart  + yspan, height);
+    
+
+    foreach (yi = ystart ... yend, xi = xstart ... xend) {
         float x = x0 + xi * dx;
         float y = y0 + yi * dy;
 
         int index = yi * width + xi;
         output[index] = mandel(x, y, maxIterations);
     }
+    
 }
-                               
 
+#if 1
 export void
 mandelbrot_ispc(uniform float x0, uniform float y0, 
                 uniform float x1, uniform float y1,
@@ -79,8 +84,16 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
                 uniform int maxIterations, uniform int output[]) {
     uniform float dx = (x1 - x0) / width;
     uniform float dy = (y1 - y0) / height;
-    uniform int span = 4;
+    const uniform int xspan = max(32, programCount*2);  /* make sure it is big enough to avoid false-sharing */
+    const uniform int yspan = 16; 
 
-    launch[height/span] mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
-                                            maxIterations, output);
+
+#if 1
+    launch [width/xspan, height/yspan]
+#else
+    launch [height/yspan][width/xspan]
+#endif
+      mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan,
+                          maxIterations, output);
 }
+#endif
diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
index f8b8cfcb..113fc4e8 100644
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{E80DA7D4-AB22-4648-A068-327307156BE6}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp
index 123f98c7..0664bbd9 100644
--- a/examples/noise/noise.cpp
+++ b/examples/noise/noise.cpp
@@ -42,6 +42,7 @@
 #include <algorithm>
 #include "../timing.h"
 #include "noise_ispc.h"
+#include <string.h>
 using namespace ispc;
 
 extern void noise_serial(float x0, float y0, float x1, float y1,
@@ -65,7 +66,8 @@ writePPM(float *buf, int width, int height, const char *fn) {
 }
 
 
-int main() {
+int main(int argc, char *argv[]) {
+    static unsigned int test_iterations[] = {3, 1};
     unsigned int width = 768;
     unsigned int height = 768;
     float x0 = -10;
@@ -73,6 +75,18 @@ int main() {
     float y0 = -10;
     float y1 = 10;
 
+    if (argc > 1) {
+        if (strncmp(argv[1], "--scale=", 8) == 0) {
+            float scale = atof(argv[1] + 8);
+            width *= scale;
+            height *= scale;
+        }
+    }
+    if ((argc == 3) || (argc == 4)) {
+        for (int i = 0; i < 2; i++) {
+            test_iterations[i] = atoi(argv[argc - 2 + i]);
+        }
+    }
     float *buf = new float[width*height];
 
     //
@@ -80,10 +94,11 @@ int main() {
     // time of three runs.
     //
     double minISPC = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[0]; ++i) {
         reset_and_start_timer();
         noise_ispc(x0, y0, x1, y1, width, height, buf);
         double dt = get_elapsed_mcycles();
+        printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt);
         minISPC = std::min(minISPC, dt);
     }
 
@@ -99,10 +114,11 @@ int main() {
     // minimum time.
     //
     double minSerial = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[1]; ++i) {
         reset_and_start_timer();
         noise_serial(x0, y0, x1, y1, width, height, buf);
         double dt = get_elapsed_mcycles();
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt);
         minSerial = std::min(minSerial, dt);
     }
 
diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj
index 7adc57f3..ff3953ae 100644
--- a/examples/noise/noise.vcxproj
+++ b/examples/noise/noise.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj
index af336aa1..d48ac8bc 100644
--- a/examples/options/options.vcxproj
+++ b/examples/options/options.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/rt/rt.cpp b/examples/rt/rt.cpp
index 48bcc423..8f61656a 100644
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -96,27 +96,27 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
 
 
 static void usage() {
-    fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
+    fprintf(stderr, "rt <scene name base> [--scale=<factor>] [ispc iterations] [tasks iterations] [serial iterations]\n");
     exit(1);
 }
 
 
 int main(int argc, char *argv[]) {
+    static unsigned int test_iterations[] = {3, 7, 1};
     float scale = 1.f;
     const char *filename = NULL;
-    for (int i = 1; i < argc; ++i) {
-        if (strncmp(argv[i], "--scale=", 8) == 0) {
-            scale = atof(argv[i] + 8);
-            if (scale == 0.f)
-                usage();
+    if (argc < 2) usage();
+    filename = argv[1];
+    if (argc > 2) {
+        if (strncmp(argv[2], "--scale=", 8) == 0) {
+            scale = atof(argv[2] + 8);
+        }
+    }
+    if ((argc == 6) || (argc == 5)) {
+        for (int i = 0; i < 3; i++) {
+            test_iterations[i] = atoi(argv[argc - 3 + i]);
         }
-        else if (filename != NULL)
-            usage();
-        else
-            filename = argv[i];
     }
-    if (filename == NULL)
-        usage();
 
 #define READ(var, n)                                            \
     if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) {  \
@@ -211,11 +211,12 @@ int main(int argc, char *argv[]) {
     // Run 3 iterations with ispc + 1 core, record the minimum time
     //
     double minTimeISPC = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[0]; ++i) {
         reset_and_start_timer();
         raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera, 
                       camera2world, image, id, nodes, triangles);
         double dt = get_elapsed_mcycles();
+        printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt);
         minTimeISPC = std::min(dt, minTimeISPC);
     }
     printf("[rt ispc, 1 core]:\t\t[%.3f] million cycles for %d x %d image\n", 
@@ -230,11 +231,12 @@ int main(int argc, char *argv[]) {
     // Run 3 iterations with ispc + 1 core, record the minimum time
     //
     double minTimeISPCtasks = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[1]; ++i) {
         reset_and_start_timer();
         raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
                             camera2world, image, id, nodes, triangles);
         double dt = get_elapsed_mcycles();
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt);
         minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
     }
     printf("[rt ispc + tasks]:\t\t[%.3f] million cycles for %d x %d image\n", 
@@ -250,11 +252,12 @@ int main(int argc, char *argv[]) {
     // minimum time.
     //
     double minTimeSerial = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[2]; ++i) {
         reset_and_start_timer();
         raytrace_serial(width, height, baseWidth, baseHeight, raster2camera, 
                         camera2world, image, id, nodes, triangles);
         double dt = get_elapsed_mcycles();
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt);
         minTimeSerial = std::min(dt, minTimeSerial);
     }
     printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n", 
diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj
index ea34de56..00b6dd3a 100644
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj
index 43f2b439..b0bdc63d 100644
--- a/examples/sort/sort.vcxproj
+++ b/examples/sort/sort.vcxproj
@@ -1,5 +1,23 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp
index 593d901f..33abc85c 100644
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -40,6 +40,7 @@
 
 #include <stdio.h>
 #include <algorithm>
+#include <string.h>
 #include <math.h>
 #include "../timing.h"
 #include "stencil_ispc.h"
@@ -66,9 +67,25 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
 }
 
 
-int main() {
+int main(int argc, char *argv[]) {
+    static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here
     int Nx = 256, Ny = 256, Nz = 256;
     int width = 4;
+
+    if (argc > 1) {
+        if (strncmp(argv[1], "--scale=", 8) == 0) {
+            float scale = atof(argv[1] + 8);
+            Nx *= scale;
+            Ny *= scale;
+            Nz *= scale;
+        }
+    }
+    if ((argc == 4) || (argc == 5)) {
+        for (int i = 0; i < 3; i++) {
+            test_iterations[i] = atoi(argv[argc - 3 + i]);
+        }
+    }
+
     float *Aserial[2], *Aispc[2];
     Aserial[0] = new float [Nx * Ny * Nz];
     Aserial[1] = new float [Nx * Ny * Nz];
@@ -79,18 +96,18 @@ int main() {
     float coeff[4] = { 0.5, -.25, .125, -.0625 }; 
 
     InitData(Nx, Ny, Nz, Aispc, vsq);
-
     //
     // Compute the image using the ispc implementation on one core; report
     // the minimum time of three runs.
     //
     double minTimeISPC = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[0]; ++i) {
         reset_and_start_timer();
         loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
                           width, Nz - width, Nx, Ny, Nz, coeff, vsq,
                           Aispc[0], Aispc[1]);
         double dt = get_elapsed_mcycles();
+        printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt);
         minTimeISPC = std::min(minTimeISPC, dt);
     }
 
@@ -103,12 +120,13 @@ int main() {
     // the minimum time of three runs.
     //
     double minTimeISPCTasks = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[1]; ++i) {
         reset_and_start_timer();
         loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
                                 width, Nz - width, Nx, Ny, Nz, coeff, vsq,
                                 Aispc[0], Aispc[1]);
         double dt = get_elapsed_mcycles();
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt);
         minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
     }
 
@@ -121,12 +139,13 @@ int main() {
     // minimum time.
     //
     double minTimeSerial = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[2]; ++i) {
         reset_and_start_timer();
         loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
                             width, Nz - width, Nx, Ny, Nz, coeff, vsq,
                             Aserial[0], Aserial[1]);
         double dt = get_elapsed_mcycles();
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt);
         minTimeSerial = std::min(minTimeSerial, dt);
     }
 
diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj
index b5f5bb22..fd8564aa 100644
--- a/examples/stencil/stencil.vcxproj
+++ b/examples/stencil/stencil.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp
index b97c4bba..77269f9f 100644
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -170,21 +170,48 @@
 
 // Signature of ispc-generated 'task' functions
 typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
-                             int taskIndex, int taskCount);
+                             int taskIndex, int taskCount,
+                             int taskIndex0, int taskIndex1, int taskIndex2,
+                             int taskCount0, int taskCount1, int taskCount2);
 
 // Small structure used to hold the data for each task
+#ifdef _MSC_VER
+__declspec(align(16))
+#endif
 struct TaskInfo {
     TaskFuncType func;
     void *data;
-    int taskIndex, taskCount;
+    int taskIndex;
+    int taskCount3d[3];
 #if defined(ISPC_IS_WINDOWS)
     event taskEvent;
 #endif
-};
+    int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
+    int taskIndex0() const 
+    {
+      return taskIndex % taskCount3d[0];
+    }
+    int taskIndex1() const 
+    {
+      return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
+    }
+    int taskIndex2() const 
+    {
+      return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
+    }
+    int taskCount0() const { return taskCount3d[0]; }
+    int taskCount1() const { return taskCount3d[1]; }
+    int taskCount2() const { return taskCount3d[2]; }
+    TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); }
+}
+#ifndef _MSC_VER
+__attribute__((aligned(32)));
+#endif
+;
 
 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
-    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
+    void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz);
     void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
     void ISPCSync(void *handle);
 }
@@ -518,7 +545,9 @@ lRunTask(void *ti) {
 
     // Actually run the task
     taskInfo->func(taskInfo->data, threadIndex, threadCount, 
-                   taskInfo->taskIndex, taskInfo->taskCount);
+                   taskInfo->taskIndex, taskInfo->taskCount(),
+            taskInfo->taskIndex0(), taskInfo->taskIndex1(), taskInfo->taskIndex2(),
+            taskInfo->taskCount0(), taskInfo->taskCount1(), taskInfo->taskCount2());
 }
 
 
@@ -559,7 +588,9 @@ lRunTask(LPVOID param) {
     // will cause bugs in code that uses those.
     int threadIndex = 0;
     int threadCount = 1;
-    ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+    ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
 
     // Signal the event that this task is done
     ti->taskEvent.set();
@@ -660,7 +691,9 @@ lTaskEntry(void *arg) {
         DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
         TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
         myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
-                     myTask->taskCount);
+                     myTask->taskCount(),
+            myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(),
+            myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2());
 
         //
         // Decrement the "number of unfinished tasks" counter in the task
@@ -871,7 +904,9 @@ TaskGroup::Sync() {
         // Do work for _myTask_
         //
         // FIXME: bogus values for thread index/thread count here as well..
-        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);
+        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(),
+            myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(),
+            myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2());
 
         //
         // Decrement the number of unfinished tasks counter
@@ -901,7 +936,9 @@ TaskGroup::Launch(int baseIndex, int count) {
 
         // Actually run the task. 
         // Cilk does not expose the task -> thread mapping so we pretend it's 1:1
-        ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount);
+        ti->func(ti->data, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
     }
 }
 
@@ -930,7 +967,9 @@ TaskGroup::Launch(int baseIndex, int count) {
         // Actually run the task. 
         int threadIndex = omp_get_thread_num();
         int threadCount = omp_get_num_threads();
-        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
     }
 }
 
@@ -961,7 +1000,9 @@ TaskGroup::Launch(int baseIndex, int count) {
         int threadIndex = ti->taskIndex;
         int threadCount = ti->taskCount;
 
-        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
     });
 }
 
@@ -988,7 +1029,9 @@ TaskGroup::Launch(int baseIndex, int count) {
             // TBB does not expose the task -> thread mapping so we pretend it's 1:1
             int threadIndex = ti->taskIndex;
             int threadCount = ti->taskCount;
-            ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+            ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
         });
     }
 }
@@ -1041,7 +1084,8 @@ FreeTaskGroup(TaskGroup *tg) {
 ///////////////////////////////////////////////////////////////////////////
 
 void
-ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
+ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) {
+    const int count = count0*count1*count2;
     TaskGroup *taskGroup;
     if (*taskGroupPtr == NULL) {
         InitTaskSystem();
@@ -1057,7 +1101,9 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
         ti->func = (TaskFuncType)func;
         ti->data = data;
         ti->taskIndex = i;
-        ti->taskCount = count;
+        ti->taskCount3d[0] = count0;
+        ti->taskCount3d[1] = count1;
+        ti->taskCount3d[2] = count2;
     }
     taskGroup->Launch(baseIndex, count);
 }
diff --git a/examples/volume_rendering/volume.cpp b/examples/volume_rendering/volume.cpp
index 458cd407..b6eda986 100644
--- a/examples/volume_rendering/volume.cpp
+++ b/examples/volume_rendering/volume.cpp
@@ -135,10 +135,16 @@ loadVolume(const char *fn, int n[3]) {
 
 
 int main(int argc, char *argv[]) {
-    if (argc != 3) {
-        fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol>\n");
+    static unsigned int test_iterations[] = {3, 7, 1};
+    if (argc < 3) {
+        fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol> [ispc iterations] [tasks iterations] [serial iterations]\n");
         return 1;
     }
+    if (argc == 6) {
+        for (int i = 0; i < 3; i++) {
+            test_iterations[i] = atoi(argv[3 + i]);
+        }
+    }
 
     //
     // Load viewing data and the volume density data
@@ -156,11 +162,12 @@ int main(int argc, char *argv[]) {
     // time of three runs.
     //
     double minISPC = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[0]; ++i) {
         reset_and_start_timer();
         volume_ispc(density, n, raster2camera, camera2world,
                     width, height, image);
         double dt = get_elapsed_mcycles();
+        printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt);
         minISPC = std::min(minISPC, dt);
     }
 
@@ -176,11 +183,12 @@ int main(int argc, char *argv[]) {
     // tasks; report the minimum time of three runs.
     //
     double minISPCtasks = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[1]; ++i) {
         reset_and_start_timer();
         volume_ispc_tasks(density, n, raster2camera, camera2world,
                           width, height, image);
         double dt = get_elapsed_mcycles();
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt);
         minISPCtasks = std::min(minISPCtasks, dt);
     }
 
@@ -196,11 +204,12 @@ int main(int argc, char *argv[]) {
     // minimum time.
     //
     double minSerial = 1e30;
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < test_iterations[2]; ++i) {
         reset_and_start_timer();
         volume_serial(density, n, raster2camera, camera2world,
                       width, height, image);
         double dt = get_elapsed_mcycles();
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt);
         minSerial = std::min(minSerial, dt);
     }
 
diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj
index cc738a7e..a1fea5f1 100644
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -1,5 +1,23 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
   <PropertyGroup Label="Globals">
     <ProjectGuid>{dee5733a-e93e-449d-9114-9bffcaeb4df9}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
diff --git a/expr.cpp b/expr.cpp
index 1cbebad5..5be578eb 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -3551,11 +3551,18 @@ SelectExpr::Print() const {
 // FunctionCallExpr
 
 FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p,
-                                   bool il, Expr *lce)
+                                   bool il, Expr *lce[3])
     : Expr(p), isLaunch(il) {
     func = f;
     args = a;
-    launchCountExpr = lce;
+    if (lce != NULL)
+    {
+      launchCountExpr[0] = lce[0];
+      launchCountExpr[1] = lce[1];
+      launchCountExpr[2] = lce[2];
+    }
+    else
+      launchCountExpr[0] = launchCountExpr[1] = launchCountExpr[2] = NULL;
 }
 
 
@@ -3673,9 +3680,13 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
     llvm::Value *retVal = NULL;
     ctx->SetDebugPos(pos);
     if (ft->isTask) {
-        AssertPos(pos, launchCountExpr != NULL);
-        llvm::Value *launchCount = launchCountExpr->GetValue(ctx);
-        if (launchCount != NULL)
+        AssertPos(pos, launchCountExpr[0] != NULL);
+        llvm::Value *launchCount[3] = 
+        { launchCountExpr[0]->GetValue(ctx),
+          launchCountExpr[1]->GetValue(ctx),
+          launchCountExpr[2]->GetValue(ctx) };
+
+        if (launchCount[0] != NULL)
             ctx->LaunchInst(callee, argVals, launchCount);
     }
     else
@@ -3798,14 +3809,17 @@ FunctionCallExpr::TypeCheck() {
             if (!isLaunch)
                 Error(pos, "\"launch\" expression needed to call function "
                       "with \"task\" qualifier.");
-            if (!launchCountExpr)
+            for (int k = 0; k < 3; k++)
+            {
+              if (!launchCountExpr[k])
                 return NULL;
 
-            launchCountExpr =
-                TypeConvertExpr(launchCountExpr, AtomicType::UniformInt32,
-                                "task launch count");
-            if (launchCountExpr == NULL)
+              launchCountExpr[k] =
+                TypeConvertExpr(launchCountExpr[k], AtomicType::UniformInt32,
+                    "task launch count");
+              if (launchCountExpr[k] == NULL)
                 return NULL;
+            }
         }
         else {
             if (isLaunch) {
@@ -3813,7 +3827,7 @@ FunctionCallExpr::TypeCheck() {
                       "qualified function.");
                 return NULL;
             }
-            AssertPos(pos, launchCountExpr == NULL);
+            AssertPos(pos, launchCountExpr[0] == NULL);
         }
     }
     else {
diff --git a/expr.h b/expr.h
index 45780414..38617e8e 100644
--- a/expr.h
+++ b/expr.h
@@ -247,7 +247,8 @@ public:
 class FunctionCallExpr : public Expr {
 public:
     FunctionCallExpr(Expr *func, ExprList *args, SourcePos p,
-                     bool isLaunch = false, Expr *launchCountExpr = NULL);
+                     bool isLaunch = false, 
+                     Expr *launchCountExpr[3] = NULL);
 
     llvm::Value *GetValue(FunctionEmitContext *ctx) const;
     llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
@@ -262,7 +263,7 @@ public:
     Expr *func;
     ExprList *args;
     bool isLaunch;
-    Expr *launchCountExpr;
+    Expr *launchCountExpr[3];
 };
 
 
diff --git a/func.cpp b/func.cpp
index b975049b..af2cc05a 100644
--- a/func.cpp
+++ b/func.cpp
@@ -132,9 +132,28 @@ Function::Function(Symbol *s, Stmt *c) {
         Assert(taskIndexSym);
         taskCountSym = m->symbolTable->LookupVariable("taskCount");
         Assert(taskCountSym);
+
+        taskIndexSym0 = m->symbolTable->LookupVariable("taskIndex0");
+        Assert(taskIndexSym0);
+        taskIndexSym1 = m->symbolTable->LookupVariable("taskIndex1");
+        Assert(taskIndexSym1);
+        taskIndexSym2 = m->symbolTable->LookupVariable("taskIndex2");
+        Assert(taskIndexSym2);
+
+
+        taskCountSym0 = m->symbolTable->LookupVariable("taskCount0");
+        Assert(taskCountSym0);
+        taskCountSym1 = m->symbolTable->LookupVariable("taskCount1");
+        Assert(taskCountSym1);
+        taskCountSym2 = m->symbolTable->LookupVariable("taskCount2");
+        Assert(taskCountSym2);
     }
     else
+    {
         threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL;
+        taskIndexSym0 = taskIndexSym1 = taskIndexSym2 = NULL;
+        taskCountSym0 = taskCountSym1 = taskCountSym2 = NULL;
+    }
 }
 
 
@@ -225,6 +244,12 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
         llvm::Value *threadCount = argIter++;
         llvm::Value *taskIndex = argIter++;
         llvm::Value *taskCount = argIter++;
+        llvm::Value *taskIndex0 = argIter++;
+        llvm::Value *taskIndex1 = argIter++;
+        llvm::Value *taskIndex2 = argIter++;
+        llvm::Value *taskCount0 = argIter++;
+        llvm::Value *taskCount1 = argIter++;
+        llvm::Value *taskCount2 = argIter++;
 
         // Copy the function parameter values from the structure into local
         // storage
@@ -256,6 +281,20 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
 
         taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
         ctx->StoreInst(taskCount, taskCountSym->storagePtr);
+        
+        taskIndexSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex0");
+        ctx->StoreInst(taskIndex0, taskIndexSym0->storagePtr);
+        taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1");
+        ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr);
+        taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2");
+        ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr);
+        
+        taskCountSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount0");
+        ctx->StoreInst(taskCount0, taskCountSym0->storagePtr);
+        taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1");
+        ctx->StoreInst(taskCount1, taskCountSym1->storagePtr);
+        taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2");
+        ctx->StoreInst(taskCount2, taskCountSym2->storagePtr);
     }
     else {
         // Regular, non-task function
diff --git a/func.h b/func.h
index ac3e1447..88a96dbc 100644
--- a/func.h
+++ b/func.h
@@ -60,7 +60,10 @@ private:
     Stmt *code;
     Symbol *maskSymbol;
     Symbol *threadIndexSym, *threadCountSym;
-    Symbol *taskIndexSym, *taskCountSym;
+    Symbol *taskIndexSym,   *taskCountSym;
+    Symbol *taskIndexSym0, *taskCountSym0;
+    Symbol *taskIndexSym1, *taskCountSym1;
+    Symbol *taskIndexSym2, *taskCountSym2;
 };
 
 #endif // ISPC_FUNC_H
diff --git a/ispc.cpp b/ispc.cpp
index 36d31580..ed326b14 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     m_tf_attributes(NULL),
 #endif
     m_nativeVectorWidth(-1),
+    m_nativeVectorAlignment(-1),
     m_dataTypeWidth(-1),
     m_vectorWidth(-1),
     m_generatePIC(pic),
@@ -309,6 +310,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         !strcasecmp(isa, "sse2-i32x4")) {
         this->m_isa = Target::SSE2;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -325,6 +327,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse2-i32x8")) {
         this->m_isa = Target::SSE2;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -341,6 +344,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse4-i32x4")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         // TODO: why not sse42 and popcnt?
@@ -359,6 +363,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse4-i32x8")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -374,6 +379,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "sse4-i8x16")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 8;
         this->m_vectorWidth = 16;
         this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -389,6 +395,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "sse4-i16x8")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 16;
         this->m_vectorWidth = 8;
         this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -405,6 +412,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x4")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_vectorWidth = 4;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -416,6 +424,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x8")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_vectorWidth = 8;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -427,6 +436,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x16")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 64;
         this->m_vectorWidth = 16;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -438,6 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x32")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 32;
+        this->m_nativeVectorAlignment = 64;
         this->m_vectorWidth = 32;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -449,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x64")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 64;
+        this->m_nativeVectorAlignment = 64;
         this->m_vectorWidth = 64;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -460,6 +472,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x1")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 1;
+        this->m_nativeVectorAlignment = 16;
         this->m_vectorWidth = 1;
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
@@ -467,6 +480,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "avx1-i32x4")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -478,6 +492,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1-i32x8")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -488,6 +503,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1-i64x4")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 64;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -499,6 +515,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1-i32x16")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 16;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -509,6 +526,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1.1-i32x8")) {
         this->m_isa = Target::AVX11;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -530,6 +548,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1.1-i32x16")) {
         this->m_isa = Target::AVX11;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 16;
         this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -550,6 +569,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "avx1.1-i64x4")) {
         this->m_isa = Target::AVX11;
         this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 64;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -571,6 +591,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx2-i32x8")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -596,6 +617,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx2-i32x16")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 16;
         this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -620,6 +642,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "avx2-i64x4")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 64;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -645,6 +668,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "neon-i8x16")) {
         this->m_isa = Target::NEON8;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 8;
         this->m_vectorWidth = 16;
         this->m_attributes = "+neon,+fp16";
@@ -655,6 +679,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "neon-i16x8")) {
         this->m_isa = Target::NEON16;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 16;
         this->m_vectorWidth = 8;
         this->m_attributes = "+neon,+fp16";
@@ -666,6 +691,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "neon-i32x4")) {
         this->m_isa = Target::NEON32;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         this->m_attributes = "+neon,+fp16";
@@ -918,7 +944,8 @@ Target::GetISATargetString() const {
 
 static bool
 lGenericTypeLayoutIndeterminate(llvm::Type *type) {
-    if (type->isPrimitiveType() || type->isIntegerTy())
+    if (type->isFloatingPointTy() || type->isX86_MMXTy() || type->isVoidTy() ||
+        type->isIntegerTy() || type->isLabelTy() || type->isMetadataTy())
         return false;
 
     if (type == LLVMTypes::BoolVectorType ||
diff --git a/ispc.h b/ispc.h
index b319d656..88eb8353 100644
--- a/ispc.h
+++ b/ispc.h
@@ -38,7 +38,7 @@
 #ifndef ISPC_H
 #define ISPC_H
 
-#define ISPC_VERSION "1.5.1dev"
+#define ISPC_VERSION "1.6.1dev"
 
 #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)
 #error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported"
@@ -260,6 +260,8 @@ public:
 
     int getNativeVectorWidth() const {return m_nativeVectorWidth;}
 
+    int getNativeVectorAlignment() const {return m_nativeVectorAlignment;}
+
     int getDataTypeWidth() const {return m_dataTypeWidth;}
 
     int getVectorWidth() const {return m_vectorWidth;}
@@ -332,6 +334,13 @@ private:
         SSE, 8 for AVX, etc.) */
     int m_nativeVectorWidth;
 
+    /** Native vector alignment in bytes. Theoretically this may be derived
+        from the vector size, but it's better to manage directly the alignement.
+        It allows easier experimenting and better fine tuning for particular
+        platform. This information is primatily used when
+        --opt=force-aligned-memory is used. */
+    int m_nativeVectorAlignment;
+
     /** Data type with in bits. Typically it's 32, but could be 8, 16 or 64.
         For generic it's -1, which means undefined. */
     int m_dataTypeWidth;
diff --git a/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
new file mode 100644
index 00000000..8f0a790b
--- /dev/null
+++ b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
@@ -0,0 +1,115 @@
+From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
+From: Michael Liao <michael.hliao@gmail.com>
+Date: Mon, 21 Oct 2013 17:47:58 -0700
+Subject: [PATCH] Fix PR17631
+
+- Skip instructions added in prolog. For specific targets, prolog may
+  insert helper function calls (e.g. _chkstk will be called when
+  there're more than 4K bytes allocated on stack). However, these
+  helpers don't use/def YMM/XMM registers.
+  It also include second fix for the problem: r196261+r196391.
+
+diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
+index 477f75a..0d37a7d 100644
+--- lib/Target/X86/X86VZeroUpper.cpp
++++ lib/Target/X86/X86VZeroUpper.cpp
+@@ -121,7 +121,7 @@
+ }
+ 
+ static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+-  for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
++  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+     if (!MO.clobbersPhysReg(reg))
+       return false;
+   }
+@@ -143,6 +143,21 @@
+   return false;
+ }
+ 
++/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
++/// instruction.
++static bool clobbersAnyYmmReg(MachineInstr *MI) {
++  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
++    const MachineOperand &MO = MI->getOperand(i);
++    if (!MO.isRegMask())
++      continue;
++    for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
++      if (MO.clobbersPhysReg(reg))
++        return true;
++    }
++  }
++  return false;
++}
++
+ /// runOnMachineFunction - Loop over all of the basic blocks, inserting
+ /// vzero upper instructions before function calls.
+ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+@@ -226,8 +241,9 @@
+   bool BBHasCall = false;
+ 
+   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
++    DebugLoc dl = I->getDebugLoc();
+     MachineInstr *MI = I;
+-    DebugLoc dl = I->getDebugLoc();
++
+     bool isControlFlow = MI->isCall() || MI->isReturn();
+ 
+     // Shortcut: don't need to check regular instructions in dirty state.
+@@ -246,6 +262,14 @@
+     if (!isControlFlow)
+       continue;
+ 
++    // If the call won't clobber any YMM register, skip it as well. It usually
++    // happens on helper function calls (such as '_chkstk', '_ftol2') where
++    // standard calling convention is not used (RegMask is not used to mark
++    // register clobbered and register usage (def/imp-def/use) is well-dfined
++    // and explicitly specified.
++    if (MI->isCall() && !clobbersAnyYmmReg(MI))
++      continue;
++
+     BBHasCall = true;
+ 
+     // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
+new file mode 100644
+index 0000000..a572ff2
+--- /dev/null
++++ test/CodeGen/X86/pr17631.ll
+@@ -0,0 +1,34 @@
++; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
++
++%struct_type = type { [64 x <8 x float>], <8 x float> }
++
++; Function Attrs: nounwind readnone
++declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
++
++; Function Attrs: nounwind
++define i32 @equal(<8 x i32> %A) {
++allocas:
++  %first_alloc  = alloca [64 x <8 x i32>]
++  %second_alloc = alloca %struct_type
++ 
++  %A1 = bitcast <8 x i32> %A to <8 x float>
++  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
++  ret i32 %A2
++}
++
++; CHECK: equal
++; CHECK-NOT: vzeroupper
++; CHECK: _chkstk
++; CHECK: ret
++
++define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
++  %i = fptoui double %x to i64
++  store i64 %i, i64* %p
++  %ret = fadd <8 x float> %y, %y
++  ret <8 x float> %ret
++}
++
++; CHECK: foo
++; CHECK-NOT: vzeroupper
++; CHECK: _ftol2
++; CHECK: ret
+-- 
+1.8.1.2
+
diff --git a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
deleted file mode 100644
index b6abb1d3..00000000
--- a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
+++ /dev/null
@@ -1,69 +0,0 @@
-From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
-From: Michael Liao <michael.hliao@gmail.com>
-Date: Mon, 21 Oct 2013 17:47:58 -0700
-Subject: [PATCH] Fix PR17631
-
-- Skip instructions added in prolog. For specific targets, prolog may
-  insert helper function calls (e.g. _chkstk will be called when
-  there're more than 4K bytes allocated on stack). However, these
-  helpers don't use/def YMM/XMM registers.
----
- lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++-
- test/CodeGen/X86/pr17631.ll      | 22 ++++++++++++++++++++++
- 2 files changed, 32 insertions(+), 1 deletion(-)
- create mode 100644 test/CodeGen/X86/pr17631.ll
-
-diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
-index 477f75a..0d37a7d 100644
---- lib/Target/X86/X86VZeroUpper.cpp
-+++ lib/Target/X86/X86VZeroUpper.cpp
-@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
-   bool BBHasCall = false;
- 
-   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
--    MachineInstr *MI = I;
-     DebugLoc dl = I->getDebugLoc();
-+    MachineInstr *MI = I;
-+
-+    // Don't need to check instructions added in prolog.
-+    // In prolog, special function calls may be added for specific targets
-+    // (e.g. on Windows, a prolog helper '_chkstk' is called when the local
-+    // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM
-+    // registers.
-+    if (MI->getFlag(MachineInstr::FrameSetup))
-+      continue;
-+
-     bool isControlFlow = MI->isCall() || MI->isReturn();
- 
-     // Shortcut: don't need to check regular instructions in dirty state.
-diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
-new file mode 100644
-index 0000000..a572ff2
---- /dev/null
-+++ test/CodeGen/X86/pr17631.ll
-@@ -0,0 +1,22 @@
-+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
-+ 
-+%struct_type = type { [64 x <8 x float>], <8 x float> }
-+ 
-+; Function Attrs: nounwind readnone
-+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
-+ 
-+; Function Attrs: nounwind
-+define i32 @equal(<8 x i32> %A) {
-+allocas:
-+  %first_alloc  = alloca [64 x <8 x i32>]
-+  %second_alloc = alloca %struct_type
-+ 
-+  %A1 = bitcast <8 x i32> %A to <8 x float>
-+  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
-+  ret i32 %A2
-+}
-+
-+; CHECK: equal
-+; CHECK-NOT: vzeroupper
-+; CHECK: _chkstk
-+; CHECK: ret
--- 
-1.8.1.2
-
diff --git a/llvm_patches/3_4_r195476_r195779_i16_sext.patch b/llvm_patches/3_4_r195476_r195779_i16_sext.patch
deleted file mode 100644
index 4e2c0f6b..00000000
--- a/llvm_patches/3_4_r195476_r195779_i16_sext.patch
+++ /dev/null
@@ -1,57 +0,0 @@
-Two stability patches affecting sse4-i16x8 and sse4-i8x16 targets. See PR18014 and PR18054 for more details.
-
-Index: lib/Target/X86/X86ISelLowering.cpp
-===================================================================
---- lib/Target/X86/X86ISelLowering.cpp	(revision 195863)
-+++ lib/Target/X86/X86ISelLowering.cpp	(working copy)
-@@ -13120,19 +13120,27 @@
-       // fall through
-     case MVT::v4i32:
-     case MVT::v8i16: {
--      // (sext (vzext x)) -> (vsext x)
-       SDValue Op0 = Op.getOperand(0);
-       SDValue Op00 = Op0.getOperand(0);
-       SDValue Tmp1;
-       // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
-       if (Op0.getOpcode() == ISD::BITCAST &&
--          Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
-+          Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
-+        // (sext (vzext x)) -> (vsext x)
-         Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
--      if (Tmp1.getNode()) {
--        SDValue Tmp1Op0 = Tmp1.getOperand(0);
--        assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
--               "This optimization is invalid without a VZEXT.");
--        return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
-+        if (Tmp1.getNode()) {
-+          EVT ExtraEltVT = ExtraVT.getVectorElementType();
-+          // This folding is only valid when the in-reg type is a vector of i8,
-+          // i16, or i32.
-+          if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
-+              ExtraEltVT == MVT::i32) {
-+            SDValue Tmp1Op0 = Tmp1.getOperand(0);
-+            assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
-+                   "This optimization is invalid without a VZEXT.");
-+            return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
-+          }
-+          Op0 = Tmp1;
-+        }
-       }
- 
-       // If the above didn't work, then just use Shift-Left + Shift-Right.
-@@ -17007,6 +17015,15 @@
-     if (BitWidth == 1)
-       return SDValue();
- 
-+    // Check all uses of that condition operand to check whether it will be
-+    // consumed by non-BLEND instructions, which may depend on all bits are set
-+    // properly.
-+    for (SDNode::use_iterator I = Cond->use_begin(),
-+                              E = Cond->use_end(); I != E; ++I)
-+      if (I->getOpcode() != ISD::VSELECT)
-+        // TODO: Add other opcodes eventually lowered into BLEND.
-+        return SDValue();
-+
-     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
-     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
- 
diff --git a/opt.cpp b/opt.cpp
index 3e320b4b..f70e522d 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -514,11 +514,31 @@ Optimize(llvm::Module *module, int optLevel) {
         llvm::initializeInstrumentation(*registry);
         llvm::initializeTarget(*registry);
 
-        optPM.add(llvm::createGlobalDCEPass(), 200);
+        optPM.add(llvm::createGlobalDCEPass(), 185);
+
+        // Setup to use LLVM default AliasAnalysis
+        // Ideally, we want call:
+        //    llvm::PassManagerBuilder pm_Builder;
+        //    pm_Builder.OptLevel = optLevel;
+        //    pm_Builder.addInitialAliasAnalysisPasses(optPM);
+        // but the addInitialAliasAnalysisPasses() is a private function
+        // so we explicitly enable them here.
+        // Need to keep sync with future LLVM change
+        // An alternative is to call populateFunctionPassManager()
+        optPM.add(llvm::createTypeBasedAliasAnalysisPass(), 190);
+        optPM.add(llvm::createBasicAliasAnalysisPass());
+        optPM.add(llvm::createCFGSimplificationPass());
+        // Here clang has an experimental pass SROAPass instead of
+        // ScalarReplAggregatesPass. We should add it in the future.
+        optPM.add(llvm::createScalarReplAggregatesPass());
+        optPM.add(llvm::createEarlyCSEPass());
+        optPM.add(llvm::createLowerExpectIntrinsicPass());
+        optPM.add(llvm::createTypeBasedAliasAnalysisPass());
+        optPM.add(llvm::createBasicAliasAnalysisPass());
 
         // Early optimizations to try to reduce the total amount of code to
         // work with if we can
-        optPM.add(llvm::createReassociatePass());
+        optPM.add(llvm::createReassociatePass(), 200);
         optPM.add(llvm::createConstantPropagationPass());
         optPM.add(llvm::createDeadInstEliminationPass());
         optPM.add(llvm::createCFGSimplificationPass());
@@ -904,7 +924,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                     lCopyMetadata(castPtr, callInst);
                     int align;
                     if (g->opt.forceAlignedMemory)
-                        align = 0;
+                        align = g->target->getNativeVectorAlignment();
                     else
                         align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
                     name = LLVMGetName(callInst->getArgOperand(0), "_load");
@@ -946,7 +966,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                         new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
                     int align;
                     if (g->opt.forceAlignedMemory)
-                        align = 0;
+                        align = g->target->getNativeVectorAlignment();
                     else
                         align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
                     storeInst->setAlignment(align);
@@ -1479,6 +1499,33 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
                                                  insertBefore);
             return;
         }
+        else if (bop->getOpcode() == llvm::Instruction::Shl) {
+            lExtractConstantOffset(op0, &c0, &v0, insertBefore);
+            lExtractConstantOffset(op1, &c1, &v1, insertBefore);
+
+            // Given the product of constant and variable terms, we have:
+            // (c0 + v0) * (2^(c1 + v1))  = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1
+            // We can optimize only if v1 == NULL.
+            if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) {
+                *constOffset = NULL;
+                *variableOffset = vec;
+            }
+            else if (v0 == NULL) {
+                *constOffset = vec;
+                *variableOffset = NULL;
+            }
+            else {
+                *constOffset =
+                    llvm::BinaryOperator::Create(llvm::Instruction::Shl, c0, c1,
+                                                 LLVMGetName("shl", c0, c1),
+                                                 insertBefore);
+                *variableOffset =
+                    llvm::BinaryOperator::Create(llvm::Instruction::Shl, v0, c1,
+                                                 LLVMGetName("shl", v0, c1),
+                                                 insertBefore);
+            }
+            return;
+        }
         else if (bop->getOpcode() == llvm::Instruction::Mul) {
             lExtractConstantOffset(op0, &c0, &v0, insertBefore);
             lExtractConstantOffset(op1, &c1, &v1, insertBefore);
@@ -2758,7 +2805,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
         lCopyMetadata(lvalue, callInst);
         llvm::Instruction *store =
             new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-                                g->opt.forceAlignedMemory ? 0 : info->align);
+                                g->opt.forceAlignedMemory ?
+                                    g->target->getNativeVectorAlignment() : info->align);
         lCopyMetadata(store, callInst);
         llvm::ReplaceInstWithInst(callInst, store);
         return true;
@@ -2821,7 +2869,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
                                     callInst);
         llvm::Instruction *load =
             new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
-                               g->opt.forceAlignedMemory ? 0 : info->align,
+                               g->opt.forceAlignedMemory ?
+                                   g->target->getNativeVectorAlignment() : info->align,
                                (llvm::Instruction *)NULL);
         lCopyMetadata(load, callInst);
         llvm::ReplaceInstWithInst(callInst, load);
@@ -3226,6 +3275,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
         }
         case 4: {
             // 4-wide vector load
+            if (g->opt.forceAlignedMemory) {
+                align = g->target->getNativeVectorAlignment();
+            }
             llvm::VectorType *vt =
                 llvm::VectorType::get(LLVMTypes::Int32Type, 4);
             loadOps[i].load = lGEPAndLoad(basePtr, start, align,
@@ -3234,6 +3286,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
         }
         case 8: {
             // 8-wide vector load
+            if (g->opt.forceAlignedMemory) {
+                align = g->target->getNativeVectorAlignment();
+            }
             llvm::VectorType *vt =
                 llvm::VectorType::get(LLVMTypes::Int32Type, 8);
             loadOps[i].load = lGEPAndLoad(basePtr, start, align,
@@ -5118,6 +5173,11 @@ FixBooleanSelectPass::runOnFunction(llvm::Function &F) {
     // LLVM 3.3 only
 #if defined(LLVM_3_3)
 
+    // Don't optimize generic targets.
+    if (g->target->getISA() == Target::GENERIC) {
+        return false;
+    }
+
     for (llvm::Function::iterator I = F.begin(), E = F.end();
          I != E; ++I) {
         llvm::BasicBlock* bb = &*I;
diff --git a/parse.yy b/parse.yy
index 38c5ba77..9a0377c5 100644
--- a/parse.yy
+++ b/parse.yy
@@ -353,17 +353,75 @@ launch_expression
     : TOKEN_LAUNCH postfix_expression '(' argument_expression_list ')'
       {
           ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @2);
-          $$ = new FunctionCallExpr($2, $4, Union(@2, @5), true, oneExpr);
+          Expr *launchCount[3] = {oneExpr, oneExpr, oneExpr};
+          $$ = new FunctionCallExpr($2, $4, Union(@2, @5), true, launchCount);
       }
     | TOKEN_LAUNCH postfix_expression '(' ')'
       {
           ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @2);
-          $$ = new FunctionCallExpr($2, new ExprList(Union(@3,@4)), Union(@2, @4), true, oneExpr);
+          Expr *launchCount[3] = {oneExpr, oneExpr, oneExpr};
+          $$ = new FunctionCallExpr($2, new ExprList(Union(@3,@4)), Union(@2, @4), true, launchCount);
        }
-    | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' argument_expression_list ')'
-      { $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, $3); }
-    | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' ')'
-      { $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, $3); }
+
+    | TOKEN_LAUNCH '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5);
+          Expr *launchCount[3] = {$3, oneExpr, oneExpr};
+          $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5);
+          Expr *launchCount[3] = {$3, oneExpr, oneExpr};
+          $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, launchCount);
+      }
+
+    | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7);
+          Expr *launchCount[3] = {$3, $5, oneExpr};
+          $$ = new FunctionCallExpr($7, $9, Union(@7,@10), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7);
+          Expr *launchCount[3] = {$3, $5, oneExpr};
+          $$ = new FunctionCallExpr($7, new ExprList(Union(@7,@8)), Union(@7,@9), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @8);
+          Expr *launchCount[3] = {$6, $3, oneExpr};
+          $$ = new FunctionCallExpr($8, $10, Union(@8,@11), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @8);
+          Expr *launchCount[3] = {$6, $3, oneExpr};
+          $$ = new FunctionCallExpr($8, new ExprList(Union(@8,@9)), Union(@8,@10), true, launchCount);
+      }
+
+    | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ',' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          Expr *launchCount[3] = {$3, $5, $7};
+          $$ = new FunctionCallExpr($9, $11, Union(@9,@12), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ',' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          Expr *launchCount[3] = {$3, $5, $7};
+          $$ = new FunctionCallExpr($9, new ExprList(Union(@9,@10)), Union(@9,@11), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          Expr *launchCount[3] = {$9, $6, $3};
+          $$ = new FunctionCallExpr($11, $13, Union(@11,@14), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          Expr *launchCount[3] = {$9, $6, $3};
+          $$ = new FunctionCallExpr($11, new ExprList(Union(@11,@12)), Union(@11,@13), true, launchCount);
+      }
+
 
     | TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
        {
@@ -377,13 +435,13 @@ launch_expression
                 "around function call expression.");
           $$ = NULL;
        }
-    | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>'
+    | TOKEN_LAUNCH '[' assignment_expression ']' '<' postfix_expression '(' argument_expression_list ')' '>'
        {
           Error(Union(@5, @10), "\"launch\" expressions no longer take '<' '>' "
                 "around function call expression.");
           $$ = NULL;
        }
-    | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' ')' '>'
+    | TOKEN_LAUNCH '[' assignment_expression ']' '<' postfix_expression '(' ')' '>'
        {
           Error(Union(@5, @9), "\"launch\" expressions no longer take '<' '>' "
                 "around function call expression.");
@@ -2214,9 +2272,24 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) {
 
     Symbol *taskIndexSym = new Symbol("taskIndex", pos, type);
     m->symbolTable->AddVariable(taskIndexSym);
-
+    
     Symbol *taskCountSym = new Symbol("taskCount", pos, type);
     m->symbolTable->AddVariable(taskCountSym);
+    
+    Symbol *taskIndexSym0 = new Symbol("taskIndex0", pos, type);
+    m->symbolTable->AddVariable(taskIndexSym0);
+    Symbol *taskIndexSym1 = new Symbol("taskIndex1", pos, type);
+    m->symbolTable->AddVariable(taskIndexSym1);
+    Symbol *taskIndexSym2 = new Symbol("taskIndex2", pos, type);
+    m->symbolTable->AddVariable(taskIndexSym2);
+
+    
+    Symbol *taskCountSym0 = new Symbol("taskCount0", pos, type);
+    m->symbolTable->AddVariable(taskCountSym0);
+    Symbol *taskCountSym1 = new Symbol("taskCount1", pos, type);
+    m->symbolTable->AddVariable(taskCountSym1);
+    Symbol *taskCountSym2 = new Symbol("taskCount2", pos, type);
+    m->symbolTable->AddVariable(taskCountSym2);
 }
 
 
diff --git a/perf.ini b/perf.ini
index 249c25f4..eea017de 100755
--- a/perf.ini
+++ b/perf.ini
@@ -8,26 +8,29 @@
 %    #***
 %    [% comment]
 %****************************************************************************************************
+% All parameters of iteration number must be at the end of command string. Now all of the, are default (3 7 1).
 AOBench
 aobench
-10 512 512
+% --scale= from parameters
+2048 2048
 #***
 Deferred Shading
 deferred
+% --scale= from data and third parameter
 data/pp1280x720.bin
 #***
 Mandelbrot Set
 mandelbrot
-
+--scale=1.0
 #***
 Mandelbrot Set
 mandelbrot_tasks
-
+--scale=8.0
 ^
 #***
 Perlin Noise Function
 noise
-
+--scale=4.0
 #***
 Binomial Options
 options
@@ -41,14 +44,15 @@ options
 #***
 Ray Tracer
 rt
-sponza
+sponza --scale=6.0
 #***
 3D Stencil
 stencil
-
+--scale=2.0
 #***
 Volume Rendering
 volume_rendering
+% --scale= from data
 camera.dat density_highres.vol
 #***
 Sort
diff --git a/perf.py b/perf.py
index d1134990..65895335 100755
--- a/perf.py
+++ b/perf.py
@@ -99,16 +99,19 @@ def analyse_test(c1, c2, test, b_serial, perf_temp_n):
             j+=1
         if "million cycles" in line:
             if j == c1:
-                line = line.replace("]","[")
-                line = line.split("[")
-                number = float(line[3])
-                if "tasks" in line[1]:
-                    absolute_tasks.append(number)
+                if line[0] == '@':
+                    print_debug(line, True, perf_log)
                 else:
-                    if "ispc" in line[1]:
-                        absolute_ispc.append(number)
-                if "serial" in line[1]:
-                    serial.append(number)
+                    line = line.replace("]","[")
+                    line = line.split("[")
+                    number = float(line[3])
+                    if "tasks" in line[1]:
+                        absolute_tasks.append(number)
+                    else:
+                        if "ispc" in line[1]:
+                            absolute_ispc.append(number)
+                    if "serial" in line[1]:
+                        serial.append(number)
 
     if len(ispc) != 0:
         if len(tasks) != 0:
diff --git a/stdlib.ispc b/stdlib.ispc
index 6768594b..3b17283d 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -1209,6 +1209,13 @@ packed_store_active(uniform unsigned int a[],
     return __packed_store_active(a, vals, (UIntMaskType)__mask);
 }
 
+static inline uniform int
+packed_store_active2(uniform unsigned int a[],
+                    unsigned int vals) {
+    return __packed_store_active2(a, vals, (UIntMaskType)__mask);
+}
+
+
 static inline uniform int 
 packed_load_active(uniform int a[], varying int * uniform vals) {
     return __packed_load_active(a, vals, (IntMaskType)__mask);
@@ -1219,6 +1226,12 @@ packed_store_active(uniform int a[], int vals) {
     return __packed_store_active(a, vals, (IntMaskType)__mask);
 }
 
+static inline uniform int
+packed_store_active2(uniform int a[], int vals) {
+    return __packed_store_active2(a, vals, (IntMaskType)__mask);
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // System information
 
diff --git a/test_static.cpp b/test_static.cpp
index 8985fdb3..27a5b136 100644
--- a/test_static.cpp
+++ b/test_static.cpp
@@ -62,17 +62,20 @@ extern "C" {
     extern void f_di(float *result, double *a, int *b);
     extern void result(float *val);
 
-    void ISPCLaunch(void **handlePtr, void *f, void *d, int);
+    void ISPCLaunch(void **handlePtr, void *f, void *d, int,int,int);
     void ISPCSync(void *handle);
     void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
 }
-
-void ISPCLaunch(void **handle, void *f, void *d, int count) {
+ 
+void ISPCLaunch(void **handle, void *f, void *d, int count0, int count1, int count2) {
     *handle = (void *)0xdeadbeef;
-    typedef void (*TaskFuncType)(void *, int, int, int, int);
+    typedef void (*TaskFuncType)(void *, int, int, int, int, int, int, int, int, int, int);
     TaskFuncType func = (TaskFuncType)f;
-    for (int i = 0; i < count; ++i)
-        func(d, 0, 1, i, count);
+    int count = count0*count1*count2, idx = 0;
+    for (int k = 0; k < count2; ++k)
+      for (int j = 0; j < count1; ++j)
+        for (int i = 0; i < count0; ++i)
+        func(d, 0, 1, idx++, count, i,j,k,count0,count1,count2);
 }
 
 void ISPCSync(void *) {
diff --git a/tests/launch-8.ispc b/tests/launch-8.ispc
new file mode 100644
index 00000000..eacba673
--- /dev/null
+++ b/tests/launch-8.ispc
@@ -0,0 +1,42 @@
+
+export uniform int width() { return programCount; }
+
+
+#define N0 10
+#define N1 20
+#define N2 50
+static uniform float array[N2][N1][N0];
+
+task void x(const float f) {
+    uniform int j;
+
+    assert(taskCount  == (int32)N0*N1*N2);
+    assert(taskCount0 == (int32)N0);
+    assert(taskCount1 == (int32)N1);
+    assert(taskCount2 == (int32)N2);
+    assert(taskIndex  == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2));
+    assert(taskIndex0 < (int32)N0);
+    assert(taskIndex1 < (int32)N1);
+    assert(taskIndex2 < (int32)N2);
+
+    const uniform int i0 = taskIndex0;
+    const uniform int i1 = taskIndex1;
+    const uniform int i2 = taskIndex2;
+    const uniform int i  = taskIndex;
+    array[i2][i1][i0] = i / 10000.;
+    cfor (j = 0; j < 10000; ++j)
+        array[i2][i1][i0] = sin(array[i2][i1][i0]);
+    if (array[i2][i1][i0] < .02)
+        array[i2][i1][i0] = i;
+}
+export void f_f(uniform float RET[], uniform float fFOO[]) { 
+    float f = fFOO[programIndex];
+    launch[N2][N1][N0] x(f);
+    sync;
+    RET[programIndex] = array[N2-1][N1-1][N0-1];
+}
+
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 9999.000000;
+}
diff --git a/tests/launch-9.ispc b/tests/launch-9.ispc
new file mode 100644
index 00000000..1952e8e7
--- /dev/null
+++ b/tests/launch-9.ispc
@@ -0,0 +1,42 @@
+
+export uniform int width() { return programCount; }
+
+
+#define N0 10
+#define N1 20
+#define N2 50
+static uniform float array[N2][N1][N0];
+
+task void x(const float f) {
+    uniform int j;
+
+    assert(taskCount  == (int32)N0*N1*N2);
+    assert(taskCount0 == (int32)N0);
+    assert(taskCount1 == (int32)N1);
+    assert(taskCount2 == (int32)N2);
+    assert(taskIndex  == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2));
+    assert(taskIndex0 < (int32)N0);
+    assert(taskIndex1 < (int32)N1);
+    assert(taskIndex2 < (int32)N2);
+
+    const uniform int i0 = taskIndex0;
+    const uniform int i1 = taskIndex1;
+    const uniform int i2 = taskIndex2;
+    const uniform int i  = taskIndex;
+    array[i2][i1][i0] = i / 10000.;
+    cfor (j = 0; j < 10000; ++j)
+        array[i2][i1][i0] = sin(array[i2][i1][i0]);
+    if (array[i2][i1][i0] < .02)
+        array[i2][i1][i0] = i;
+}
+export void f_f(uniform float RET[], uniform float fFOO[]) { 
+    float f = fFOO[programIndex];
+    launch[N0,N1,N2] x(f);
+    sync;
+    RET[programIndex] = array[N2-1][N1-1][N0-1];
+}
+
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 9999.000000;
+}
diff --git a/tests/packed-store2-1.ispc b/tests/packed-store2-1.ispc
new file mode 100644
index 00000000..0ca3230a
--- /dev/null
+++ b/tests/packed-store2-1.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform int pack[2+programCount];
+    for (uniform int i = 0; i < 2+programCount; ++i)
+        pack[i] = 0;
+    packed_store_active2(&pack[2], a);
+    RET[programIndex] = pack[programIndex]; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programIndex-1;
+    RET[0] = RET[1] = 0;
+}
diff --git a/tests/packed-store2-2.ispc b/tests/packed-store2-2.ispc
new file mode 100644
index 00000000..c29230ca
--- /dev/null
+++ b/tests/packed-store2-2.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform int pack[2+programCount];
+    uniform int number;
+    for (uniform int i = 0; i < 2+programCount; ++i)
+        pack[i] = 0;
+    if ((int)a & 1)
+        number = packed_store_active2(&pack[2], a);
+    pack[2+number] = 0;
+    RET[programIndex] = pack[programIndex]; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    uniform int val = 1;
+    for (uniform int i = 2; i < 2+programCount/2; ++i, val += 2)
+        RET[i] = val;
+}
diff --git a/tests/packed-store2-3.ispc b/tests/packed-store2-3.ispc
new file mode 100644
index 00000000..9192525e
--- /dev/null
+++ b/tests/packed-store2-3.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform int pack[2+programCount];
+    for (uniform int i = 0; i < 2+programCount; ++i)
+        pack[i] = 0;
+    uniform int count = 0;
+    if ((int)a & 1)
+        count += packed_store_active2(&pack[2], a);
+    RET[programIndex] = count;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programCount == 1) ? 1 : programCount/2;
+}
diff --git a/tests/packed-store2.ispc b/tests/packed-store2.ispc
new file mode 100644
index 00000000..13973bc3
--- /dev/null
+++ b/tests/packed-store2.ispc
@@ -0,0 +1,15 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform unsigned int pack[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        pack[i] = 0;
+    packed_store_active2(pack, (unsigned int)a);
+    RET[programIndex] = pack[programIndex]; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/type.cpp b/type.cpp
index cef8083e..cf7ac85d 100644
--- a/type.cpp
+++ b/type.cpp
@@ -3021,6 +3021,12 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
         callTypes.push_back(LLVMTypes::Int32Type); // threadCount
         callTypes.push_back(LLVMTypes::Int32Type); // taskIndex
         callTypes.push_back(LLVMTypes::Int32Type); // taskCount
+        callTypes.push_back(LLVMTypes::Int32Type); // taskIndex0
+        callTypes.push_back(LLVMTypes::Int32Type); // taskIndex1
+        callTypes.push_back(LLVMTypes::Int32Type); // taskIndex2
+        callTypes.push_back(LLVMTypes::Int32Type); // taskCount0
+        callTypes.push_back(LLVMTypes::Int32Type); // taskCount1
+        callTypes.push_back(LLVMTypes::Int32Type); // taskCount2
     }
     else
         // Otherwise we already have the types of the arguments