diff --git a/Makefile b/Makefile
index 1fc77dff..4af7aff4 100644
--- a/Makefile
+++ b/Makefile
@@ -140,7 +140,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=nvptx64 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+TARGETS=nvptx64 avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
 	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
 	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 ifneq ($(ARM_ENABLED), 0)
diff --git a/alloy.py b/alloy.py
index d56ef3a4..0aaf3d8d 100755
--- a/alloy.py
+++ b/alloy.py
@@ -212,10 +212,10 @@ def check_targets():
                 answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"]
             if AVX11 == False and "rdrand" in f_lines[i]:
                 AVX11 = True;
-                answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"]
+                answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16", "avx1.1-i64x4"]
             if AVX2 == False and "avx2" in f_lines[i]:
                 AVX2 = True;
-                answer = answer + ["avx2-i32x8", "avx2-i32x16"]
+                answer = answer + ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"]
     if current_OS == "MacOS":
         f_lines = take_lines("sysctl machdep.cpu.features", "first")
         if "SSE2" in f_lines:
@@ -229,10 +229,10 @@ def check_targets():
             answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"]
         if "RDRAND" in f_lines:
             AVX11 = True;
-            answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"]
+            answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16", "avx1.1-i64x4"]
         if "AVX2.0" in f_lines:
             AVX2 = True;
-            answer = answer + ["avx2-i32x8", "avx2-i32x16"]
+            answer = answer + ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"]
 
     answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"]
     # now check what targets we have with the help of SDE
@@ -257,9 +257,9 @@ def check_targets():
         if AVX == False and "snb" in f_lines[i]:
             answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]]
         if AVX11 == False and "ivb" in f_lines[i]:
-            answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]]
+            answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"], ["-ivb", "avx1.1-i64x4"]]
         if AVX2 == False and "hsw" in f_lines[i]:
-            answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]]
+            answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"], ["-hsw", "avx2-i64x4"]]
     return [answer, answer_sde]
 
 def build_ispc(version_LLVM, make):
@@ -274,29 +274,38 @@ def build_ispc(version_LLVM, make):
 
 def execute_stability(stability, R, print_version):
     stability1 = copy.deepcopy(stability)
-    temp = run_tests.run_tests(stability1, [], print_version)
+    b_temp = run_tests.run_tests(stability1, [], print_version)
+    temp = b_temp[0]
+    time = b_temp[1]
     for j in range(0,4):
         R[j][0] = R[j][0] + temp[j]
         for i in range(0,len(temp[j])):
             R[j][1].append(temp[4])
     number_of_fails = temp[5]
     number_of_new_fails = len(temp[0]) + len(temp[1])
+    number_of_passes = len(temp[2]) + len(temp[3])
     if number_of_fails == 0:
         str_fails = ". No fails"
     else:
         str_fails = ". Fails: " + str(number_of_fails)
     if number_of_new_fails == 0:
-        str_new_fails = ", No new fails.\n"
+        str_new_fails = ", No new fails"
     else:
-        str_new_fails = ", New fails: " + str(number_of_new_fails) + ".\n"
-    print_debug(temp[4][1:-3] + str_fails + str_new_fails, False, stability_log)
+        str_new_fails = ", New fails: " + str(number_of_new_fails)
+    if number_of_passes == 0:
+        str_new_passes = "."
+    else:
+        str_new_passes = ", " + str(number_of_passes) + " new passes."
+    if stability.time:
+        str_time = " " + time + "\n"
+    else:
+        str_time = "\n"
+    print_debug(temp[4][1:-3] + str_fails + str_new_fails + str_new_passes + str_time, False, stability_log)
 
 def run_special_tests():
    i = 5 
 
-def validation_run(only, only_targets, reference_branch, number, notify, update, make):
-    if os.environ["ISPC_HOME"] != os.getcwd():
-        error("you ISPC_HOME and your current pass are different!\n", 2)
+def validation_run(only, only_targets, reference_branch, number, notify, update, speed_number, make, perf_llvm, time):
     os.chdir(os.environ["ISPC_HOME"])
     os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"]
     if options.notify != "":
@@ -322,9 +331,9 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
         stability.random = False
         stability.ispc_flags = ""
         stability.compiler_exe = None
-        stability.num_jobs = 1024
+        stability.num_jobs = speed_number
         stability.verbose = False
-        stability.time = False
+        stability.time = time
         stability.non_interactive = True
         stability.update = update
         stability.include_file = None
@@ -476,28 +485,36 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
 # prepare LLVM 3.3 as newest LLVM
         need_LLVM = check_LLVM(["3.3"])
         if len(need_LLVM) != 0:
-            build_LLVM(need_LLVM[i], "", "", "", False, False, False, True, False, make)
-# prepare reference point. build both test and reference compilers
-        try_do_LLVM("apply git", "git branch", True)
-        temp4 = take_lines("git branch", "all")
-        for line in temp4:
-            if "*" in line:
-                current_branch = line[2:-1]
-        stashing = True
-        sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n")
-        if "No local changes" in take_lines("git stash", "first"):
-            stashing = False
-        #try_do_LLVM("stash current branch ", "git stash", True)
-        try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True)
-        sys.stdout.write(".\n")
-        build_ispc("3.3", make)
-        sys.stdout.write(".\n")
-        os.rename("ispc", "ispc_ref")
-        try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True)
-        if stashing:
-            try_do_LLVM("return current branch ", "git stash pop", True)
-        sys.stdout.write("You can interrupt script now.\n")
-        build_ispc("3.3", make)
+            build_LLVM(need_LLVM[0], "", "", "", False, False, False, True, False, make)
+        if perf_llvm == False:
+            # prepare reference point. build both test and reference compilers
+            try_do_LLVM("apply git", "git branch", True)
+            temp4 = take_lines("git branch", "all")
+            for line in temp4:
+                if "*" in line:
+                    current_branch = line[2:-1]
+            stashing = True
+            sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n")
+            if "No local changes" in take_lines("git stash", "first"):
+                stashing = False
+            #try_do_LLVM("stash current branch ", "git stash", True)
+            try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True)
+            sys.stdout.write(".\n")
+            build_ispc("3.3", make)
+            sys.stdout.write(".\n")
+            os.rename("ispc", "ispc_ref")
+            try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True)
+            if stashing:
+                try_do_LLVM("return current branch ", "git stash pop", True)
+            sys.stdout.write("You can interrupt script now.\n")
+            build_ispc("3.3", make)
+        else:
+            # build compiler with two different LLVM versions
+            if len(check_LLVM([reference_branch])) != 0:
+                error("you haven't got llvm called " + reference_branch, 1)
+            build_ispc("3.3", make)
+            os.rename("ispc", "ispc_ref")
+            build_ispc(reference_branch, make)
 # begin validation run for performance. output is inserted into perf()
         perf.perf(performance, [])
         if options.notify != "":
@@ -560,16 +577,26 @@ def Main():
     stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log"
     current_path = os.getcwd()
     make = "make -j" + options.speed
+    if os.environ["ISPC_HOME"] != os.getcwd():
+        error("you ISPC_HOME and your current path are different!\n", 2)
+    if options.perf_llvm == True:
+        if options.branch == "master":
+            options.branch = "trunk"
     try:
+        start_time = time.time()
         if options.build_llvm:
             build_LLVM(options.version, options.revision, options.folder, options.tarball,
                     options.debug, options.selfbuild, options.extra, False, options.force, make)
         if options.validation_run:
             validation_run(options.only, options.only_targets, options.branch,
-                    options.number_for_performance, options.notify, options.update, make)
+                    options.number_for_performance, options.notify, options.update, int(options.speed),
+                    make, options.perf_llvm, options.time)
+        elapsed_time = time.time() - start_time
+        if options.time:
+            print_debug("Elapsed time: " + time.strftime('%Hh%Mm%Ssec.', time.gmtime(elapsed_time)) + "\n", False, "")
     finally:
         os.chdir(current_path)
-        date_name = "alloy_results_" + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
+        date_name = "alloy_results_" + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
         if os.path.exists(date_name):
             error("It's forbidden to run alloy two times in a second, logs are in ./logs", 1)
         os.rename(f_date, date_name)
@@ -656,11 +683,15 @@ run_group.add_option('--update-errors', dest='update',
 run_group.add_option('--only-targets', dest='only_targets',
     help='set list of targets to test. Possible values - all subnames of targets.',
     default="")
+run_group.add_option('--time', dest='time',
+    help='display time of testing', default=False, action='store_true')
 run_group.add_option('--only', dest='only',
     help='set types of tests. Possible values:\n' + 
         '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' +
         'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).',
         default="")
+run_group.add_option('--perf_LLVM', dest='perf_llvm',
+    help='compare LLVM 3.3 with "--compare-with", default trunk', default=False, action='store_true')
 parser.add_option_group(run_group)
 # options for activity "setup PATHS"
 setup_group = OptionGroup(parser, "Options for setup",
diff --git a/ast.cpp b/ast.cpp
index 83ee207d..60b20a80 100644
--- a/ast.cpp
+++ b/ast.cpp
@@ -223,7 +223,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         else if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
             fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data);
             fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data);
-            fce->launchCountExpr = (Expr *)WalkAST(fce->launchCountExpr, preFunc,
+            for (int k = 0; k < 3; k++)
+              fce->launchCountExpr[0] = (Expr *)WalkAST(fce->launchCountExpr[0], preFunc,
                                                    postFunc, data);
         }
         else if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL) {
diff --git a/builtins.cpp b/builtins.cpp
index 48ce9afb..e3141725 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -979,6 +979,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     }
     case Target::AVX11: {
         switch (g->target->getVectorWidth()) {
+        case 4:
+            if (runtime32) {
+                EXPORT_MODULE(builtins_bitcode_avx11_i64x4_32bit);
+            }
+            else {
+                EXPORT_MODULE(builtins_bitcode_avx11_i64x4_64bit);
+            }
+            break;
         case 8:
             if (runtime32) {
                 EXPORT_MODULE(builtins_bitcode_avx11_32bit);
@@ -1002,6 +1010,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     }
     case Target::AVX2: {
         switch (g->target->getVectorWidth()) {
+        case 4:
+            if (runtime32) {
+                EXPORT_MODULE(builtins_bitcode_avx2_i64x4_32bit);
+            }
+            else {
+                EXPORT_MODULE(builtins_bitcode_avx2_i64x4_64bit);
+            }
+            break;
         case 8:
             if (runtime32) {
                 EXPORT_MODULE(builtins_bitcode_avx2_32bit);
diff --git a/builtins/target-avx11-i64x4.ll b/builtins/target-avx11-i64x4.ll
new file mode 100644
index 00000000..8fe75266
--- /dev/null
+++ b/builtins/target-avx11-i64x4.ll
@@ -0,0 +1,120 @@
+;;  Copyright (c) 2013, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx1-i64x4base.ll')
+
+ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
+       `rdrand_definition()')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(float)
+gen_gather(i64)
+gen_gather(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+define(`expand_4to8', `
+  %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+')
+define(`extract_4from8', `
+  %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone {
+  expand_4to8(i16, v4, v) 
+  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  extract_4from8(float, r, ret)
+  ret <4 x float> %ret
+}
+
+define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone {
+  expand_4to8(float, v4, v) 
+  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  extract_4from8(i16, r, ret)
+  ret <4 x i16> %ret
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
diff --git a/builtins/target-avx2-i64x4.ll b/builtins/target-avx2-i64x4.ll
new file mode 100644
index 00000000..d74f32dc
--- /dev/null
+++ b/builtins/target-avx2-i64x4.ll
@@ -0,0 +1,355 @@
+;;  Copyright (c) 2013, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+ifelse(LLVM_VERSION, `LLVM_3_1', `',
+       `define(`HAVE_GATHER', `1')')
+
+include(`target-avx1-i64x4base.ll')
+
+ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
+       `rdrand_definition()')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+;; declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+;; declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readonly
+
+define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+;; declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readonly
+;; declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readonly
+
+define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %m
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+
+
+define(`expand_4to8', `
+  %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+')
+define(`extract_4from8', `
+  %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone {
+  expand_4to8(i16, v4, v) 
+  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  extract_4from8(float, r, ret)
+  ret <4 x float> %ret
+}
+
+define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone {
+  expand_4to8(float, v4, v) 
+  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  extract_4from8(i16, r, ret)
+  ret <4 x i16> %ret
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+declare void @llvm.trap() noreturn nounwind
+
+
+ifelse(LLVM_VERSION, `LLVM_3_1', `
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)', `
+
+gen_gather(i8)
+gen_gather(i16)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 gathers
+
+declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %target, i8 * %ptr,
+                       <4 x i32> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
+declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
+
+define <4 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
+                             i32 %scale, <4 x i32> %offsets,
+                             <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %scale8  = trunc i32 %scale to i8
+  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
+
+  %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * %ptr,
+                             <4 x i32> %offsets, <4 x i32> %vecmask, i8 %scale8)
+  ret <4 x i32> %v
+}
+
+
+define <4 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
+                             i32 %scale, <4 x i64> %offsets,
+                             <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %scale8  = trunc i32 %scale to i8
+  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
+
+  %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, 
+                      <4 x i64> %offsets, <4 x i32> %vecmask, i8 %scale8)
+
+  ret <4 x i32> %v
+}
+
+
+define <4 x i32> @__gather32_i32(<4 x i32> %ptrs,
+                                 <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+
+  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
+
+  %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * null,
+                      <4 x i32> %ptrs, <4 x i32> %vecmask, i8 1)
+  
+  ret <4 x i32> %v
+}
+
+
+define <4 x i32> @__gather64_i32(<4 x i64> %ptrs, 
+                                 <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
+
+  %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
+                      <4 x i64> %ptrs, <4 x i32> %vecmask, i8 1)
+
+  ret <4 x i32> %v
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float gathers
+
+declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %target, i8 * %ptr,
+                       <4 x i32> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
+declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
+
+define <4 x float> @__gather_base_offsets32_float(i8 * %ptr,
+                                  i32 %scale, <4 x i32> %offsets,
+                                  <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %scale8 = trunc i32 %scale to i8
+  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
+  %mask = bitcast <4 x i32> %vecmask to <4 x float>
+
+  %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * %ptr,
+                       <4 x i32> %offsets, <4 x float> %mask, i8 %scale8)
+
+  ret <4 x float> %v
+}
+
+
+define <4 x float> @__gather_base_offsets64_float(i8 * %ptr,
+                                   i32 %scale, <4 x i64> %offsets,
+                                   <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %scale8 = trunc i32 %scale to i8
+  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
+  %mask = bitcast <4 x i32> %vecmask to <4 x float>
+
+  %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, 
+                     <4 x i64> %offsets, <4 x float> %mask, i8 %scale8)
+
+  ret <4 x float> %v
+}
+
+
+define <4 x float> @__gather32_float(<4 x i32> %ptrs, 
+                                     <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
+  %mask = bitcast <4 x i32> %vecmask to <4 x float>
+
+  %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * null,
+                     <4 x i32> %ptrs, <4 x float> %mask, i8 1)
+
+  ret <4 x float> %v
+}
+
+
+define <4 x float> @__gather64_float(<4 x i64> %ptrs, 
+                                     <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
+  %mask = bitcast <4 x i32> %vecmask to <4 x float>
+
+  %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
+                      <4 x i64> %ptrs, <4 x float> %mask, i8 1)
+
+  ret <4 x float> %v
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int64 gathers
+
+declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
+                       <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
+declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
+
+define <4 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
+                             i32 %scale, <4 x i32> %offsets,
+                             <4 x i64> %vecmask) nounwind readonly alwaysinline {
+  %scale8 = trunc i32 %scale to i8
+
+  %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
+                             <4 x i32> %offsets, <4 x i64> %vecmask, i8 %scale8)
+
+  ret <4 x i64> %v
+}
+
+
+define <4 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
+                             i32 %scale, <4 x i64> %offsets,
+                             <4 x i64> %vecmask) nounwind readonly alwaysinline {
+  %scale8 = trunc i32 %scale to i8
+
+  %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, 
+                      <4 x i64> %offsets, <4 x i64> %vecmask, i8 %scale8)
+
+  ret <4 x i64> %v
+}
+
+
+define <4 x i64> @__gather32_i64(<4 x i32> %ptrs, 
+                                 <4 x i64> %vecmask) nounwind readonly alwaysinline {
+
+  %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
+                      <4 x i32> %ptrs, <4 x i64> %vecmask, i8 1)
+  ret <4 x i64> %v
+}
+
+
+define <4 x i64> @__gather64_i64(<4 x i64> %ptrs, 
+                                 <4 x i64> %vecmask) nounwind readonly alwaysinline {
+  %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
+                      <4 x i64> %ptrs, <4 x i64> %vecmask, i8 1)
+  ret <4 x i64> %v
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double gathers
+
+declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
+                       <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
+
+define <4 x double> @__gather_base_offsets32_double(i8 * %ptr,
+                             i32 %scale, <4 x i32> %offsets,
+                             <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %scale8 = trunc i32 %scale to i8
+  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>
+
+  %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
+                             <4 x i32> %offsets, <4 x double> %vecmask, i8 %scale8)
+  ret <4 x double> %v
+}
+
+define <4 x double> @__gather_base_offsets64_double(i8 * %ptr,
+                             i32 %scale, <4 x i64> %offsets,
+                             <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %scale8 = trunc i32 %scale to i8
+  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>
+
+  %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, 
+                      <4 x i64> %offsets, <4 x double> %vecmask, i8 %scale8)
+
+  ret <4 x double> %v
+}
+
+define <4 x double> @__gather32_double(<4 x i32> %ptrs, 
+                                       <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>
+
+  %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
+                      <4 x i32> %ptrs, <4 x double> %vecmask, i8 1)
+
+  ret <4 x double> %v
+}
+
+define <4 x double> @__gather64_double(<4 x i64> %ptrs, 
+                                       <4 x i64> %vecmask64) nounwind readonly alwaysinline {
+  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>
+
+  %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
+                      <4 x i64> %ptrs, <4 x double> %vecmask, i8 1)
+
+  ret <4 x double> %v
+}
+
+')
diff --git a/builtins/util.m4 b/builtins/util.m4
index 11501780..c90e8adc 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1813,7 +1813,7 @@ define(`stdlib_core', `
 declare i32 @__fast_masked_vload()
 
 declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
-declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
+declare void @ISPCLaunch(i8**, i8*, i8*, i32,i32,i32) nounwind
 declare void @ISPCSync(i8*) nounwind
 declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind
 
diff --git a/cbackend.cpp b/cbackend.cpp
index 481ca3fd..5a1ef705 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -558,8 +558,15 @@ char CWriter::ID = 0;
 static std::string CBEMangle(const std::string &S) {
   std::string Result;
 
-  for (unsigned i = 0, e = S.size(); i != e; ++i)
-    if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
+  for (unsigned i = 0, e = S.size(); i != e; ++i) {
+    if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') ||
+                     (S[i] == '<' && S[i+1] == '<'))) {
+      Result += '_';
+      Result += 'A'+(S[i]&15);
+      Result += 'A'+((S[i]>>4)&15);
+      Result += '_';
+      i++;
+    } else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
       Result += S[i];
     } else {
       Result += '_';
@@ -567,6 +574,7 @@ static std::string CBEMangle(const std::string &S) {
       Result += 'A'+((S[i]>>4)&15);
       Result += '_';
     }
+  }
   return Result;
 }
 
diff --git a/ctx.cpp b/ctx.cpp
index c50d22f9..3aee776a 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -3502,7 +3502,7 @@ FunctionEmitContext::ReturnInst() {
 llvm::Value *
 FunctionEmitContext::LaunchInst(llvm::Value *callee,
                                 std::vector<llvm::Value *> &argVals,
-                                llvm::Value *launchCount) {
+                                llvm::Value *launchCount[3]){
     if (callee == NULL) {
         AssertPos(currentPos, m->errorCount > 0);
         return NULL;
@@ -3563,7 +3563,9 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     args.push_back(launchGroupHandlePtr);
     args.push_back(fptr);
     args.push_back(voidmem);
-    args.push_back(launchCount);
+    args.push_back(launchCount[0]);
+    args.push_back(launchCount[1]);
+    args.push_back(launchCount[2]);
     return CallInst(flaunch, NULL, args, "");
 }
 
diff --git a/ctx.h b/ctx.h
index 58f9aae3..4dd30053 100644
--- a/ctx.h
+++ b/ctx.h
@@ -542,7 +542,7 @@ public:
         he given argument values. */
     llvm::Value *LaunchInst(llvm::Value *callee,
                             std::vector<llvm::Value *> &argVals,
-                            llvm::Value *launchCount);
+                            llvm::Value *launchCount[3]);
 
     void SyncInst();
 
diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h
index 78d35ddc..d6bf6fd5 100644
--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -2009,43 +2009,37 @@ static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val,
             ptr[i] = val[i];
 }
 
-static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) 
+static FORCEINLINE void __masked_store_i32(void *p, const __vec16_i32 val, const __vec16_i1 mask) 
 {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
   _mm512_mask_store_epi32(p, mask, val.v);
 #else
-  __vec16_i32 tmp;
-  tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-  tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-  tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v);
-  _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-  _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+  _mm512_mask_i32extscatter_epi32(p, mask, __ispc_stride1, val, _MM_DOWNCONV_EPI32_NONE, _MM_SCALE_4, _MM_HINT_NONE);
 #endif
 }
 
-static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, __vec16_i1 mask) 
+static FORCEINLINE void __masked_store_float(void *p, const __vec16_f val, const __vec16_i1 mask) 
 {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
   _mm512_mask_store_ps(p, mask, val.v);
 #else
-  __vec16_f tmp;
-  tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
-  tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
-  tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v);
-  _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
-  _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+  _mm512_mask_i32extscatter_ps(p, mask, __ispc_stride1, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_4, _MM_HINT_NONE);
 #endif
 }
 
-static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
-                                          __vec16_i1 mask) {
-    int64_t *ptr = (int64_t *)p;
-    for (int i = 0; i < 16; ++i)
-        if ((mask.v & (1 << i)) != 0)
-            ptr[i] = val[i];
+static FORCEINLINE void __masked_store_i64(void *p, const __vec16_i64 val, const __vec16_i1 mask) {
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
+  __vec16_i1 tmp_m = mask;
+  tmp_m = _mm512_kswapb(tmp_m, tmp_m);
+  _mm512_mask_store_epi64(p, mask, val.v1);
+  _mm512_mask_store_epi64((uint8_t*)p+64, tmp_m, val.v2);
+#else
+  _mm512_mask_i32loextscatter_epi64(          p,                      mask,  __ispc_stride1, val.v1, _MM_DOWNCONV_EPI64_NONE, _MM_SCALE_8, _MM_HINT_NONE);
+  _mm512_mask_i32loextscatter_epi64((int64_t*)p+8, _mm512_kswapb(mask,mask), __ispc_stride1, val.v2, _MM_DOWNCONV_EPI64_NONE, _MM_SCALE_8, _MM_HINT_NONE);
+#endif
 }
 
-static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 mask) 
+static FORCEINLINE void __masked_store_double(void *p, const __vec16_d val, const __vec16_i1 mask) 
 {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
   __vec16_i1 tmp_m = mask;
@@ -2053,19 +2047,8 @@ static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1
   _mm512_mask_store_pd(p, mask, val.v1);
   _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2);
 #else
-  __vec16_d tmp;
-  __vec16_i1 tmp_m = mask;
-  tmp_m = _mm512_kswapb(tmp_m, tmp_m);
-  tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
-  tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
-  tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
-  tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
-  tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1);
-  tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2);
-  _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
-  _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
-  _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
-  _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+  _mm512_mask_i32loextscatter_pd(           p,                    mask,  __ispc_stride1, val.v1, _MM_DOWNCONV_PD_NONE, _MM_SCALE_8, _MM_HINT_NONE);
+  _mm512_mask_i32loextscatter_pd((double*)p+8, _mm512_kswapb(mask,mask), __ispc_stride1, val.v2, _MM_DOWNCONV_PD_NONE, _MM_SCALE_8, _MM_HINT_NONE);
 #endif
 }
 
diff --git a/examples/mandelbrot_tasks3d/.gitignore b/examples/mandelbrot_tasks3d/.gitignore
new file mode 100644
index 00000000..c2471c27
--- /dev/null
+++ b/examples/mandelbrot_tasks3d/.gitignore
@@ -0,0 +1,2 @@
+mandelbrot
+*.ppm
diff --git a/examples/mandelbrot_tasks3d/Makefile b/examples/mandelbrot_tasks3d/Makefile
new file mode 100644
index 00000000..3dd44d65
--- /dev/null
+++ b/examples/mandelbrot_tasks3d/Makefile
@@ -0,0 +1,8 @@
+
+EXAMPLE=mandelbrot_tasks3d
+CPP_SRC=mandelbrot_tasks3d.cpp mandelbrot_tasks_serial.cpp
+ISPC_SRC=mandelbrot_tasks3d.ispc
+ISPC_IA_TARGETS=avx,sse2,sse4
+ISPC_ARM_TARGETS=neon
+
+include ../common.mk
diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj
new file mode 100644
index 00000000..3a8fca79
--- /dev/null
+++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj
@@ -0,0 +1,180 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{E80DA7D4-AB22-4648-A068-327307156BE6}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>mandelbrot_tasks</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+    <TargetName>mandelbrot_tasks</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+    <TargetName>mandelbrot_tasks</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+    <TargetName>mandelbrot_tasks</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+    <TargetName>mandelbrot_tasks</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="mandelbrot_tasks.cpp" />
+    <ClCompile Include="mandelbrot_tasks_serial.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="mandelbrot_tasks.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp
new file mode 100644
index 00000000..9cbb966a
--- /dev/null
+++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp
@@ -0,0 +1,146 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include <string.h>
+#include "../timing.h"
+#include "mandelbrot_tasks3d_ispc.h"
+using namespace ispc;
+
+extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
+                              int width, int height, int maxIterations,
+                              int output[]);
+
+/* Write a PPM image file with the image of the Mandelbrot set */
+static void
+writePPM(int *buf, int width, int height, const char *fn) {
+    FILE *fp = fopen(fn, "wb");
+    fprintf(fp, "P6\n");
+    fprintf(fp, "%d %d\n", width, height);
+    fprintf(fp, "255\n");
+    for (int i = 0; i < width*height; ++i) {
+        // Map the iteration count to colors by just alternating between
+        // two greys.
+        char c = (buf[i] & 0x1) ? 240 : 20;
+        for (int j = 0; j < 3; ++j)
+            fputc(c, fp);
+    }
+    fclose(fp);
+    printf("Wrote image file %s\n", fn);
+}
+
+
+static void usage() {
+    fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
+    exit(1);
+}
+
+int main(int argc, char *argv[]) {
+    unsigned int width = 1536;
+    unsigned int height = 1024;
+    float x0 = -2;
+    float x1 = 1;
+    float y0 = -1;
+    float y1 = 1;
+
+    if (argc == 1)
+        ;
+    else if (argc == 2) {
+        if (strncmp(argv[1], "--scale=", 8) == 0) {
+            float scale = atof(argv[1] + 8);
+            if (scale == 0.f)
+                usage();
+            width *= scale;
+            height *= scale;
+            // round up to multiples of 16
+            width = (width + 0xf) & ~0xf;
+            height = (height + 0xf) & ~0xf;
+        }
+        else 
+            usage();
+    }
+    else
+        usage();
+
+    int maxIterations = 512;
+    int *buf = new int[width*height];
+
+    //
+    // Compute the image using the ispc implementation; report the minimum
+    // time of three runs.
+    //
+    double minISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        // Clear out the buffer
+        for (unsigned int i = 0; i < width * height; ++i)
+            buf[i] = 0;
+        reset_and_start_timer();
+        mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
+        double dt = get_elapsed_mcycles();
+        minISPC = std::min(minISPC, dt);
+    }
+
+    printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
+    writePPM(buf, width, height, "mandelbrot-ispc.ppm");
+
+
+    // 
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    //
+    double minSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        // Clear out the buffer
+        for (unsigned int i = 0; i < width * height; ++i)
+            buf[i] = 0;
+        reset_and_start_timer();
+        mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
+        double dt = get_elapsed_mcycles();
+        minSerial = std::min(minSerial, dt);
+    }
+
+    printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
+    writePPM(buf, width, height, "mandelbrot-serial.ppm");
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
+
+    return 0;
+}
diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc
new file mode 100644
index 00000000..395bdca4
--- /dev/null
+++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc
@@ -0,0 +1,99 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+static inline int
+mandel(float c_re, float c_im, int count) {
+    float z_re = c_re, z_im = c_im;
+    int i;
+    for (i = 0; i < count; ++i) {
+        if (z_re * z_re + z_im * z_im > 4.)
+            break;
+
+        float new_re = z_re*z_re - z_im*z_im;
+        float new_im = 2.f * z_re * z_im;
+        unmasked {
+            z_re = c_re + new_re;
+            z_im = c_im + new_im;
+        }
+    }
+
+    return i;
+}
+
+
+/* Task to compute the Mandelbrot iterations for a single scanline.
+ */
+task void
+mandelbrot_scanline(uniform float x0, uniform float dx, 
+                    uniform float y0, uniform float dy,
+                    uniform int width, uniform int height, 
+                    uniform int xspan, uniform int yspan,
+                    uniform int maxIterations, uniform int output[]) {
+    const uniform int xstart = taskIndex0 * xspan;
+    const uniform int xend   = min(xstart  + xspan, width);
+
+    const uniform int ystart = taskIndex1 * yspan;
+    const uniform int yend   = min(ystart  + yspan, height);
+    
+
+    foreach (yi = ystart ... yend, xi = xstart ... xend) {
+        float x = x0 + xi * dx;
+        float y = y0 + yi * dy;
+
+        int index = yi * width + xi;
+        output[index] = mandel(x, y, maxIterations);
+    }
+    
+}
+
+#if 1
+export void
+mandelbrot_ispc(uniform float x0, uniform float y0, 
+                uniform float x1, uniform float y1,
+                uniform int width, uniform int height, 
+                uniform int maxIterations, uniform int output[]) {
+    uniform float dx = (x1 - x0) / width;
+    uniform float dy = (y1 - y0) / height;
+    const uniform int xspan = 16;  /* make sure it is big enough to avoid false-sharing */
+    const uniform int yspan = 16; 
+
+
+#if 1
+    launch [width/xspan, height/yspan]
+#else
+    launch [height/yspan][width/xspan]
+#endif
+      mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan,
+                          maxIterations, output);
+}
+#endif
diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp
new file mode 100644
index 00000000..a76fb5ca
--- /dev/null
+++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp
@@ -0,0 +1,68 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+static int mandel(float c_re, float c_im, int count) {
+    float z_re = c_re, z_im = c_im;
+    int i;
+    for (i = 0; i < count; ++i) {
+        if (z_re * z_re + z_im * z_im > 4.f)
+            break;
+
+        float new_re = z_re*z_re - z_im*z_im;
+        float new_im = 2.f * z_re * z_im;
+        z_re = c_re + new_re;
+        z_im = c_im + new_im;
+    }
+
+    return i;
+}
+
+void mandelbrot_serial(float x0, float y0, float x1, float y1,
+                       int width, int height, int maxIterations,
+                       int output[])
+{
+    float dx = (x1 - x0) / width;
+    float dy = (y1 - y0) / height;
+
+    for (int j = 0; j < height; j++) {
+        for (int i = 0; i < width; ++i) {
+            float x = x0 + i * dx;
+            float y = y0 + j * dy;
+
+            int index = (j * width + i);
+            output[index] = mandel(x, y, maxIterations);
+        }
+    }
+}
+
diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp
index c9c2fa7b..6bc60129 100644
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -170,21 +170,41 @@
 
 // Signature of ispc-generated 'task' functions
 typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
-                             int taskIndex, int taskCount);
+                             int taskIndex, int taskCount,
+                             int taskIndex0, int taskIndex1, int taskIndex2,
+                             int taskCount0, int taskCount1, int taskCount2);
 
 // Small structure used to hold the data for each task
 struct TaskInfo {
     TaskFuncType func;
     void *data;
-    int taskIndex, taskCount;
+    int taskIndex;
+    int taskCount3d[3];
 #if defined(ISPC_IS_WINDOWS)
     event taskEvent;
 #endif
-};
+    int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
+    int taskIndex0() const 
+    {
+      return taskIndex % taskCount3d[0];
+    }
+    int taskIndex1() const 
+    {
+      return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
+    }
+    int taskIndex2() const 
+    {
+      return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
+    }
+    int taskCount0() const { return taskCount3d[0]; }
+    int taskCount1() const { return taskCount3d[1]; }
+    int taskCount2() const { return taskCount3d[2]; }
+    TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); }
+} __attribute__((aligned(32)));
 
 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
-    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
+    void ISPCLaunch(void **handlePtr, void *f, void *data, int countx,int county, int countz);
     void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
     void ISPCSync(void *handle);
 }
@@ -518,7 +538,9 @@ lRunTask(void *ti) {
 
     // Actually run the task
     taskInfo->func(taskInfo->data, threadIndex, threadCount, 
-                   taskInfo->taskIndex, taskInfo->taskCount);
+                   taskInfo->taskIndex, taskInfo->taskCount(),
+            taskInfo->taskIndex0(), taskInfo->taskIndex1(), taskInfo->taskIndex2(),
+            taskInfo->taskCount0(), taskInfo->taskCount1(), taskInfo->taskCount2());
 }
 
 
@@ -559,7 +581,9 @@ lRunTask(LPVOID param) {
     // will cause bugs in code that uses those.
     int threadIndex = 0;
     int threadCount = 1;
-    ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+    ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
 
     // Signal the event that this task is done
     ti->taskEvent.set();
@@ -660,7 +684,9 @@ lTaskEntry(void *arg) {
         DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
         TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
         myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
-                     myTask->taskCount);
+                     myTask->taskCount(),
+            myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(),
+            myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2());
 
         //
         // Decrement the "number of unfinished tasks" counter in the task
@@ -861,7 +887,9 @@ TaskGroup::Sync() {
         // Do work for _myTask_
         //
         // FIXME: bogus values for thread index/thread count here as well..
-        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);
+        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(),
+            myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(),
+            myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2());
 
         //
         // Decrement the number of unfinished tasks counter
@@ -891,7 +919,9 @@ TaskGroup::Launch(int baseIndex, int count) {
 
         // Actually run the task. 
         // Cilk does not expose the task -> thread mapping so we pretend it's 1:1
-        ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount);
+        ti->func(ti->data, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
     }
 }
 
@@ -920,7 +950,9 @@ TaskGroup::Launch(int baseIndex, int count) {
         // Actually run the task. 
         int threadIndex = omp_get_thread_num();
         int threadCount = omp_get_num_threads();
-        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
     }
 }
 
@@ -951,7 +983,9 @@ TaskGroup::Launch(int baseIndex, int count) {
         int threadIndex = ti->taskIndex;
         int threadCount = ti->taskCount;
 
-        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
     });
 }
 
@@ -978,7 +1012,9 @@ TaskGroup::Launch(int baseIndex, int count) {
             // TBB does not expose the task -> thread mapping so we pretend it's 1:1
             int threadIndex = ti->taskIndex;
             int threadCount = ti->taskCount;
-            ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+            ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
+            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
+            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
         });
     }
 }
@@ -1031,7 +1067,8 @@ FreeTaskGroup(TaskGroup *tg) {
 ///////////////////////////////////////////////////////////////////////////
 
 void
-ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
+ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) {
+    const int count = count0*count1*count2;
     TaskGroup *taskGroup;
     if (*taskGroupPtr == NULL) {
         InitTaskSystem();
@@ -1047,7 +1084,9 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
         ti->func = (TaskFuncType)func;
         ti->data = data;
         ti->taskIndex = i;
-        ti->taskCount = count;
+        ti->taskCount3d[0] = count0;
+        ti->taskCount3d[1] = count1;
+        ti->taskCount3d[2] = count2;
     }
     taskGroup->Launch(baseIndex, count);
 }
diff --git a/expr.cpp b/expr.cpp
index 614cb5e5..60d9ce66 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -1660,6 +1660,64 @@ BinaryExpr::BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p)
     arg1 = b;
 }
 
+Expr *lCreateBinaryOperatorCall(const BinaryExpr::Op bop,
+                                Expr *a0, Expr *a1,
+                                const SourcePos &sp)
+{
+    if ((a0 == NULL) || (a1 == NULL)) {
+        return NULL;
+    }
+    Expr *arg0 = a0->TypeCheck();
+    Expr *arg1 = a1->TypeCheck();
+    if ((arg0 == NULL) || (arg1 == NULL)) {
+        return NULL;
+    }
+    const Type *type0 = arg0->GetType();
+    const Type *type1 = arg1->GetType();
+
+    // If either operand is a reference, dereference it before we move
+    // forward
+    if (CastType<ReferenceType>(type0) != NULL) {
+        arg0 = new RefDerefExpr(arg0, arg0->pos);
+        type0 = arg0->GetType();
+    }
+    if (CastType<ReferenceType>(type1) != NULL) {
+        arg1 = new RefDerefExpr(arg1, arg1->pos);
+        type1 = arg1->GetType();
+    }
+    if ((type0 == NULL) || (type1 == NULL)) {
+        return NULL;
+    }
+    if (CastType<StructType>(type0) != NULL ||
+        CastType<StructType>(type1) != NULL) {
+        std::string opName = std::string("operator") + lOpString(bop);
+        std::vector<Symbol *> funs;
+        m->symbolTable->LookupFunction(opName.c_str(), &funs);
+        if (funs.size() == 0) {
+            Error(sp, "operator %s(%s, %s) is not defined.",
+            opName.c_str(), (type0->GetString()).c_str(), (type1->GetString()).c_str());
+            return NULL;
+        }
+        Expr *func = new FunctionSymbolExpr(opName.c_str(), funs, sp);
+        ExprList *args = new ExprList(sp);
+        args->exprs.push_back(arg0);
+        args->exprs.push_back(arg1);
+        Expr *opCallExpr = new FunctionCallExpr(func, args, sp);
+        return opCallExpr;
+    }
+    return NULL;
+}
+
+
+Expr * MakeBinaryExpr(BinaryExpr::Op o, Expr *a, Expr *b, SourcePos p) {
+    Expr * op = lCreateBinaryOperatorCall(o, a, b, p);
+    if (op != NULL) {
+        return op;
+    }
+    op = new BinaryExpr(o, a, b, p);
+    return op;
+}
+
 
 /** Emit code for a && or || logical operator.  In particular, the code
     here handles "short-circuit" evaluation, where the second expression
@@ -2985,29 +3043,10 @@ AssignExpr::TypeCheck() {
     if (lvalueIsReference)
         lvalue = new RefDerefExpr(lvalue, lvalue->pos);
 
-    FunctionSymbolExpr *fse;
-    if ((fse = dynamic_cast<FunctionSymbolExpr *>(rvalue)) != NULL) {
-        // Special case to use the type of the LHS to resolve function
-        // overloads when we're assigning a function pointer where the
-        // function is overloaded.
-        const Type *lvalueType = lvalue->GetType();
-        const FunctionType *ftype;
-        if (CastType<PointerType>(lvalueType) == NULL ||
-            (ftype = CastType<FunctionType>(lvalueType->GetBaseType())) == NULL) {
-            Error(lvalue->pos, "Can't assign function pointer to type \"%s\".",
-                  lvalueType ? lvalueType->GetString().c_str() : "<unknown>");
-            return NULL;
-        }
-
-        std::vector<const Type *> paramTypes;
-        for (int i = 0; i < ftype->GetNumParameters(); ++i)
-            paramTypes.push_back(ftype->GetParameterType(i));
-
-        if (!fse->ResolveOverloads(rvalue->pos, paramTypes)) {
-            Error(pos, "Unable to find overloaded function for function "
-                  "pointer assignment.");
-            return NULL;
-        }
+    if (PossiblyResolveFunctionOverloads(rvalue, lvalue->GetType()) == false) {
+        Error(pos, "Unable to find overloaded function for function "
+                "pointer assignment.");
+        return NULL;
     }
 
     const Type *lhsType = lvalue->GetType();
@@ -3501,11 +3540,13 @@ SelectExpr::Print() const {
 // FunctionCallExpr
 
 FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p,
-                                   bool il, Expr *lce)
+                                   bool il, Expr *lce[3])
     : Expr(p), isLaunch(il) {
     func = f;
     args = a;
-    launchCountExpr = lce;
+    launchCountExpr[0] = lce[0];
+    launchCountExpr[1] = lce[1];
+    launchCountExpr[2] = lce[2];
 }
 
 
@@ -3623,9 +3664,13 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
     llvm::Value *retVal = NULL;
     ctx->SetDebugPos(pos);
     if (ft->isTask) {
-        AssertPos(pos, launchCountExpr != NULL);
-        llvm::Value *launchCount = launchCountExpr->GetValue(ctx);
-        if (launchCount != NULL)
+        AssertPos(pos, launchCountExpr[0] != NULL);
+        llvm::Value *launchCount[3] = 
+        { launchCountExpr[0]->GetValue(ctx),
+          launchCountExpr[1]->GetValue(ctx),
+          launchCountExpr[2]->GetValue(ctx) };
+
+        if (launchCount[0] != NULL)
             ctx->LaunchInst(callee, argVals, launchCount);
     }
     else
@@ -3650,10 +3695,37 @@ FunctionCallExpr::GetLValue(FunctionEmitContext *ctx) const {
         return NULL;
     }
 }
- 
+
+
+bool FullResolveOverloads(Expr * func, ExprList * args,
+                        std::vector<const Type *> *argTypes,
+                        std::vector<bool> *argCouldBeNULL,
+                        std::vector<bool> *argIsConstant) {
+    for (unsigned int i = 0; i < args->exprs.size(); ++i) {
+        Expr *expr = args->exprs[i];
+        if (expr == NULL)
+            return false;
+        const Type *t = expr->GetType();
+        if (t == NULL)
+            return false;
+        argTypes->push_back(t);
+        argCouldBeNULL->push_back(lIsAllIntZeros(expr) || dynamic_cast<NullPointerExpr *>(expr));
+        argIsConstant->push_back(dynamic_cast<ConstExpr *>(expr) || dynamic_cast<NullPointerExpr *>(expr));
+    }
+    return true;
+}
+
 
 const Type *
 FunctionCallExpr::GetType() const {
+    std::vector<const Type *> argTypes;
+    std::vector<bool> argCouldBeNULL, argIsConstant;
+    if (FullResolveOverloads(func, args, &argTypes, &argCouldBeNULL, &argIsConstant) == true) {
+        FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
+        if (fse != NULL) {
+            fse->ResolveOverloads(args->pos, argTypes, &argCouldBeNULL, &argIsConstant);
+        }
+    }
     const FunctionType *ftype = lGetFunctionType(func);
     return ftype ? ftype->GetReturnType() : NULL;
 }
@@ -3689,20 +3761,9 @@ FunctionCallExpr::TypeCheck() {
 
     std::vector<const Type *> argTypes;
     std::vector<bool> argCouldBeNULL, argIsConstant;
-    for (unsigned int i = 0; i < args->exprs.size(); ++i) {
-        Expr *expr = args->exprs[i];
 
-        if (expr == NULL)
-            return NULL;
-        const Type *t = expr->GetType();
-        if (t == NULL)
-            return NULL;
-
-        argTypes.push_back(t);
-        argCouldBeNULL.push_back(lIsAllIntZeros(expr) ||
-                                 dynamic_cast<NullPointerExpr *>(expr));
-        argIsConstant.push_back(dynamic_cast<ConstExpr *>(expr) ||
-                                dynamic_cast<NullPointerExpr *>(expr));
+    if (FullResolveOverloads(func, args, &argTypes, &argCouldBeNULL, &argIsConstant) == false) {
+        return NULL;
     }
 
     FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
@@ -3732,14 +3793,17 @@ FunctionCallExpr::TypeCheck() {
             if (!isLaunch)
                 Error(pos, "\"launch\" expression needed to call function "
                       "with \"task\" qualifier.");
-            if (!launchCountExpr)
+            for (int k = 0; k < 3; k++)
+            {
+              if (!launchCountExpr[k])
                 return NULL;
 
-            launchCountExpr =
-                TypeConvertExpr(launchCountExpr, AtomicType::UniformInt32,
-                                "task launch count");
-            if (launchCountExpr == NULL)
+              launchCountExpr[k] =
+                TypeConvertExpr(launchCountExpr[k], AtomicType::UniformInt32,
+                    "task launch count");
+              if (launchCountExpr[k] == NULL)
                 return NULL;
+            }
         }
         else {
             if (isLaunch) {
@@ -3747,7 +3811,7 @@ FunctionCallExpr::TypeCheck() {
                       "qualified function.");
                 return NULL;
             }
-            AssertPos(pos, launchCountExpr == NULL);
+            AssertPos(pos, launchCountExpr[0] == NULL);
         }
     }
     else {
@@ -7010,7 +7074,8 @@ TypeCastExpr::GetLValue(FunctionEmitContext *ctx) const {
 
 const Type *
 TypeCastExpr::GetType() const {
-    AssertPos(pos, type->HasUnboundVariability() == false);
+    // We have to switch off this assert after supporting of operators.
+    //AssertPos(pos, type->HasUnboundVariability() == false);
     return type;
 }
 
@@ -8190,6 +8255,9 @@ FunctionSymbolExpr::ResolveOverloads(SourcePos argPos,
                                      const std::vector<bool> *argCouldBeNULL,
                                      const std::vector<bool> *argIsConstant) {
     const char *funName = candidateFunctions.front()->name.c_str();
+    if (triedToResolve == true) {
+        return true;
+    }
 
     triedToResolve = true;
 
diff --git a/expr.h b/expr.h
index 42fdff45..0d46191b 100644
--- a/expr.h
+++ b/expr.h
@@ -246,7 +246,8 @@ public:
 class FunctionCallExpr : public Expr {
 public:
     FunctionCallExpr(Expr *func, ExprList *args, SourcePos p,
-                     bool isLaunch = false, Expr *launchCountExpr = NULL);
+                     bool isLaunch = false, 
+                     Expr *launchCountExpr[3] = (Expr*[3]){NULL, NULL, NULL});
 
     llvm::Value *GetValue(FunctionEmitContext *ctx) const;
     llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
@@ -261,7 +262,7 @@ public:
     Expr *func;
     ExprList *args;
     bool isLaunch;
-    Expr *launchCountExpr;
+    Expr *launchCountExpr[3];
 };
 
 
@@ -730,6 +731,8 @@ bool CanConvertTypes(const Type *fromType, const Type *toType,
  */
 Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase);
 
+Expr * MakeBinaryExpr(BinaryExpr::Op o, Expr *a, Expr *b, SourcePos p);
+
 /** Utility routine that emits code to initialize a symbol given an
     initializer expression.
 
diff --git a/fail_db.txt b/fail_db.txt
index f1aaaab2..9c43c7f0 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -1025,3 +1025,38 @@
 ./tests/reduce-equal.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail     x86    avx2-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail  x86-64    avx2-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\exclusive-scan-add-9.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\max-uint-1.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\max-uint.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\min-uint-2.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\packed-load-1.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\packed-store.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-add-uint.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-max-uint.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\uint64-max-1.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\uint64-max.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\uint64-min-1.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\uint64-min.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\exclusive-scan-add-10.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\exclusive-scan-add-9.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\max-uint-1.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\max-uint.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\min-uint-2.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\packed-load-1.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\packed-store.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-add-uint.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-add-uint64-1.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-add-uint64.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-max-uint.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-min-uint.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\uint64-max-1.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.3         cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-min-uint.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.4         cl -O2 *
diff --git a/func.cpp b/func.cpp
index b975049b..af2cc05a 100644
--- a/func.cpp
+++ b/func.cpp
@@ -132,9 +132,28 @@ Function::Function(Symbol *s, Stmt *c) {
         Assert(taskIndexSym);
         taskCountSym = m->symbolTable->LookupVariable("taskCount");
         Assert(taskCountSym);
+
+        taskIndexSym0 = m->symbolTable->LookupVariable("taskIndex0");
+        Assert(taskIndexSym0);
+        taskIndexSym1 = m->symbolTable->LookupVariable("taskIndex1");
+        Assert(taskIndexSym1);
+        taskIndexSym2 = m->symbolTable->LookupVariable("taskIndex2");
+        Assert(taskIndexSym2);
+
+
+        taskCountSym0 = m->symbolTable->LookupVariable("taskCount0");
+        Assert(taskCountSym0);
+        taskCountSym1 = m->symbolTable->LookupVariable("taskCount1");
+        Assert(taskCountSym1);
+        taskCountSym2 = m->symbolTable->LookupVariable("taskCount2");
+        Assert(taskCountSym2);
     }
     else
+    {
         threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL;
+        taskIndexSym0 = taskIndexSym1 = taskIndexSym2 = NULL;
+        taskCountSym0 = taskCountSym1 = taskCountSym2 = NULL;
+    }
 }
 
 
@@ -225,6 +244,12 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
         llvm::Value *threadCount = argIter++;
         llvm::Value *taskIndex = argIter++;
         llvm::Value *taskCount = argIter++;
+        llvm::Value *taskIndex0 = argIter++;
+        llvm::Value *taskIndex1 = argIter++;
+        llvm::Value *taskIndex2 = argIter++;
+        llvm::Value *taskCount0 = argIter++;
+        llvm::Value *taskCount1 = argIter++;
+        llvm::Value *taskCount2 = argIter++;
 
         // Copy the function parameter values from the structure into local
         // storage
@@ -256,6 +281,20 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
 
         taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
         ctx->StoreInst(taskCount, taskCountSym->storagePtr);
+        
+        taskIndexSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex0");
+        ctx->StoreInst(taskIndex0, taskIndexSym0->storagePtr);
+        taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1");
+        ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr);
+        taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2");
+        ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr);
+        
+        taskCountSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount0");
+        ctx->StoreInst(taskCount0, taskCountSym0->storagePtr);
+        taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1");
+        ctx->StoreInst(taskCount1, taskCountSym1->storagePtr);
+        taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2");
+        ctx->StoreInst(taskCount2, taskCountSym2->storagePtr);
     }
     else {
         // Regular, non-task function
diff --git a/func.h b/func.h
index ac3e1447..88a96dbc 100644
--- a/func.h
+++ b/func.h
@@ -60,7 +60,10 @@ private:
     Stmt *code;
     Symbol *maskSymbol;
     Symbol *threadIndexSym, *threadCountSym;
-    Symbol *taskIndexSym, *taskCountSym;
+    Symbol *taskIndexSym,   *taskCountSym;
+    Symbol *taskIndexSym0, *taskCountSym0;
+    Symbol *taskIndexSym1, *taskCountSym1;
+    Symbol *taskIndexSym2, *taskCountSym2;
 };
 
 #endif // ISPC_FUNC_H
diff --git a/ispc.cpp b/ispc.cpp
index bd28fc51..96cf4447 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -515,6 +515,25 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
 #if !defined(LLVM_3_1)
         // LLVM 3.2+ only
         this->m_hasRand = true;
+#endif
+    }
+    else if (!strcasecmp(isa, "avx1.1-i64x4")) {
+        this->m_isa = Target::AVX11;
+        this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
+        this->m_vectorWidth = 4;
+        this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
+#if defined(LLVM_3_4)
+        ",+rdrnd"
+#else
+        ",+rdrand"
+#endif
+        ;           
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 64;
+        this->m_hasHalf = true;
+#if !defined(LLVM_3_1)
+        // LLVM 3.2+ only
+        this->m_hasRand = true;
 #endif
     }
     else if (!strcasecmp(isa, "avx2") ||
@@ -563,6 +582,29 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         // LLVM 3.2+ only
         this->m_hasRand = true;
         this->m_hasGather = true;
+#endif
+    }
+    else if (!strcasecmp(isa, "avx2-i64x4")) {
+        this->m_isa = Target::AVX2;
+        this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
+        this->m_vectorWidth = 4;
+        this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
+#if defined(LLVM_3_4)
+        ",+rdrnd"
+#else
+        ",+rdrand"
+#endif
+#ifndef LLVM_3_1
+            ",+fma"
+#endif // !LLVM_3_1
+            ;
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 64;
+        this->m_hasHalf = true;
+#if !defined(LLVM_3_1)
+        // LLVM 3.2+ only
+        this->m_hasRand = true;
+        this->m_hasGather = true;
 #endif
     }
 #ifdef ISPC_ARM_ENABLED
@@ -740,8 +782,8 @@ Target::SupportedTargets() {
         "sse2-i32x4, sse2-i32x8, "
         "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, "
         "avx1-i32x8, avx1-i32x16, avx1-i64x4, "
-        "avx1.1-i32x8, avx1.1-i32x16, "
-        "avx2-i32x8, avx2-i32x16, "
+        "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 "
+        "avx2-i32x8, avx2-i32x16, avx2-i64x4, "
         "generic-x1, generic-x4, generic-x8, generic-x16, "
         "generic-x32, generic-x64";
 }
diff --git a/ispc.vcxproj b/ispc.vcxproj
index 58fa5b08..b9a3b6c5 100755
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -28,10 +28,14 @@
     <ClCompile Include="$(Configuration)\gen-bitcode-avx11-64bit.cpp" />
     <ClCompile Include="$(Configuration)\gen-bitcode-avx11-x2-32bit.cpp" />
     <ClCompile Include="$(Configuration)\gen-bitcode-avx11-x2-64bit.cpp" />
+    <ClCompile Include="$(Configuration)\gen-bitcode-avx11-i64x4-32bit.cpp" />
+    <ClCompile Include="$(Configuration)\gen-bitcode-avx11-i64x4-64bit.cpp" />
     <ClCompile Include="$(Configuration)\gen-bitcode-avx2-32bit.cpp" />
     <ClCompile Include="$(Configuration)\gen-bitcode-avx2-64bit.cpp" />
     <ClCompile Include="$(Configuration)\gen-bitcode-avx2-x2-32bit.cpp" />
     <ClCompile Include="$(Configuration)\gen-bitcode-avx2-x2-64bit.cpp" />
+    <ClCompile Include="$(Configuration)\gen-bitcode-avx2-i64x4-32bit.cpp" />
+    <ClCompile Include="$(Configuration)\gen-bitcode-avx2-i64x4-64bit.cpp" />
     <ClCompile Include="$(Configuration)\gen-bitcode-c-32.cpp" />
     <ClCompile Include="$(Configuration)\gen-bitcode-c-64.cpp" />
     <ClCompile Include="$(Configuration)\gen-bitcode-dispatch.cpp" />
@@ -323,6 +327,24 @@
       <Message>Building gen-bitcode-avx11-x2-64bit.cpp</Message>
     </CustomBuild>
   </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx11-i64x4.ll">
+      <FileType>Document</FileType>
+      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit &gt; $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp</Command>
+      <Outputs>$(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp</Outputs>
+      <AdditionalInputs>builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll</AdditionalInputs>
+      <Message>Building gen-bitcode-avx11-i64x4-32bit.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx11-i64x4.ll">
+      <FileType>Document</FileType>
+      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit &gt; $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp</Command>
+      <Outputs>$(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp</Outputs>
+      <AdditionalInputs>builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll</AdditionalInputs>
+      <Message>Building gen-bitcode-avx11-i64x4-64bit.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
   <ItemGroup>
     <CustomBuild Include="builtins\target-avx2.ll">
       <FileType>Document</FileType>
@@ -359,6 +381,24 @@
       <Message>Building gen-bitcode-avx2-x2-64bit.cpp</Message>
     </CustomBuild>
   </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2-i64x4.ll">
+      <FileType>Document</FileType>
+      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit &gt; $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp</Command>
+      <Outputs>$(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp</Outputs>
+      <AdditionalInputs>builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll</AdditionalInputs>
+      <Message>Building gen-bitcode-avx2-i64x4-32bit.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2-i64x4.ll">
+      <FileType>Document</FileType>
+      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit &gt; $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp</Command>
+      <Outputs>$(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp</Outputs>
+      <AdditionalInputs>builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll</AdditionalInputs>
+      <Message>Building gen-bitcode-avx2-i64x4-64bit.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
   <ItemGroup>
     <CustomBuild Include="builtins\target-generic-1.ll">
       <FileType>Document</FileType>
diff --git a/lex.ll b/lex.ll
index 3655220f..87a80145 100644
--- a/lex.ll
+++ b/lex.ll
@@ -419,6 +419,14 @@ while { RT; return TOKEN_WHILE; }
 \"C\" { RT; return TOKEN_STRING_C_LITERAL; }
 \.\.\. { RT; return TOKEN_DOTDOTDOT; }
 
+"operator*"  { return TOKEN_IDENTIFIER; }
+"operator+"  { return TOKEN_IDENTIFIER; }
+"operator-"  { return TOKEN_IDENTIFIER; }
+"operator<<" { return TOKEN_IDENTIFIER; }
+"operator>>" { return TOKEN_IDENTIFIER; }
+"operator/" { return TOKEN_IDENTIFIER; }
+"operator%" { return TOKEN_IDENTIFIER; }
+
 L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; }
 
 {IDENT} {
diff --git a/parse.yy b/parse.yy
index 933a3455..9a0377c5 100644
--- a/parse.yy
+++ b/parse.yy
@@ -353,17 +353,75 @@ launch_expression
     : TOKEN_LAUNCH postfix_expression '(' argument_expression_list ')'
       {
           ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @2);
-          $$ = new FunctionCallExpr($2, $4, Union(@2, @5), true, oneExpr);
+          Expr *launchCount[3] = {oneExpr, oneExpr, oneExpr};
+          $$ = new FunctionCallExpr($2, $4, Union(@2, @5), true, launchCount);
       }
     | TOKEN_LAUNCH postfix_expression '(' ')'
       {
           ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @2);
-          $$ = new FunctionCallExpr($2, new ExprList(Union(@3,@4)), Union(@2, @4), true, oneExpr);
+          Expr *launchCount[3] = {oneExpr, oneExpr, oneExpr};
+          $$ = new FunctionCallExpr($2, new ExprList(Union(@3,@4)), Union(@2, @4), true, launchCount);
        }
-    | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' argument_expression_list ')'
-      { $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, $3); }
-    | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' ')'
-      { $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, $3); }
+
+    | TOKEN_LAUNCH '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5);
+          Expr *launchCount[3] = {$3, oneExpr, oneExpr};
+          $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5);
+          Expr *launchCount[3] = {$3, oneExpr, oneExpr};
+          $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, launchCount);
+      }
+
+    | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7);
+          Expr *launchCount[3] = {$3, $5, oneExpr};
+          $$ = new FunctionCallExpr($7, $9, Union(@7,@10), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7);
+          Expr *launchCount[3] = {$3, $5, oneExpr};
+          $$ = new FunctionCallExpr($7, new ExprList(Union(@7,@8)), Union(@7,@9), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @8);
+          Expr *launchCount[3] = {$6, $3, oneExpr};
+          $$ = new FunctionCallExpr($8, $10, Union(@8,@11), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @8);
+          Expr *launchCount[3] = {$6, $3, oneExpr};
+          $$ = new FunctionCallExpr($8, new ExprList(Union(@8,@9)), Union(@8,@10), true, launchCount);
+      }
+
+    | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ',' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          Expr *launchCount[3] = {$3, $5, $7};
+          $$ = new FunctionCallExpr($9, $11, Union(@9,@12), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ',' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          Expr *launchCount[3] = {$3, $5, $7};
+          $$ = new FunctionCallExpr($9, new ExprList(Union(@9,@10)), Union(@9,@11), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')'
+      { 
+          Expr *launchCount[3] = {$9, $6, $3};
+          $$ = new FunctionCallExpr($11, $13, Union(@11,@14), true, launchCount);
+      }
+    | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' ')'
+      { 
+          Expr *launchCount[3] = {$9, $6, $3};
+          $$ = new FunctionCallExpr($11, new ExprList(Union(@11,@12)), Union(@11,@13), true, launchCount);
+      }
+
 
     | TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
        {
@@ -377,13 +435,13 @@ launch_expression
                 "around function call expression.");
           $$ = NULL;
        }
-    | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>'
+    | TOKEN_LAUNCH '[' assignment_expression ']' '<' postfix_expression '(' argument_expression_list ')' '>'
        {
           Error(Union(@5, @10), "\"launch\" expressions no longer take '<' '>' "
                 "around function call expression.");
           $$ = NULL;
        }
-    | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' ')' '>'
+    | TOKEN_LAUNCH '[' assignment_expression ']' '<' postfix_expression '(' ')' '>'
        {
           Error(Union(@5, @9), "\"launch\" expressions no longer take '<' '>' "
                 "around function call expression.");
@@ -468,27 +526,27 @@ cast_expression
 multiplicative_expression
     : cast_expression
     | multiplicative_expression '*' cast_expression
-      { $$ = new BinaryExpr(BinaryExpr::Mul, $1, $3, Union(@1, @3)); }
+      { $$ = MakeBinaryExpr(BinaryExpr::Mul, $1, $3, Union(@1, @3)); }
     | multiplicative_expression '/' cast_expression
-      { $$ = new BinaryExpr(BinaryExpr::Div, $1, $3, Union(@1, @3)); }
+      { $$ = MakeBinaryExpr(BinaryExpr::Div, $1, $3, Union(@1, @3)); }
     | multiplicative_expression '%' cast_expression
-      { $$ = new BinaryExpr(BinaryExpr::Mod, $1, $3, Union(@1, @3)); }
+      { $$ = MakeBinaryExpr(BinaryExpr::Mod, $1, $3, Union(@1, @3)); }
     ;
 
 additive_expression
     : multiplicative_expression
     | additive_expression '+' multiplicative_expression
-      { $$ = new BinaryExpr(BinaryExpr::Add, $1, $3, Union(@1, @3)); }
+      { $$ = MakeBinaryExpr(BinaryExpr::Add, $1, $3, Union(@1, @3)); }
     | additive_expression '-' multiplicative_expression
-      { $$ = new BinaryExpr(BinaryExpr::Sub, $1, $3, Union(@1, @3)); }
+      { $$ = MakeBinaryExpr(BinaryExpr::Sub, $1, $3, Union(@1, @3)); }
     ;
 
 shift_expression
     : additive_expression
     | shift_expression TOKEN_LEFT_OP additive_expression
-      { $$ = new BinaryExpr(BinaryExpr::Shl, $1, $3, Union(@1,@3)); }
+      { $$ = MakeBinaryExpr(BinaryExpr::Shl, $1, $3, Union(@1, @3)); }
     | shift_expression TOKEN_RIGHT_OP additive_expression
-      { $$ = new BinaryExpr(BinaryExpr::Shr, $1, $3, Union(@1,@3)); }
+      { $$ = MakeBinaryExpr(BinaryExpr::Shr, $1, $3, Union(@1, @3)); }
     ;
 
 relational_expression
@@ -2214,9 +2272,24 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) {
 
     Symbol *taskIndexSym = new Symbol("taskIndex", pos, type);
     m->symbolTable->AddVariable(taskIndexSym);
-
+    
     Symbol *taskCountSym = new Symbol("taskCount", pos, type);
     m->symbolTable->AddVariable(taskCountSym);
+    
+    Symbol *taskIndexSym0 = new Symbol("taskIndex0", pos, type);
+    m->symbolTable->AddVariable(taskIndexSym0);
+    Symbol *taskIndexSym1 = new Symbol("taskIndex1", pos, type);
+    m->symbolTable->AddVariable(taskIndexSym1);
+    Symbol *taskIndexSym2 = new Symbol("taskIndex2", pos, type);
+    m->symbolTable->AddVariable(taskIndexSym2);
+
+    
+    Symbol *taskCountSym0 = new Symbol("taskCount0", pos, type);
+    m->symbolTable->AddVariable(taskCountSym0);
+    Symbol *taskCountSym1 = new Symbol("taskCount1", pos, type);
+    m->symbolTable->AddVariable(taskCountSym1);
+    Symbol *taskCountSym2 = new Symbol("taskCount2", pos, type);
+    m->symbolTable->AddVariable(taskCountSym2);
 }
 
 
diff --git a/run_tests.py b/run_tests.py
index 803410b8..506d37a5 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -377,7 +377,12 @@ def file_check(compfails, runfails):
             temp3 = re.search("[0-9]*\.[0-9]*", temp2.group())
         compiler_version = options.compiler_exe + temp3.group()
     else:
-        compiler_version = "cl" 
+        compiler_version = "cl"
+    possible_compilers = ["g++4.4", "g++4.7", "clang++3.3", "cl"]
+    if not compiler_version in possible_compilers:
+        error("\n**********\nWe don't have history of fails for compiler " +
+                compiler_version +
+                "\nAll fails will be new!!!\n**********", 2)
     new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n"
 
     new_compfails = compfails[:]
@@ -449,8 +454,9 @@ def verify():
     check = [["g++", "clang++", "cl"],["-O0", "-O2"],["x86","x86-64"],
              ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"],
              ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8",
-              "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16",
-              "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8",
+              "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8",
+              "avx1.1-i32x16", "avx1.1-i64x4", "avx2-i32x8", "avx2-i32x16", "avx2-i64x4",
+              "generic-1", "generic-4", "generic-8",
               "generic-16", "generic-32", "generic-64"]]
     for i in range (0,len(f_lines)):
         if f_lines[i][0] == "%":
@@ -647,7 +653,8 @@ def run_tests(options1, args, print_version):
     if options.non_interactive == False:
         print_debug("\n", s, run_tests_log)
 
-    elapsed_time = time.time() - start_time
+    temp_time = (time.time() - start_time)
+    elapsed_time = time.strftime('%Hh%Mm%Ssec.', time.gmtime(temp_time))
 
     while not qret.empty():
         (c, r, skip) = qret.get()
@@ -675,12 +682,16 @@ def run_tests(options1, args, print_version):
     if len(compile_error_files) == 0 and len(run_error_files) == 0:
         print_debug("No fails\n", s, run_tests_log)
 
-    R = file_check(compile_error_files, run_error_files)
+    if len(args) == 0:
+        R = file_check(compile_error_files, run_error_files)
+    else:
+        error("don't check new fails for incomplete suite of tests", 2)
+        R = 0
 
     if options.time:
-        print_debug("Elapsed time: %d s\n" % elapsed_time, s, run_tests_log)
+        print_debug("Elapsed time: " + elapsed_time + "\n", s, run_tests_log)
 
-    return R
+    return [R, elapsed_time]
 
 
 from optparse import OptionParser
diff --git a/tests/operators.ispc b/tests/operators.ispc
new file mode 100644
index 00000000..95502bdd
--- /dev/null
+++ b/tests/operators.ispc
@@ -0,0 +1,70 @@
+
+export uniform int width() { return programCount; }
+
+struct S {
+    float a;
+};
+
+// References "struct&" were put in random order to test them.
+struct S operator*(struct S& rr, struct S rv) {
+    struct S c;
+    c.a = rr.a + rv.a + 2;
+    return c;
+}
+
+struct S operator/(struct S& rr, struct S& rv) {
+    struct S c;
+    c.a = rr.a - rr.a + 2;
+    return c;
+}
+
+struct S operator%(struct S rr, struct S& rv) {
+    struct S c;
+    c.a = rr.a + rv.a + 2;
+    return c;
+}
+
+struct S operator+(struct S rr, struct S rv) {
+    struct S c;
+    c.a = rr.a / rv.a + 3;
+    return c;
+}
+
+struct S operator-(struct S rr, struct S& rv) {
+    struct S c;
+    c.a = rr.a + rv.a + 2;
+    return c;
+}
+
+struct S operator>>(struct S& rr, struct S rv) {
+    struct S c;
+    c.a = rr.a / rv.a + 3;
+    return c;
+}
+
+struct S operator<<(struct S& rr, struct S& rv) {
+    struct S c;
+    c.a = rr.a + rv.a + 2;
+    return c;
+}
+
+struct S a, a1;
+struct S b, b1;
+struct S d1, d2, d3, d4, d5, d6, d7;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    a.a = aFOO[programIndex];
+    b.a = -aFOO[programIndex];
+    d1 = a * b;
+    d2 = a / b;
+    d3 = a % b;
+    d4 = a + b;
+    d5 = a - b;
+    d6 = a >> b;
+    d7 = a << b;
+    RET[programIndex] = d1.a + d2.a + d3.a + d4.a + d5.a + d6.a + d7.a;
+}
+
+export void result(uniform float RET[4]) {
+    RET[programIndex] = 14;
+}
diff --git a/tests/operators1.ispc b/tests/operators1.ispc
new file mode 100644
index 00000000..f52c4c35
--- /dev/null
+++ b/tests/operators1.ispc
@@ -0,0 +1,64 @@
+
+export uniform int width() { return programCount; }
+
+struct S {
+    float a;
+};
+
+// References "struct&" were put in random order to test them.
+struct S operator*(struct S& rr, struct S rv) {
+    struct S c;
+    c.a = rr.a + rv.a + 2;
+    return c;
+}
+
+struct S operator/(struct S& rr, struct S& rv) {
+    struct S c;
+    c.a = rr.a - rr.a + 2;
+    return c;
+}
+
+struct S operator%(struct S rr, struct S& rv) {
+    struct S c;
+    c.a = rr.a + rv.a + 2;
+    return c;
+}
+
+struct S operator+(struct S rr, struct S rv) {
+    struct S c;
+    c.a = rr.a / rv.a + 3;
+    return c;
+}
+
+struct S operator-(struct S rr, struct S& rv) {
+    struct S c;
+    c.a = rr.a + rv.a + 2;
+    return c;
+}
+
+struct S operator>>(struct S& rr, struct S rv) {
+    struct S c;
+    c.a = rr.a / rv.a + 3;
+    return c;
+}
+
+struct S operator<<(struct S& rr, struct S& rv) {
+    struct S c;
+    c.a = rr.a + rv.a + 2;
+    return c;
+}
+
+struct S a;
+struct S b;
+struct S d;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    a.a = 5;
+    b.a = -5;
+    d = a * b + b / a - a << (b - b) - a;
+    RET[programIndex] = d.a;
+}
+
+export void result(uniform float RET[4]) {
+    RET[programIndex] = 12;
+}
diff --git a/tests/operators2.ispc b/tests/operators2.ispc
new file mode 100644
index 00000000..b732b24a
--- /dev/null
+++ b/tests/operators2.ispc
@@ -0,0 +1,51 @@
+int off;
+
+export uniform int width() { return programCount; }
+
+struct S {
+    float a;
+};
+
+struct S operator+(struct S rr, struct S rv) {
+    struct S c;
+    c.a = rr.a / rv.a + 3;
+    if (off == 1)
+        c.a = 22;
+    return c;
+}
+
+struct S operator/(struct S rr, struct S rv) {
+    struct S c;
+    c.a = rr.a + rv.a + 10;
+    if (off == 1)
+        c.a = 33;
+    return c;
+}
+
+struct S a;
+struct S b;
+struct S d;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    int T = programIndex;
+    a.a = aFOO[programIndex];
+    b.a = -aFOO[programIndex];
+    if (programIndex == 3)
+        off = 1;
+    else
+        off = 0;
+    if (T % 2)
+        d = a + b;
+    else
+        d = a / b;
+
+    RET[programIndex] = d.a;
+}
+
+export void result(uniform float RET[4]) {
+    if (programIndex % 2)
+        RET[programIndex] = 2;
+    else
+        RET[programIndex] = 10;
+    RET[3] = 22;
+}
diff --git a/type.cpp b/type.cpp
index 5fa1845b..516276f0 100644
--- a/type.cpp
+++ b/type.cpp
@@ -2961,6 +2961,12 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
         callTypes.push_back(LLVMTypes::Int32Type); // threadCount
         callTypes.push_back(LLVMTypes::Int32Type); // taskIndex
         callTypes.push_back(LLVMTypes::Int32Type); // taskCount
+        callTypes.push_back(LLVMTypes::Int32Type); // taskIndex0
+        callTypes.push_back(LLVMTypes::Int32Type); // taskIndex1
+        callTypes.push_back(LLVMTypes::Int32Type); // taskIndex2
+        callTypes.push_back(LLVMTypes::Int32Type); // taskCount0
+        callTypes.push_back(LLVMTypes::Int32Type); // taskCount1
+        callTypes.push_back(LLVMTypes::Int32Type); // taskCount2
     }
     else
         // Otherwise we already have the types of the arguments